
Commit fb788e6

Correctly tokenize URLs in angle brackets (#27)

Author: Thomas Proisl
Parent: cb1d001

4 files changed, 10 insertions(+), 3 deletions(-)


CHANGES.txt

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,9 @@
 # CHANGELOG #

+## Version 2.4.1, 2024-02-09 ##
+
+- Fix issue #27 (URLs in angle brackets).
+
 ## Version 2.4.0, 2023-12-23 ##

 - New feature: SoMaJo can output character offsets for tokens,

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 # a new release
 [project]
 name = "SoMaJo"
-version = "2.4.0"
+version = "2.4.1"
 description = "A tokenizer and sentence splitter for German and English web and social media texts."
 readme = "README.md"
 requires-python = ">=3.8"

src/somajo/tokenizer.py

Lines changed: 2 additions & 2 deletions
@@ -88,8 +88,8 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
         self.email = re.compile(r"\b[\w.%+-]+(?:@| \[at\] )[\w.-]+(?:\.| \[?dot\]? )\p{L}{2,}\b")
         # simple regex for urls that start with http or www
         # no square brackets and spaces in URL: [^][ ]
-        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+?\(\S*?\)[^][ ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
-        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+[^][\'. "!?,;:()]', re.IGNORECASE)
+        self.simple_url_with_brackets = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+?\(\S*?\)[^][<> ]*(?=$|[\'. "!?,;])', re.IGNORECASE)
+        self.simple_url = re.compile(r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+[^][<>\'. "!?,;:()]', re.IGNORECASE)
         self.doi = re.compile(r'\bdoi:10\.\d+/\S+', re.IGNORECASE)
         self.doi_with_space = re.compile(r'(?<=\bdoi: )10\.\d+/\S+', re.IGNORECASE)
         # regex for ISBNs adapted from:
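
The fix excludes angle brackets from the URL character classes, so a URL wrapped in <...> no longer swallows the closing bracket. A minimal standalone sketch of the effect on the simple_url pattern, using only Python's standard re module (the two patterns are copied from the diff; everything else is illustrative and not part of SoMaJo's code):

import re

# Old pattern: "<" and ">" could appear inside and at the end of a URL token.
old_url = re.compile(
    r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][ ]+[^][\'. "!?,;:()]',
    re.IGNORECASE)
# New pattern: "<" and ">" are excluded from both character classes.
new_url = re.compile(
    r'\b(?:(?:https?|ftp|svn)://|(?:https?://)?www\.)[^][<> ]+[^][<>\'. "!?,;:()]',
    re.IGNORECASE)

text = "link: <https://one_link.com>."
print(old_url.search(text).group())  # https://one_link.com>  (trailing ">" glued to the URL)
print(new_url.search(text).group())  # https://one_link.com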

tests/test_tokenizer.py

Lines changed: 3 additions & 0 deletions
@@ -1040,6 +1040,9 @@ def test_emails_urls_26(self):
     def test_emails_urls_27(self):
         self._equal("link: [Linktext „viel“ Text](https://other_link.com).", "link : [ Linktext „ viel “ Text ] ( https://other_link.com ) .")

+    def test_emails_urls_28(self):
+        self._equal("link: <https://one_link.com>.", "link : < https://one_link.com > .")
+

 class TestAbbreviations(TestTokenizer):
     def test_abbreviations_01(self):
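
The new test's expectation corresponds to the following end-to-end behaviour; this is a sketch assuming the documented SoMaJo interface (the SoMaJo class, tokenize_text, and the "de_CMC" language model), not part of the commit:

from somajo import SoMaJo

tokenizer = SoMaJo("de_CMC")
sentences = tokenizer.tokenize_text(["link: <https://one_link.com>."])
for sentence in sentences:
    # With this fix the angle brackets are split off as separate tokens:
    # link : < https://one_link.com > .
    print(" ".join(token.text for token in sentence))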
