Skip to content

Commit

Permalink
Improve recognition of heart emoticon
Browse files Browse the repository at this point in the history
  • Loading branch information
tsproisl committed Mar 5, 2021
1 parent e671f7d commit 87ccd39
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
12 changes: 12 additions & 0 deletions somajo/test/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,18 @@ def test_own_122(self):
def test_own_123(self):
# A single "<3" heart stays one token and is separated from the
# trailing "!"; the second argument lists the expected tokens
# separated by spaces (presumably compared by _equal -- confirm).
self._equal("Ooh, wie süüß <3!", "Ooh , wie süüß <3 !")

def test_own_123a(self):
# Several hearts separated by spaces: each "<3" is expected to remain
# its own token, including those that follow another heart.
self._equal("Ooh, wie süüß <3 <3 <3!", "Ooh , wie süüß <3 <3 <3 !")

def test_own_123b(self):
# Hearts written back to back ("<3<3<3") are expected to be split into
# three separate "<3" tokens, even though each one follows a digit.
self._equal("Ooh, wie süüß <3<3<3!", "Ooh , wie süüß <3 <3 <3 !")

def test_own_123c(self):
# "<3" directly after a number is a comparison, not a heart: "<" and
# "3" are expected as separate tokens.
self._equal("Es gilt 2<3!", "Es gilt 2 < 3 !")

def test_own_123d(self):
# "<3" followed by further digits ("<300") is not a heart: "<" is
# expected to be split from the complete number "300".
self._equal("Das kostet <300", "Das kostet < 300")

def test_own_124(self):
# An "x" between two digits is expected to be split off as its own
# token ("7x4" -> "7 x 4"), and the trailing "?" is separated too.
self._equal("Was gibt 7x4?", "Was gibt 7 x 4 ?")

Expand Down
11 changes: 6 additions & 5 deletions somajo/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,12 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
""", re.VERBOSE)
# <3 and ^3 are heart emoticons, unless preceded by a number (with
# optional whitespace between the number and the emoticon)
# ^\^3 # beginning of line, no leading characters
# ^\D^3 # beginning of line, one leading character
# (?<=\D[ ])^3 # two leading characters, non-number + space
# (?<=.[^\d ])^3 # two leading characters, x + non-space-non-number
self.heart_emoticon = re.compile(r"(?:^|^\D|(?<=\D[ ])|(?<=.[^\d ]))[<^]3")
# ^\^3 # beginning of line, no leading characters
# ^\D\^3 # beginning of line, one leading character
# (?<=\D[ ])\^3 # two leading characters, non-number + space
# (?<=.[^\d ])\^3 # two leading characters, x + non-space-non-number
# (?<=[<^]3[ ]?)\^3 # leading heart with optional space
self.heart_emoticon = re.compile(r"(?:^|^\D|(?<=\D[ ])|(?<=.[^\d ])|(?<=[<^]3[ ]?))[<^]3(?!\d)")
# U+2600..U+26FF Miscellaneous Symbols
# U+2700..U+27BF Dingbats
# U+FE0E..U+FE0F text and emoji variation selectors
Expand Down

0 comments on commit 87ccd39

Please sign in to comment.