Skip to content

Commit

Permalink
Move lists of special single tokens to separate files
Browse files (browse the repository at this point in the history)
  • Loading branch information
Thomas Proisl committed Aug 5, 2024
1 parent a247cc6 commit 14c010c
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 15 deletions.
12 changes: 0 additions & 12 deletions src/somajo/data/single_token_abbreviations_de.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,3 @@ T.V.
Uni-Kl.
USt-IdNr.
Zeitschr.titel

# These should be moved to another file:
.Net
/rant
/s
E/E
tl;dr
zl;ng

# SAP Versions
S/4
R/3
3 changes: 0 additions & 3 deletions src/somajo/data/single_token_abbreviations_en.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,3 @@ a.m.
p.m.
P.S.
T.V.

# These should be moved to another file:
tl;dr
19 changes: 19 additions & 0 deletions src/somajo/data/single_tokens_de.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# A list of tokens that should not be split.
#
# Lines starting with “#” are treated as comments and will be ignored.

.Net
/rant
/s
E/E
tl;dr
zl;ng

# SAP Versions
S/4
R/3

# mobile telephony
3G
4G
5G
10 changes: 10 additions & 0 deletions src/somajo/data/single_tokens_en.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# A list of tokens that should not be split.
#
# Lines starting with “#” are treated as comments and will be ignored.

tl;dr

# mobile telephony
3G
4G
5G
7 changes: 7 additions & 0 deletions src/somajo/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
\#x[0-9a-f]+ # hexadecimal entities
);""", re.VERBOSE | re.IGNORECASE)

# high priority single tokens
single_token_list = utils.read_abbreviation_file(f"single_tokens_{self.language[:2]}.txt")
self.single_tokens = re.compile(r"(?<![\w.])(?:" + r'|'.join([re.escape(_) for _ in single_token_list]) + r')(?!\p{L})', re.IGNORECASE)

# EMOTICONS
emoticon_set = {"(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:",
"(-:", ")=", ")o:", ")x", ":'C", ":/", ":<",
Expand Down Expand Up @@ -698,6 +702,9 @@ def _tokenize(self, token_dll):
# XML entities
self._split_all_matches(self.entity, token_dll, "XML_entity")

# high priority single tokens
self._split_all_matches(self.single_tokens, token_dll)

# emoticons
self._split_all_matches(self.heart_emoticon, token_dll, "emoticon")
self._split_all_matches(self.emoticon, token_dll, "emoticon")
Expand Down

0 comments on commit 14c010c

Please sign in to comment.