Skip to content

Commit e156941

Browse files
committed
arabic tokenizer
1 parent b87f7db commit e156941

File tree

3 files changed

+81
-2
lines changed

3 files changed

+81
-2
lines changed

test_tokenizers.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66

77
#print(tokenizer.words(s))
88

9-
9+
"""
1010
s = open_read("test_data/fi_text.txt").read()
1111
for se in tokenizer.tokenize(s):
1212
print(se)
1313
14+
"""
15+
print(tokenizer.tokenize_arabic("ومن الناس من يقول آمنا بالله وباليوم الآخر وما هم بمؤمنين"))
16+

uralicNLP/string_processing.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,21 @@ def filter_arabic(text, keep_vowels=True, combine_by=""):
1919
return combine_by.join(re.findall(r"[ء-ي]+", text))
2020

2121
def iso_to_name(iso):
    """Look up the human-readable language name for an ISO code.

    Raises KeyError when the code is missing from the module-level
    ``isos`` mapping.
    """
    name = isos[iso]
    return name
23+
24+
# Arabic combining marks (harakat) plus the tatweel/kashida filler stroke,
# as stripped by remove_arabic_diacritics().  re.VERBOSE lets each mark be
# annotated with its name.
arabic_diacritics = re.compile("""
    ّ    | # Tashdid
    َ    | # Fatha
    ً    | # Tanwin Fath
    ُ    | # Damma
    ٌ    | # Tanwin Damm
    ِ    | # Kasra
    ٍ    | # Tanwin Kasr
    ْ    | # Sukun
    ـ     # Tatwil/Kashida
""", re.VERBOSE)


def remove_arabic_diacritics(text):
    """Return ``text`` with Arabic diacritic marks and tatweel removed."""
    return arabic_diacritics.sub("", text)

uralicNLP/tokenizer.py

+59
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
import re
22
import mikatools
3+
from . import uralicApi
4+
from . import string_processing
5+
6+
# Single-letter Arabic clitic prefixes (prepositions/conjunctions such as
# ka-, bi-, wa-, li-, fa-) that may attach to the front of a word.
preps = [letter for letter in set("كبولف")]
# The Arabic definite article prefix ("al-").
al = "ال"
38

49
sentence_end = set("!?。……‥!?。⋯…؟჻!…")
510
word_end_puct = set(",;:”’'\"»」)]}،؛》』〕⦆〉》】〗〙〛–—")
@@ -13,6 +18,60 @@
1318
def _ends_in_abrv(text):
    """True when lower-cased ``text`` matches the module-level ``abrv_regex``.

    NOTE(review): ``abrv_regex`` is defined outside this hunk; presumably it
    is anchored so a match means the text ends in an abbreviation -- confirm.
    """
    return abrv_regex.search(text.lower()) is not None
1520

21+
def _remove_preps(word):
    """Strip one leading preposition letter from ``word``, if present.

    Returns a ``(stripped_word, prefix)`` pair where ``prefix`` is the
    removed single-letter preposition, or ``None`` when ``word`` does not
    start with any letter in the module-level ``preps`` list.
    """
    for prefix in preps:
        if word.startswith(prefix):
            return word[1:], prefix
    return word, None
27+
28+
def tokenize_arabic(text):
    """Tokenize Arabic ``text`` into a list of sentences, each a list of tokens.

    Splits the text into sentences and words, then uses the "ara"
    morphological model to detach clitics (single-letter prepositions and
    the definite article) from each word.

    NOTE(review): relies on ``uralicApi.lemmatize``; with
    ``word_boundaries=True`` the returned analyses presumably mark morpheme
    boundaries with "|" -- confirm against the uralicNLP API docs.
    """
    toksut = sentences(text)
    #print(toksut)
    output = []
    for sentence in toksut:
        o_sentence = []
        for w in words(sentence):
            #print(o_sentence)
            #print(w)
            word = w
            # Analyse with boundary markers so compound tokens can be
            # split apart on "|" below.
            lemmas = uralicApi.lemmatize(word, "ara",word_boundaries=True)
            preppu = None   # detached preposition letter, if any
            alli = None     # detached definite article, if any
            high_count = 0  # most "|" boundaries seen in any analysis
            long_len = 0    # length of the current best analysis
            stop =False     # best boundary-marked analysis (False until one is found)
            # Choose the analysis with the most boundary markers; ties are
            # broken in favour of the longer string.
            for l in lemmas:
                if "|" in l:
                    c = l.count("|")
                    if c > high_count:
                        high_count = c
                        long_len = len(l)
                        stop = l
                    elif high_count == c:
                        if len(l) > long_len:
                            stop = l
                            long_len = len(l)
            if high_count > 0:
                # Split on the boundary marker and drop pieces that are
                # empty once diacritics are removed (diacritic-only parts).
                o_sentence.extend([x for x in stop.split("|") if len(string_processing.remove_arabic_diacritics(x)) >0])
                continue

            if len(lemmas) == 0:
                # No analysis at all: try stripping a one-letter
                # preposition prefix and re-analysing.
                word, preppu = _remove_preps(word)
                lemmas = uralicApi.lemmatize(word, "ara")
            if len(lemmas) == 0 and word.startswith(al):
                # Still nothing: try stripping the two-character definite
                # article ("al-") and re-analysing.
                word = word[2:]
                alli = al
                lemmas = uralicApi.lemmatize(word, "ara")
            if len(lemmas) == 0:
                # Unanalysable even after stripping: keep the original token.
                o_sentence.append(w)
            else:
                # Emit any detached prefixes followed by the analysed word
                # (first lemma), skipping prefixes that were not found.
                parttusan = [preppu, alli, lemmas[0]]
                #print(parttusan)
                o_sentence.extend([x for x in parttusan if x is not None])
        output.append(o_sentence)
    return output
74+
1675
def sentences(text):
1776
parts = []
1877
current_s = ""

0 commit comments

Comments
 (0)