Skip to content

Commit e156941

Browse files
committed
arabic tokenizer
1 parent b87f7db commit e156941

File tree

3 files changed

+81
-2
lines changed

3 files changed

+81
-2
lines changed

test_tokenizers.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66

77
#print(tokenizer.words(s))
88

9-
9+
"""
1010
s = open_read("test_data/fi_text.txt").read()
1111
for se in tokenizer.tokenize(s):
1212
print(se)
1313
14+
"""
15+
print(tokenizer.tokenize_arabic("ومن الناس من يقول آمنا بالله وباليوم الآخر وما هم بمؤمنين"))
16+

uralicNLP/string_processing.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,21 @@ def filter_arabic(text, keep_vowels=True, combine_by=""):
1919
return combine_by.join(re.findall(r"[ء-ي]+", text))
2020

2121
def iso_to_name(iso):
    """Look up the human-readable language name for an ISO code.

    Raises KeyError when the code is missing from the module-level
    ``isos`` mapping.
    """
    name = isos[iso]
    return name
23+
24+
# Arabic combining marks (harakat) plus the tatweel/kashida filler stroke,
# as stripped by remove_arabic_diacritics().  re.VERBOSE lets each mark be
# annotated with its name.
arabic_diacritics = re.compile("""
    ّ    | # Tashdid
    َ    | # Fatha
    ً    | # Tanwin Fath
    ُ    | # Damma
    ٌ    | # Tanwin Damm
    ِ    | # Kasra
    ٍ    | # Tanwin Kasr
    ْ    | # Sukun
    ـ     # Tatwil/Kashida
""", re.VERBOSE)


def remove_arabic_diacritics(text):
    """Return ``text`` with Arabic diacritic marks and tatweel removed."""
    return arabic_diacritics.sub("", text)

uralicNLP/tokenizer.py

+59
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
import re
22
import mikatools
3+
from . import uralicApi
4+
from . import string_processing
5+
6+
# Single-letter Arabic clitic prefixes (prepositions/conjunctions such as
# ka-, bi-, wa-, li-, fa-) that may attach to the front of a word.
preps = [letter for letter in set("كبولف")]
# The Arabic definite article prefix ("al-").
al = "ال"
38

49
sentence_end = set("!?。……‥!?。⋯…؟჻!…")
510
word_end_puct = set(",;:”’'\"»」)]}،؛》』〕⦆〉》】〗〙〛–—")
@@ -13,6 +18,60 @@
1318
def _ends_in_abrv(text):
    """True when lower-cased ``text`` matches the module-level ``abrv_regex``.

    NOTE(review): ``abrv_regex`` is defined outside this hunk; presumably it
    is anchored so a match means the text ends in an abbreviation -- confirm.
    """
    return abrv_regex.search(text.lower()) is not None
1520

21+
def _remove_preps(word):
    """Strip one leading preposition letter from ``word``, if present.

    Returns a ``(stripped_word, prefix)`` pair where ``prefix`` is the
    removed single-letter preposition, or ``None`` when ``word`` does not
    start with any letter in the module-level ``preps`` list.
    """
    for prefix in preps:
        if word.startswith(prefix):
            return word[1:], prefix
    return word, None
27+
28+
def tokenize_arabic(text):
    """Tokenize Arabic ``text`` into a list of sentences, each a list of tokens.

    Splits the text into sentences and words, then uses the "ara"
    morphological model to detach clitics (single-letter prepositions and
    the definite article) from each word.

    NOTE(review): relies on ``uralicApi.lemmatize``; with
    ``word_boundaries=True`` the returned analyses presumably mark morpheme
    boundaries with "|" -- confirm against the uralicNLP API docs.
    """
    toksut = sentences(text)
    #print(toksut)
    output = []
    for sentence in toksut:
        o_sentence = []
        for w in words(sentence):
            #print(o_sentence)
            #print(w)
            word = w
            # Analyse with boundary markers so compound tokens can be
            # split apart on "|" below.
            lemmas = uralicApi.lemmatize(word, "ara",word_boundaries=True)
            preppu = None   # detached preposition letter, if any
            alli = None     # detached definite article, if any
            high_count = 0  # most "|" boundaries seen in any analysis
            long_len = 0    # length of the current best analysis
            stop =False     # best boundary-marked analysis (False until one is found)
            # Choose the analysis with the most boundary markers; ties are
            # broken in favour of the longer string.
            for l in lemmas:
                if "|" in l:
                    c = l.count("|")
                    if c > high_count:
                        high_count = c
                        long_len = len(l)
                        stop = l
                    elif high_count == c:
                        if len(l) > long_len:
                            stop = l
                            long_len = len(l)
            if high_count > 0:
                # Split on the boundary marker and drop pieces that are
                # empty once diacritics are removed (diacritic-only parts).
                o_sentence.extend([x for x in stop.split("|") if len(string_processing.remove_arabic_diacritics(x)) >0])
                continue

            if len(lemmas) == 0:
                # No analysis at all: try stripping a one-letter
                # preposition prefix and re-analysing.
                word, preppu = _remove_preps(word)
                lemmas = uralicApi.lemmatize(word, "ara")
            if len(lemmas) == 0 and word.startswith(al):
                # Still nothing: try stripping the two-character definite
                # article ("al-") and re-analysing.
                word = word[2:]
                alli = al
                lemmas = uralicApi.lemmatize(word, "ara")
            if len(lemmas) == 0:
                # Unanalysable even after stripping: keep the original token.
                o_sentence.append(w)
            else:
                # Emit any detached prefixes followed by the analysed word
                # (first lemma), skipping prefixes that were not found.
                parttusan = [preppu, alli, lemmas[0]]
                #print(parttusan)
                o_sentence.extend([x for x in parttusan if x is not None])
        output.append(o_sentence)
    return output
74+
1675
def sentences(text):
1776
parts = []
1877
current_s = ""

0 commit comments

Comments
 (0)