From c98ca9b78391c852b14118fc0510f63b9aa3efb5 Mon Sep 17 00:00:00 2001 From: Pravendra Singh Date: Tue, 4 Mar 2014 21:59:19 +0530 Subject: [PATCH 1/2] fixes language detection when there is mixing of more than one language --- silpa_common/langdetect.py | 135 +++++++++++++++++++++++++++++++------ 1 file changed, 114 insertions(+), 21 deletions(-) diff --git a/silpa_common/langdetect.py b/silpa_common/langdetect.py index c129300..bbe79ad 100644 --- a/silpa_common/langdetect.py +++ b/silpa_common/langdetect.py @@ -20,6 +20,22 @@ import string +def check(old_lang_code,new_lang_code): + """ + this function helps to make sure that every letter + of a word have same language. + if language codes for every letter are same + then it returns True. + """ + + if(old_lang_code == ''): + return True + else: + if(old_lang_code != new_lang_code): + return False + else: + return True + def detect_lang(text): """ @@ -27,6 +43,7 @@ def detect_lang(text): This function can take a chunk of text and return a dictionary containing word-language key-value pairs. """ + words = text.split(" ") word_count = len(words) word_iter = 0 @@ -41,6 +58,11 @@ def detect_lang(text): word = word.replace(punct, " ") length = len(word) index = 0 + + # detected language code, initially blank + # one argument for `function : check()` + lang_code = '' + # scan left to write, skip any punctuations, # the detection stops in the first match itself. 
while index < length: @@ -48,36 +70,107 @@ def detect_lang(text): if not letter.isalpha(): index = index + 1 continue + if ((ord(letter) >= 0x0D00) & (ord(letter) <= 0x0D7F)): - result_dict[orig_word] = "ml_IN" - break + if(check(lang_code,"ml_IN")): + result_dict[orig_word] = "ml_IN" + lang_code = "ml_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0980) & (ord(letter) <= 0x09FF)): - result_dict[orig_word] = "bn_IN" - break + if(check(lang_code,"bn_IN")): + result_dict[orig_word] = "bn_IN" + lang_code = "bn_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0900) & (ord(letter) <= 0x097F)): - result_dict[orig_word] = "hi_IN" - break + if(check(lang_code,"hi_IN")): + result_dict[orig_word] = "hi_IN" + lang_code = "hi_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0A80) & (ord(letter) <= 0x0AFF)): - result_dict[orig_word] = "gu_IN" - break + if(check(lang_code,"gu_IN")): + result_dict[orig_word] = "gu_IN" + lang_code = "gu_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0A00) & (ord(letter) <= 0x0A7F)): - result_dict[orig_word] = "pa_IN" - break + if(check(lang_code,"pa_IN")): + result_dict[orig_word] = "pa_IN" + lang_code = "pa_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0C80) & (ord(letter) <= 0x0CFF)): - result_dict[orig_word] = "kn_IN" - break + if(check(lang_code,"kn_IN")): + result_dict[orig_word] = "kn_IN" + lang_code = "kn_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0B00) & (ord(letter) <= 
0x0B7F)): - result_dict[orig_word] = "or_IN" - break + if(check(lang_code,"or_IN")): + result_dict[orig_word] = "or_IN" + lang_code = "or_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0B80) & (ord(letter) <= 0x0BFF)): - result_dict[orig_word] = "ta_IN" - break + if(check(lang_code,"ta_IN")): + result_dict[orig_word] = "ta_IN" + lang_code = "ta_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((ord(letter) >= 0x0C00) & (ord(letter) <= 0x0C7F)): - result_dict[orig_word] = "te_IN" - break + if(check(lang_code,"te_IN")): + result_dict[orig_word] = "te_IN" + lang_code = "te_IN" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + if ((letter <= u'z')): # this is fallback case. - result_dict[orig_word] = "en_US" - break + if(check(lang_code,"en_US")): + result_dict[orig_word] = "en_US" + lang_code = "en_US" + index = index + 1 + continue + else: + result_dict[orig_word] = "mixing of more then one language found" + break + index = index + 1 word_iter = word_iter + 1 - return result_dict + return result_dict \ No newline at end of file From 456f565e7330d124ca612c30fe21bcb249ef3ab3 Mon Sep 17 00:00:00 2001 From: Pravendra Singh Date: Tue, 4 Mar 2014 23:47:39 +0530 Subject: [PATCH 2/2] made changes to make python into pep8 style --- silpa_common/langdetect.py | 51 ++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/silpa_common/langdetect.py b/silpa_common/langdetect.py index bbe79ad..da8d834 100644 --- a/silpa_common/langdetect.py +++ b/silpa_common/langdetect.py @@ -20,7 +20,8 @@ import string -def check(old_lang_code,new_lang_code): + +def check(old_lang_code, new_lang_code): """ this function helps to make sure that every letter of a word have same language. 
@@ -36,6 +37,12 @@ def check(old_lang_code,new_lang_code): else: return True +""" +error when word contains letters from +more then one languages +""" +mix_error_line = "mixing of more then one language found" + def detect_lang(text): """ @@ -72,105 +79,105 @@ def detect_lang(text): continue if ((ord(letter) >= 0x0D00) & (ord(letter) <= 0x0D7F)): - if(check(lang_code,"ml_IN")): + if(check(lang_code, "ml_IN")): result_dict[orig_word] = "ml_IN" lang_code = "ml_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0980) & (ord(letter) <= 0x09FF)): - if(check(lang_code,"bn_IN")): + if(check(lang_code, "bn_IN")): result_dict[orig_word] = "bn_IN" lang_code = "bn_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0900) & (ord(letter) <= 0x097F)): - if(check(lang_code,"hi_IN")): + if(check(lang_code, "hi_IN")): result_dict[orig_word] = "hi_IN" lang_code = "hi_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0A80) & (ord(letter) <= 0x0AFF)): - if(check(lang_code,"gu_IN")): + if(check(lang_code, "gu_IN")): result_dict[orig_word] = "gu_IN" lang_code = "gu_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0A00) & (ord(letter) <= 0x0A7F)): - if(check(lang_code,"pa_IN")): + if(check(lang_code, "pa_IN")): result_dict[orig_word] = "pa_IN" lang_code = "pa_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0C80) & (ord(letter) <= 0x0CFF)): - if(check(lang_code,"kn_IN")): + 
if(check(lang_code, "kn_IN")): result_dict[orig_word] = "kn_IN" lang_code = "kn_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0B00) & (ord(letter) <= 0x0B7F)): - if(check(lang_code,"or_IN")): + if(check(lang_code, "or_IN")): result_dict[orig_word] = "or_IN" lang_code = "or_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0B80) & (ord(letter) <= 0x0BFF)): - if(check(lang_code,"ta_IN")): + if(check(lang_code, "ta_IN")): result_dict[orig_word] = "ta_IN" lang_code = "ta_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((ord(letter) >= 0x0C00) & (ord(letter) <= 0x0C7F)): - if(check(lang_code,"te_IN")): + if(check(lang_code, "te_IN")): result_dict[orig_word] = "te_IN" lang_code = "te_IN" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break if ((letter <= u'z')): # this is fallback case. - if(check(lang_code,"en_US")): + if(check(lang_code, "en_US")): result_dict[orig_word] = "en_US" lang_code = "en_US" index = index + 1 continue else: - result_dict[orig_word] = "mixing of more then one language found" + result_dict[orig_word] = mix_error_line break index = index + 1 word_iter = word_iter + 1 - return result_dict \ No newline at end of file + return result_dict