From d3d0d54731fe5444ad181068b883172b5937aeac Mon Sep 17 00:00:00 2001 From: Kavya Manohar Date: Sat, 23 Mar 2024 15:32:18 +0530 Subject: [PATCH] More normalization rules and tests --- README.md | 1 + libindic/normalizer/rules/normalizer_ml.rules | 21 +++++++++++++------ libindic/normalizer/tests/test_normalizer.py | 9 +++++++- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index de2d2ad..6f78785 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Malayalam language only. - Changes combination chillus to atomic chillu characters - Normalization of vowel signs - Corrects some common typos in Malayalam (needs thorough review) +- Alternate spelling normalizations ## Installation diff --git a/libindic/normalizer/rules/normalizer_ml.rules b/libindic/normalizer/rules/normalizer_ml.rules index 89ffd03..50591e6 100755 --- a/libindic/normalizer/rules/normalizer_ml.rules +++ b/libindic/normalizer/rules/normalizer_ml.rules @@ -1,12 +1,6 @@ #This is comment $remove_punctuation=true $filter_lang=ml_IN -# Common Type Corrections -ൻറ=ന്റ -ന്‍പ=മ്പ -ററ=റ്റ -റ്‍=ർ - # Chillu normalization to atomic chillus ണ്‍=ൺ ന്‍=ൻ @@ -27,3 +21,18 @@ $filter_lang=ml_IN ഇൗ=ഈ ഉൗ=ഊ ഒൗ=ഔ + +# Common Typo Corrections +ൻറ=ന്റ +ന്‍പ=മ്പ +ററ=റ്റ +റ്‍=ർ +ദു:ഖ=ദുഃഖ +നമ:=നമഃ + +# Alternate written forms +ൎയ്യ=ര്യ #ഭാൎയ്യ, സൂൎയ്യൻ +അധ്യാപ=അദ്ധ്യാപ +ൎ=ർ +ൽപ=ല്പ + diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py index ccaffb6..0ae0a32 100644 --- a/libindic/normalizer/tests/test_normalizer.py +++ b/libindic/normalizer/tests/test_normalizer.py @@ -34,10 +34,17 @@ def test_normalize(self): # Remove punctuations self.assertEqual(normalize('1-ാം'), '1ാം') - self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാൎത്തുമ്പി') + self.assertEqual(normalize('1-ാം', keep_punctuations=True), '1-ാം') # Common Typos self.assertEqual(normalize('പൂമ്പാററ'), 'പൂമ്പാറ്റ') + self.assertEqual(normalize('ദു:ഖത്തിന്റെ'), 'ദുഃഖത്തിന്റെ') + self.assertEqual(normalize('ദു:ഖത്തിന്റെ', keep_punctuations=True), + 'ദുഃഖത്തിന്റെ') + + # Alternate Spellings + self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാർത്തുമ്പി') + self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ') def test_multiline_string(self): expected = """കുഞ്ചൻ നമ്പ്യാർ