OdiaNLP · soumendrak · Aug 2, 2020
diff --git a/controller.py b/controller.py
@@ -19,8 +19,13 @@
 app = Flask(__name__)
 
 
-@app.route('/translate', methods=['GET', 'POST'])
+@app.route('/', methods=['GET'])
 def index():
+    return "It's working! :) "
+
+
+@app.route('/translate', methods=['GET', 'POST'])
+def translate():
     form = InputForm(request.form)
 
     if request.method == 'POST' and form.validate():

diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,4 @@ WTForms==2.2.1
 dill==0.3.0
 nltk==3.4.5
 Flask==1.1.1
+black==19.10b0
diff --git a/src/__init__.py b/src/__init__.py
diff --git a/src/smt_utils.py b/src/smt_utils.py
@@ -1,32 +1,23 @@
 import nltk
 from nltk.translate import AlignedSent
-from nltk.translate.ibm2 import (
-    IBMModel2,
-    Model2Counts
-)
+from nltk.translate.ibm2 import IBMModel2, Model2Counts
 from tqdm import tqdm
 
 
 class IBMModel2WithProgressbar(IBMModel2):
-    def __init__(
-            self,
-            sentence_aligned_corpus,
-            iterations,
-            probability_tables=None
-    ):
+    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
         """
         IBM Model 2 with progress bar for training
         """
         super(IBMModel2WithProgressbar, self).__init__(
-            sentence_aligned_corpus,
-            iterations, probability_tables
+            sentence_aligned_corpus, iterations, probability_tables
         )
 
     def train(self, parallel_corpus):
         counts = Model2Counts()
-        for aligned_sentence in tqdm(parallel_corpus, unit=' samples'):
+        for aligned_sentence in tqdm(parallel_corpus, unit=" samples"):
             src_sentence = [None] + aligned_sentence.mots
-            trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
+            trg_sentence = ["UNUSED"] + aligned_sentence.words  # 1-indexed
             l = len(aligned_sentence.mots)
             m = len(aligned_sentence.words)
 
@@ -49,7 +40,7 @@ def train(self, parallel_corpus):
         self.maximize_alignment_probabilities(counts)
 
 
-def train_ibmmodel2(src_text, trg_text, iterations=5):
+def train_ibmmodel2(src_text, trg_text, iterations=1):
     """
     train IBM model 2
     :param src_text: (list) src text
@@ -73,11 +64,7 @@ def translate(ibm_model, src_tokens):
         probs = ibm_model.translation_table[tok]
         if len(probs) == 0:
             continue
-        sorted_words = sorted(
-            [(k, v) for k, v in probs.items()],
-            key=lambda x: x[1],
-            reverse=True
-        )
+        sorted_words = sorted([(k, v) for k, v in probs.items()], key=lambda x: x[1], reverse=True)
         top_token = sorted_words[1][0]
         if top_token is not None:
             translation_tokens.append(top_token)
@@ -94,4 +81,4 @@ def tokenize_od(sent):
 
 
 def detokenize_od(toks):
-    return ' '.join(toks)
+    return " ".join(toks)
diff --git a/src/train.py b/src/train.py
@@ -3,28 +3,21 @@
 import dill as pickle
 from tqdm import tqdm
 
-from smt_utils import (
-    train_ibmmodel2,
-    translate,
-    tokenize_en,
-    tokenize_od,
-    detokenize_od
-)
+from smt_utils import detokenize_od, tokenize_en, tokenize_od, train_ibmmodel2, translate
 
 if __name__ == "__main__":
-    data_dir = os.path.join(
-        '../data',
-        '01_01_2020'
-    )
+    data_dir = os.path.join("../data", "01_01_2020")
+    print(f"Data directory: {data_dir}")
 
     filepaths = {
-        'train_en': os.path.join(data_dir, 'train.en'),
-        'train_od': os.path.join(data_dir, 'train.od'),
-        'val_en': os.path.join(data_dir, 'val.en'),
-        'val_od': os.path.join(data_dir, 'val.od'),
-        'test_en': os.path.join(data_dir, 'test.en'),
-        'test_od': os.path.join(data_dir, 'test.od'),
+        "train_en": os.path.join(data_dir, "train.en"),
+        "train_od": os.path.join(data_dir, "train.od"),
+        "val_en": os.path.join(data_dir, "val.en"),
+        "val_od": os.path.join(data_dir, "val.od"),
+        "test_en": os.path.join(data_dir, "test.en"),
+        "test_od": os.path.join(data_dir, "test.od"),
     }
+    print(f"File-paths: {filepaths}")
 
     for data_type in filepaths:
         if not os.path.isfile(filepaths[data_type]):
@@ -33,72 +26,55 @@
     text = {}
     for data_type in filepaths:
         filepath = filepaths[data_type]
-        with open(filepath, 'r', encoding='utf-8') as f:
-            text[data_type] = list(
-                map(str.strip, f.readlines())
-            )
+        with open(filepath, "r", encoding="utf-8") as f:
+            text[data_type] = list(map(str.strip, f.readlines()))
 
-    if \
-            (len(text['train_en']) != len(text['train_od'])) or \
-                    (len(text['val_en']) != len(text['val_od'])) or \
-                    (len(text['test_en']) != len(text['test_od'])):
+    if (
+        (len(text["train_en"]) != len(text["train_od"]))
+        or (len(text["val_en"]) != len(text["val_od"]))
+        or (len(text["test_en"]) != len(text["test_od"]))
+    ):
+        print("Length count mismatched between the data types")
         raise AssertionError
 
     text_tokenized = {
-        'train_en': [tokenize_en(sent) for sent in text['train_en']],
-        'train_od': [tokenize_od(sent) for sent in text['train_od']],
-        'val_en': [tokenize_en(sent) for sent in text['val_en']],
-        'test_en': [tokenize_en(sent) for sent in text['test_en']],
+        "train_en": [tokenize_en(sent) for sent in text["train_en"]],
+        "train_od": [tokenize_od(sent) for sent in text["train_od"]],
+        "val_en": [tokenize_en(sent) for sent in text["val_en"]],
+        "test_en": [tokenize_en(sent) for sent in text["test_en"]],
     }
 
     # train IBM model 2
+    print("IBM model training started..")
     ibm_model = train_ibmmodel2(
-        src_text=text_tokenized['train_en'],
-        trg_text=text_tokenized['train_od'],
-        iterations=5
+        src_text=text_tokenized["train_en"], trg_text=text_tokenized["train_od"], iterations=5
     )
+    print("IBM model training completed.")
 
     # dump trained model
-    with open(os.path.join('../models', 'model.pkl'), 'wb') as f:
+    os.makedirs("../models", exist_ok=True)
+    with open(os.path.join("../models", "model.pkl"), "wb") as f:
         pickle.dump(ibm_model, f)
+    print("Models dumped")
 
     # load model from file
-    with open(os.path.join('../models', 'model.pkl'), 'rb') as f:
+    with open(os.path.join("../models", "model.pkl"), "rb") as f:
         ibm_model_loaded = pickle.load(f)
 
     # translate
-    translations = {
-        'train': [],
-        'val': [],
-        'test': []
-    }
+    translations = {"train": [], "val": [], "test": []}
 
     for data_type in translations.keys():
-        for toks in tqdm(text_tokenized[data_type + '_en']):
-            translation_toks = translate(
-                ibm_model=ibm_model_loaded,
-                src_tokens=toks
-            )
+        print(f"Translation process started for data type: {data_type}")
+        for toks in tqdm(text_tokenized[data_type + "_en"]):
+            translation_toks = translate(ibm_model=ibm_model_loaded, src_tokens=toks)
             translation = detokenize_od(translation_toks)
             translations[data_type].append(translation)
 
     # write translations to files
-    translation_filenames = {
-        'train': 'train.out.od',
-        'val': 'val.out.od',
-        'test': 'test.out.od'
-    }
+    translation_filenames = {"train": "train.out.od", "val": "val.out.od", "test": "test.out.od"}
     for data_type in translation_filenames:
         with open(
-                os.path.join(
-                    '../', translation_filenames[data_type]
-                ),
-                'w',
-                encoding='utf-8'
+            os.path.join("../", translation_filenames[data_type]), "w", encoding="utf-8"
         ) as f:
-            f.writelines(
-                list(map(
-                    lambda x: x + '\n',
-                    translations[data_type]
-                ))
-            )
+            f.writelines(list(map(lambda x: x + "\n", translations[data_type])))