Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@
app = Flask(__name__)


@app.route('/translate', methods=['GET', 'POST'])
@app.route('/', methods=['GET'])
def index():
    """Return a fixed confirmation message for GET requests to the root URL."""
    message = "It's working! :) "
    return message


@app.route('/translate', methods=['GET', 'POST'])
def translate():
form = InputForm(request.form)

if request.method == 'POST' and form.validate():
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ WTForms==2.2.1
dill==0.3.0
nltk==3.4.5
Flask==1.1.1
black==19.10b0
Empty file added src/__init__.py
Empty file.
29 changes: 8 additions & 21 deletions src/smt_utils.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,23 @@
import nltk
from nltk.translate import AlignedSent
from nltk.translate.ibm2 import (
IBMModel2,
Model2Counts
)
from nltk.translate.ibm2 import IBMModel2, Model2Counts
from tqdm import tqdm


class IBMModel2WithProgressbar(IBMModel2):
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
    """
    IBM Model 2 with progress bar for training

    All arguments are forwarded unchanged to ``IBMModel2.__init__``.

    :param sentence_aligned_corpus: passed through to the parent constructor
    :param iterations: passed through to the parent constructor
    :param probability_tables: passed through to the parent constructor
        (defaults to None)
    """
    # Single, well-formed signature and super() call: the previous text
    # carried both the multi-line and the single-line variants at once.
    super(IBMModel2WithProgressbar, self).__init__(
        sentence_aligned_corpus, iterations, probability_tables
    )

def train(self, parallel_corpus):
counts = Model2Counts()
for aligned_sentence in tqdm(parallel_corpus, unit=' samples'):
for aligned_sentence in tqdm(parallel_corpus, unit=" samples"):
src_sentence = [None] + aligned_sentence.mots
trg_sentence = ['UNUSED'] + aligned_sentence.words # 1-indexed
trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)

Expand All @@ -49,7 +40,7 @@ def train(self, parallel_corpus):
self.maximize_alignment_probabilities(counts)


def train_ibmmodel2(src_text, trg_text, iterations=5):
def train_ibmmodel2(src_text, trg_text, iterations=1):
"""
train IBM model 2
:param src_text: (list) src text
Expand All @@ -73,11 +64,7 @@ def translate(ibm_model, src_tokens):
probs = ibm_model.translation_table[tok]
if len(probs) == 0:
continue
sorted_words = sorted(
[(k, v) for k, v in probs.items()],
key=lambda x: x[1],
reverse=True
)
sorted_words = sorted([(k, v) for k, v in probs.items()], key=lambda x: x[1], reverse=True)
top_token = sorted_words[1][0]
if top_token is not None:
translation_tokens.append(top_token)
Expand All @@ -94,4 +81,4 @@ def tokenize_od(sent):


def detokenize_od(toks):
    """
    Join tokens into a single space-separated string.

    :param toks: iterable of str tokens
    :return: (str) the tokens joined by a single space; "" when toks is empty
    """
    # Only one return is kept; the second, identical return was unreachable.
    return " ".join(toks)
96 changes: 36 additions & 60 deletions src/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,21 @@
import dill as pickle
from tqdm import tqdm

from smt_utils import (
train_ibmmodel2,
translate,
tokenize_en,
tokenize_od,
detokenize_od
)
from smt_utils import detokenize_od, tokenize_en, tokenize_od, train_ibmmodel2, translate

if __name__ == "__main__":
data_dir = os.path.join(
'../data',
'01_01_2020'
)
data_dir = os.path.join("../data", "01_01_2020")
print(f"Data directory: {data_dir}")

filepaths = {
'train_en': os.path.join(data_dir, 'train.en'),
'train_od': os.path.join(data_dir, 'train.od'),
'val_en': os.path.join(data_dir, 'val.en'),
'val_od': os.path.join(data_dir, 'val.od'),
'test_en': os.path.join(data_dir, 'test.en'),
'test_od': os.path.join(data_dir, 'test.od'),
"train_en": os.path.join(data_dir, "train.en"),
"train_od": os.path.join(data_dir, "train.od"),
"val_en": os.path.join(data_dir, "val.en"),
"val_od": os.path.join(data_dir, "val.od"),
"test_en": os.path.join(data_dir, "test.en"),
"test_od": os.path.join(data_dir, "test.od"),
}
print(f"File-paths: {filepaths}")

for data_type in filepaths:
if not os.path.isfile(filepaths[data_type]):
Expand All @@ -33,72 +26,55 @@
text = {}
for data_type in filepaths:
filepath = filepaths[data_type]
with open(filepath, 'r', encoding='utf-8') as f:
text[data_type] = list(
map(str.strip, f.readlines())
)
with open(filepath, "r", encoding="utf-8") as f:
text[data_type] = list(map(str.strip, f.readlines()))

if \
(len(text['train_en']) != len(text['train_od'])) or \
(len(text['val_en']) != len(text['val_od'])) or \
(len(text['test_en']) != len(text['test_od'])):
if (
(len(text["train_en"]) != len(text["train_od"]))
or (len(text["val_en"]) != len(text["val_od"]))
or (len(text["test_en"]) != len(text["test_od"]))
):
print("Length count mismatched between the data types")
raise AssertionError

text_tokenized = {
'train_en': [tokenize_en(sent) for sent in text['train_en']],
'train_od': [tokenize_od(sent) for sent in text['train_od']],
'val_en': [tokenize_en(sent) for sent in text['val_en']],
'test_en': [tokenize_en(sent) for sent in text['test_en']],
"train_en": [tokenize_en(sent) for sent in text["train_en"]],
"train_od": [tokenize_od(sent) for sent in text["train_od"]],
"val_en": [tokenize_en(sent) for sent in text["val_en"]],
"test_en": [tokenize_en(sent) for sent in text["test_en"]],
}

# train IBM model 2
print("IBM model training started..")
ibm_model = train_ibmmodel2(
src_text=text_tokenized['train_en'],
trg_text=text_tokenized['train_od'],
iterations=5
src_text=text_tokenized["train_en"], trg_text=text_tokenized["train_od"], iterations=5
)
print("IBM model training completed.")

# dump trained model
with open(os.path.join('../models', 'model.pkl'), 'wb') as f:
os.makedirs("../models", exist_ok=True)
with open(os.path.join("../models", "model.pkl"), "wb") as f:
pickle.dump(ibm_model, f)
print("Models dumped")

# load model from file
with open(os.path.join('../models', 'model.pkl'), 'rb') as f:
with open(os.path.join("../models", "model.pkl"), "rb") as f:
ibm_model_loaded = pickle.load(f)

# translate
translations = {
'train': [],
'val': [],
'test': []
}
translations = {"train": [], "val": [], "test": []}

for data_type in translations.keys():
for toks in tqdm(text_tokenized[data_type + '_en']):
translation_toks = translate(
ibm_model=ibm_model_loaded,
src_tokens=toks
)
print(f"Translation process started for data type: {data_type}")
for toks in tqdm(text_tokenized[data_type + "_en"]):
translation_toks = translate(ibm_model=ibm_model_loaded, src_tokens=toks)
translation = detokenize_od(translation_toks)
translations[data_type].append(translation)

# write translations to files
translation_filenames = {
'train': 'train.out.od',
'val': 'val.out.od',
'test': 'test.out.od'
}
translation_filenames = {"train": "train.out.od", "val": "val.out.od", "test": "test.out.od"}
for data_type in translation_filenames:
with open(
os.path.join(
'../', translation_filenames[data_type]
),
'w',
encoding='utf-8'
os.path.join("../", translation_filenames[data_type]), "w", encoding="utf-8"
) as f:
f.writelines(
list(map(
lambda x: x + '\n',
translations[data_type]
))
)
f.writelines(list(map(lambda x: x + "\n", translations[data_type])))