diff --git a/.gitignore b/.gitignore
index 7d1c1e1..46b2e6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@ z_sandbox/
 
 # databases
 app/*.sqlite
+app/klang/data_dev
+app/klang/data_test
 
 .idea/
 *.pyc
diff --git a/app/klang/controller.py b/app/klang/controller.py
index 543c4e2..69bb1aa 100644
--- a/app/klang/controller.py
+++ b/app/klang/controller.py
@@ -21,7 +21,8 @@ def get(self) :
 class ConllNameServiceResource(Resource):
     "ConllService"
 
-    def get(self, conll_name) :
+    def get(self, conll_name):
+
         conll_string = ConllService.get_by_name(conll_name)
         sentences_string = ConllService.seperate_conll_sentences(conll_string)
         sentences_audio_token = []
diff --git a/app/klang/data_test/John_Doe/John_Doe.intervals.conll b/app/klang/data_test/John_Doe/John_Doe.intervals.conll
index ad7c2ad..b56309e 100644
--- a/app/klang/data_test/John_Doe/John_Doe.intervals.conll
+++ b/app/klang/data_test/John_Doe/John_Doe.intervals.conll
@@ -1,9 +1,5 @@
 # sent_id = John_Doe.intervals.conll__1
-# text = it is a test conll
-# sound_url = Veronique_Jezewski.wav
-1	it	it	_	_	_	_	_	_	AlignBegin=162330|AlignEnd=162590
-2	is	is	_	_	_	_	_	_	AlignBegin=162590|AlignEnd=162849
-3	a	a	_	_	_	_	_	_	AlignBegin=162849|AlignEnd=163109
-4	test	test	_	_	_	_	_	_	AlignBegin=163368|AlignEnd=163628
-5	que	que	_	_	_	_	_	_	AlignBegin=163628|AlignEnd=163887
-6	conll	conll	_	_	_	_	_	_	AlignBegin=163887|AlignEnd=164147
+# text = it is
+# sound_url = John_Doe.wav
+1	it	it	_	_	_	_	_	_	AlignBegin=100|AlignEnd=500
+2	is	is	_	_	_	_	_	_	AlignBegin=600|AlignEnd=1000
diff --git a/app/klang/service.py b/app/klang/service.py
index ae2a84a..9978a63 100644
--- a/app/klang/service.py
+++ b/app/klang/service.py
@@ -8,6 +8,7 @@
     r"^\d+\t(.+?)\t.*AlignBegin=(\d+).*AlignEnd=(\d+)"
 )
 
+
 class ConllService:
     @staticmethod
     def get_path_data():
@@ -45,11 +46,31 @@ def seperate_conll_sentences(conll_string: str) -> List[str]:
 
     @staticmethod
     def sentence_to_audio_tokens(sentence_string: str):
-        audio_tokens = []
+        """Map each integer token id of a sentence to its token text and alignment bounds."""
+        audio_tokens = {}
         for line in sentence_string.split("\n"):
             if line:
                 if not line.startswith("#"):
                     m = align_begin_and_end_regex.search(line)
-                    audio_tokens += [(m.group(1), int(m.group(2)), int(m.group(3)))]
+                    if m is None:
+                        # No integer id or no AlignBegin/AlignEnd on this line: skip it instead of crashing.
+                        continue
+                    audio_token = {
+                        "token": m.group(1),
+                        "alignBegin": int(m.group(2)),
+                        "alignEnd": int(m.group(3)),
+                    }
+                    audio_tokens[int(line.split("\t")[0])] = audio_token
 
         return audio_tokens
+
+    @staticmethod
+    def process_sentences_audio_token(conll_name: str):
+        """Return a list with one audio-token dict per sentence of the named conll file."""
+        conll_string = ConllService.get_by_name(conll_name)
+        sentences_string = ConllService.seperate_conll_sentences(conll_string)
+        sentences_audio_token = []
+        for sentence_string in sentences_string:
+            audio_tokens = ConllService.sentence_to_audio_tokens(sentence_string)
+            sentences_audio_token.append(audio_tokens)
+        return sentences_audio_token
diff --git a/app/klang/service_test.py b/app/klang/service_test.py
index cf98b4e..843934e 100644
--- a/app/klang/service_test.py
+++ b/app/klang/service_test.py
@@ -13,32 +13,46 @@ def test_get_path_data():
     print("KK path_data", path_data)
     assert os.path.isdir(path_data)
 
+
 def test_get_path_conll():
     path_conll = ConllService.get_path_conll(file_name)
     assert os.path.isfile(path_conll)
 
+
 def test_read_conll():
     path_conll = ConllService.get_path_conll(file_name)
     conll_string = ConllService.read_conll(path_conll)
     assert type(conll_string) == str
 
+
 def test_get_all():
     conlls = ConllService.get_all()
     assert conlls == ["John_Doe"]
 
+
 def test_get_by_name():
     path_conll = ConllService.get_path_conll(file_name)
     conll_string = ConllService.read_conll(path_conll)
     assert conll_string == ConllService.get_by_name(file_name)
 
+
 def test_seperate_conll_sentences():
     conll_string = "# sent_id = test_sentence_1\n1\ttest_token\ntest_lemma\ntest_upos\n\n# sent_id = test_sentence_2\n1\ttest_token\ntest_lemma\ntest_upos\n\n"
     sentences = ConllService.seperate_conll_sentences(conll_string)
     assert len(sentences) == 2
 
+
 def test_sentence_to_audio_tokens():
     sentence = "# sent_id = test_sentence_1\n1\tdonc\tdonc\t_\t_\t_\t_\t_\t_\tAlignBegin=0|AlignEnd=454"
     audio_tokens = ConllService.sentence_to_audio_tokens(sentence)
-    assert audio_tokens[0][0] == "donc"
-    assert audio_tokens[0][1] == 0
-    assert audio_tokens[0][2] == 454
+    assert audio_tokens[1]["token"] == "donc"
+    assert audio_tokens[1]["alignBegin"] == 0
+    assert audio_tokens[1]["alignEnd"] == 454
+
+
+def test_process_sentences_audio_token():
+    sentences_audio_token = ConllService.process_sentences_audio_token(file_name)
+    assert sentences_audio_token == [{
+        1: {"token": "it", "alignBegin": 100, "alignEnd": 500},
+        2: {"token": "is", "alignBegin": 600, "alignEnd": 1000}
+    }]