diff --git a/expats/feature/text_basics.py b/expats/feature/text_basics.py
index 5f82a31..f6967c9 100644
--- a/expats/feature/text_basics.py
+++ b/expats/feature/text_basics.py
@@ -14,6 +14,12 @@ def extract(self, _input):
         return np.array([len(_input)])
 
 
+@Feature.register
+class NumberOfTokenPerSentFeature(Feature[spacy.tokens.doc.Doc]):
+    def extract(self, _input):
+        return np.array([len(_input) / len(list(_input.sents))])
+
+
 @Feature.register
 class AverageTokenLengthFeature(Feature[spacy.tokens.doc.Doc]):
     def extract(self, _input):
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..c60680b
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,8 @@
+
+import spacy
+import pytest
+
+
+@pytest.fixture(scope="session")
+def spacy_en():
+    yield spacy.load("en_core_web_sm")
diff --git a/tests/feature/test_text_basics.py b/tests/feature/test_text_basics.py
index 4d7d9da..0ed6c1c 100644
--- a/tests/feature/test_text_basics.py
+++ b/tests/feature/test_text_basics.py
@@ -1,53 +1,60 @@
-from typing import List
 import numpy as np
-import spacy
 import pytest
 
 from expats.feature.text_basics import (
     NumberOfTokenFeature,
+    NumberOfTokenPerSentFeature,
     AverageTokenLengthFeature,
     UnigramLikelihoodFeature,
 )
 
 
-def _create_spacy_doc(words: List[str]) -> spacy.tokens.doc.Doc:
-    return spacy.tokens.doc.Doc(spacy.vocab.Vocab(), words=words)
+@pytest.mark.parametrize(
+    "text, expected_value",
+    [
+        ("i am here", 3),
+    ]
+)
+def test_number_of_token_feature(spacy_en, text, expected_value):
+    doc = spacy_en(text)
+    feature = NumberOfTokenFeature()
+    np.testing.assert_array_equal(feature.extract(doc), np.array([expected_value]))
 
 
 @pytest.mark.parametrize(
-    "words, expected_value",
+    "text, expected_value",
     [
-        (["i", "am", "here"], 3),
+        ("This is foo. \n That is foo bar.", (9 / 2)),
     ]
 )
-def test_number_of_token_feature(words, expected_value):
-    doc = _create_spacy_doc(words)
-    feature = NumberOfTokenFeature()
+def test_number_of_token_per_sent_feature(spacy_en, text, expected_value):
+    doc = spacy_en(text)
+    feature = NumberOfTokenPerSentFeature()
     np.testing.assert_array_equal(feature.extract(doc), np.array([expected_value]))
 
 
 @pytest.mark.parametrize(
-    "words, expected_value",
+    "text, expected_value",
     [
-        (["i", "am", "here"], 7 / 3),
-        (["a", "ab", "b"], 4 / 3)
+        ("i am here", 7 / 3),
+        ("a ab b", 4 / 3)
     ]
 )
-def test_average_token_length_feature(words, expected_value):
-    doc = _create_spacy_doc(words)
+def test_average_token_length_feature(spacy_en, text, expected_value):
+    doc = spacy_en(text)
     feature = AverageTokenLengthFeature()
     np.testing.assert_array_equal(feature.extract(doc), np.array([expected_value]))
 
 
 @pytest.mark.parametrize(
-    "words, word2freq, expected_value",
+    "text, word2freq, expected_value",
     [
-        (["i", "am"], {"i": 4, "am": 3, "is": 2}, (np.log(4 / 9) + np.log(3 / 9)) / 2),
-        (["i", "are"], {"i": 4, "am": 3, "is": 2}, (np.log(4 / 9) + np.log(1 / 9)) / 2),  # NOTE: OOV case
+        ("i am", {"i": 4, "am": 3, "is": 2}, (np.log(4 / 9) + np.log(3 / 9)) / 2),
+        ("i are", {"i": 4, "am": 3, "is": 2}, (np.log(4 / 9) + np.log(1 / 9)) / 2),  # NOTE: OOV case
     ]
 )
-def test_unigram_likelihood_feature(words, word2freq, expected_value):
-    doc = _create_spacy_doc(words)
+def test_unigram_likelihood_feature(spacy_en, text, word2freq, expected_value):
+    doc = spacy_en(text)
     feature = UnigramLikelihoodFeature(word2freq)
     np.testing.assert_array_equal(feature.extract(doc), np.array([expected_value]))
 