From 0b8a828feb374d3baaa14c531b000496914850b7 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 10 Aug 2020 22:04:34 +0200 Subject: [PATCH 1/3] drop duplicates implemented missing: test and comments Co-authored-by: Henri Froese --- texthero/representation.py | 82 +++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index ba6ebddb..01462a07 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -10,12 +10,14 @@ from sklearn.decomposition import PCA, NMF from sklearn.cluster import KMeans, DBSCAN, MeanShift from sklearn.metrics.pairwise import cosine_similarity +from sklearn.metrics import pairwise_distances from sklearn.preprocessing import normalize as sklearn_normalize from scipy.sparse import coo_matrix -from typing import Optional, Union, Any +from typing import Optional, Union, Any, List from texthero import preprocessing +from texthero._types import TextSeries, VectorSeries, RepresentationSeries, InputSeries import logging import warnings @@ -1019,3 +1021,81 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: s_result.index = s.index return s_result +@InputSeries(TextSeries) +def drop_duplicates( + s: TextSeries, + s_represented: Union[VectorSeries, RepresentationSeries], + threshold=1, +) -> TextSeries: + """ + Return the most similar vectors in s to the given vector. + + To find the most similar documents to a document, first represent + the Pandas Series with the documents, e.g. with + :meth:`hero.representation.tfidf`_ . Then use this function + to find the most similar documents according to the representation. + Similar vectors are returned sorted by similarity descending. + + Internally, euclidian distance is used to judge similarity. + + Series s can either be a :class:`texthero._types.RepresentationSeries` + or a :class:`texthero._types.VectorSeries`. 
+ + Parameters + ---------- + s : :class:`texthero._types.TextSeries` + The Series in which we want to find similar documents. + + s_represented : :class:`texthero._types.RepresentationSeries` or + :class:`texthero._types.VectorSeries` + The Series by which the similarity is calculated. + + vector : List[float] + The vector to which we want to find the most similar documents. + + max_number: int or None, default 100 + Maximum amount of indexes of similar documents to return. + If None, returns all . + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"]) + >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series + >>> # want to find the two most similar to "I like football", which has index 0 + >>> s_most_similar = hero.most_similar(s, s_pca, s_pca[0], max_number=2) + >>> s_most_similar + 0 I like football + 2 I like sports + dtype: object + + """ + if _check_is_valid_representation(s_represented): + if pd.api.types.is_sparse(s_represented): + s_represented_coo_matrix = s_represented.sparse.to_coo()[0] + else: + s_represented = s_represented.astype("Sparse") + s_represented_coo_matrix = s_represented.sparse.to_coo()[0] + + s_represented_for_vectorization = s_represented_coo_matrix + + else: + s_represented_for_vectorization = list(s_represented) + + distance_matrix = pairwise_distances( + s_represented_for_vectorization + ) + + list_index_remove = [] + set_index_remove = set() + for i in range(distance_matrix.shape[0]): + if i not in set_index_remove: + for j in range(i+1, distance_matrix.shape[0]): + if distance_matrix[i][j] <= threshold: + list_index_remove.append(j) + set_index_remove.add(j) + + s_part_will_be_droped = s.take(list_index_remove) + drop_mask = ~s.index.isin(s_part_will_be_droped.index) + return s[drop_mask] \ No newline at end of file 
From a47bef5d444ddd2f759d9391c27f912726a5c3b8 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 10 Aug 2020 22:26:28 +0200 Subject: [PATCH 2/3] added comments missing: tests --- texthero/representation.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 01462a07..632e7994 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1028,13 +1028,12 @@ def drop_duplicates( threshold=1, ) -> TextSeries: """ - Return the most similar vectors in s to the given vector. + Remove duplicates in a series. - To find the most similar documents to a document, first represent + To drop the most similar documents from a series, first represent the Pandas Series with the documents, e.g. with :meth:`hero.representation.tfidf`_ . Then use this function - to find the most similar documents according to the representation. - Similar vectors are returned sorted by similarity descending. + to drop the most similar documents according to the representation. Internally, euclidian distance is used to judge similarity. Series s can either be a :class:`texthero._types.RepresentationSeries` or a :class:`texthero._types.VectorSeries`. @@ -1050,12 +1049,8 @@ def drop_duplicates( :class:`texthero._types.VectorSeries` The Series by which the similarity is calculated. - vector : List[float] - The vector to which we want to find the most similar documents. - - max_number: int or None, default 100 - Maximum amount of indexes of similar documents to return. - If None, returns all . + threshold: float + The threshold by which it is judged how similar two documents are Examples -------- >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"]) >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. 
Representation Series - >>> # want to find the two most similar to "I like football", which has index 0 - >>> s_most_similar = hero.most_similar(s, s_pca, s_pca[0], max_number=2) - >>> s_most_similar - 0 I like football - 2 I like sports + >>> # want to remove a duplicate, in this case "I like sports" and "I like football" are + >>> # considered as one + >>> drop_duplicates = hero.drop_duplicates(s, s_pca, 1) + >>> 0 I like football + 1 Hey, watch out + 3 Cool stuff dtype: object """ if _check_is_valid_representation(s_represented): if pd.api.types.is_sparse(s_represented): s_represented_coo_matrix = s_represented.sparse.to_coo()[0] else: s_represented = s_represented.astype("Sparse") s_represented_coo_matrix = s_represented.sparse.to_coo()[0] s_represented_for_vectorization = s_represented_coo_matrix else: s_represented_for_vectorization = list(s_represented) + # calculating the distance between those vectors and returns the distances saved in a matrix distance_matrix = pairwise_distances( s_represented_for_vectorization ) list_index_remove = [] set_index_remove = set() + for i in range(distance_matrix.shape[0]): + # if i is in the remove set, then we want to ignore it, so we won't remove a chain of vectors; e.g.: + # {[3], [3.5], [4]} and threshold is 0.75, then without it, we would remove the last two, so we just remove + # the second vector. 
if i not in set_index_remove: + # as matrix is symmetric, we just need to take care of the 'bigger' indexes for j in range(i+1, distance_matrix.shape[0]): if distance_matrix[i][j] <= threshold: list_index_remove.append(j) set_index_remove.add(j) + # convert list to pandas series, in order to use the beauty of masks s_part_will_be_droped = s.take(list_index_remove) drop_mask = ~s.index.isin(s_part_will_be_droped.index) return s[drop_mask] \ No newline at end of file From 2af68889859ac53c134881853ed657afe5dbdf6a Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 11 Aug 2020 21:02:39 +0200 Subject: [PATCH 3/3] added unit tests --- tests/test_representation.py | 33 +++++++++++++++++++++++++++++++++ texthero/representation.py | 15 ++++++++------- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 036775af..c36c93df 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -234,3 +234,36 @@ def test_flatten_missing_row(self): pd.testing.assert_series_equal( representation.flatten(s, index=s_true.index), s_true, check_names=False ) + + """ + Test drop duplicates + """ + + def test_drop_duplicates(self): + s = pd.Series( + ["I like football", "Hey, watch out", "I like sports", "Cool stuff"] + ) + s_pca = ( + s.pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + .pipe(representation.flatten) + .pipe(representation.pca) + ) + pd.testing.assert_series_equal( + representation.drop_duplicates(s, s_pca, 1), + pd.Series( + ["I like football", "Hey, watch out", "Cool stuff"], index=[0, 1, 3] + ), + ) + + def test_keep_duplicates(self): + s = pd.Series( + ["I like football", "Hey, watch out", "I like sports", "Cool stuff"] + ) + s_pca = ( + s.pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + .pipe(representation.flatten) + .pipe(representation.pca) + ) + pd.testing.assert_series_equal(representation.drop_duplicates(s, s_pca, 0), s) diff --git 
a/texthero/representation.py b/texthero/representation.py index 632e7994..86046b57 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1021,6 +1021,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: s_result.index = s.index return s_result + + @InputSeries(TextSeries) def drop_duplicates( s: TextSeries, @@ -1061,7 +1063,8 @@ def drop_duplicates( >>> # want to remove a duplicate, in this case "I like sports" and "I like football" are >>> # considered as one >>> drop_duplicates = hero.drop_duplicates(s, s_pca, 1) - >>> 0 I like football + >>> drop_duplicates + 0 I like football 1 Hey, watch out 3 Cool stuff dtype: object @@ -1080,10 +1083,8 @@ def drop_duplicates( s_represented_for_vectorization = list(s_represented) # calculating the distance between those vectors and returns the distances saved in a matrix - distance_matrix = pairwise_distances( - s_represented_for_vectorization - ) - + distance_matrix = pairwise_distances(s_represented_for_vectorization) + list_index_remove = [] set_index_remove = set() @@ -1093,7 +1094,7 @@ def drop_duplicates( # the second vector. if i not in set_index_remove: # as matrix is symmetric, we just need to take care of the 'bigger' indexes - for j in range(i+1, distance_matrix.shape[0]): + for j in range(i + 1, distance_matrix.shape[0]): if distance_matrix[i][j] <= threshold: list_index_remove.append(j) set_index_remove.add(j) @@ -1101,4 +1102,4 @@ def drop_duplicates( # convert list to pandas series, in order to use the beauty of masks s_part_will_be_droped = s.take(list_index_remove) drop_mask = ~s.index.isin(s_part_will_be_droped.index) - return s[drop_mask] \ No newline at end of file + return s[drop_mask]