Add drop duplicates; closes #4 #150

Draft · wants to merge 3 commits into master
33 changes: 33 additions & 0 deletions tests/test_representation.py
@@ -234,3 +234,36 @@ def test_flatten_missing_row(self):
pd.testing.assert_series_equal(
representation.flatten(s, index=s_true.index), s_true, check_names=False
)

"""
Test drop duplicates
"""

def test_drop_duplicates(self):
s = pd.Series(
["I like football", "Hey, watch out", "I like sports", "Cool stuff"]
)
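        # Represent the documents as tf-idf vectors reduced with PCA
        # (flatten is needed until pca handles RepresentationSeries directly).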
s_pca = (
s.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
.pipe(representation.flatten)
.pipe(representation.pca)
)
pd.testing.assert_series_equal(
representation.drop_duplicates(s, s_pca, 1),
pd.Series(
["I like football", "Hey, watch out", "Cool stuff"], index=[0, 1, 3]
),
)

def test_keep_duplicates(self):
s = pd.Series(
["I like football", "Hey, watch out", "I like sports", "Cool stuff"]
)
s_pca = (
s.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
.pipe(representation.flatten)
.pipe(representation.pca)
)
pd.testing.assert_series_equal(representation.drop_duplicates(s, s_pca, 0), s)
86 changes: 85 additions & 1 deletion texthero/representation.py
@@ -10,12 +10,14 @@
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize as sklearn_normalize
from scipy.sparse import coo_matrix

from typing import Optional, Union, Any, List

from texthero import preprocessing
from texthero._types import TextSeries, VectorSeries, RepresentationSeries, InputSeries

import logging
import warnings
@@ -1019,3 +1021,85 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
s_result.index = s.index

return s_result


@InputSeries(TextSeries)
def drop_duplicates(
s: TextSeries,
s_represented: Union[VectorSeries, RepresentationSeries],
    threshold: float = 1,
) -> TextSeries:
"""
Remove duplicates in a series.

    To drop near-duplicate documents from a Series, first represent
    the Series of documents, e.g. with
    :meth:`hero.representation.tfidf`. Then use this function to
    drop the most similar documents according to that representation.

    Internally, Euclidean distance is used to judge similarity.

    The Series s_represented can either be a
    :class:`texthero._types.RepresentationSeries` or a
    :class:`texthero._types.VectorSeries`.

Parameters
----------
s : :class:`texthero._types.TextSeries`
        The Series from which near-duplicate documents are dropped.

s_represented : :class:`texthero._types.RepresentationSeries` or
:class:`texthero._types.VectorSeries`
        The representation by which the similarity between documents is calculated.

    threshold : float, optional, default 1
        Documents whose representation is within this Euclidean distance
        of an already-kept document are dropped as duplicates.

Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"])
>>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series
    >>> # We want to drop duplicates; here "I like sports" and "I like football"
    >>> # are similar enough to be considered one document.
>>> drop_duplicates = hero.drop_duplicates(s, s_pca, 1)
>>> drop_duplicates
0 I like football
1 Hey, watch out
3 Cool stuff
dtype: object

"""
if _check_is_valid_representation(s_represented):
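        # A RepresentationSeries is multiindexed and possibly sparse; convert
        # it to a scipy COO matrix that pairwise_distances can consume.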
        if not pd.api.types.is_sparse(s_represented):
            s_represented = s_represented.astype("Sparse")
        s_represented_for_vectorization = s_represented.sparse.to_coo()[0]

else:
s_represented_for_vectorization = list(s_represented)

    # Compute the pairwise Euclidean distances between all document vectors;
    # the result is a symmetric n x n matrix.
distance_matrix = pairwise_distances(s_represented_for_vectorization)

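    # Collect positions to drop: the list preserves order for s.take below,
    # while the set gives O(1) membership checks inside the loop.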
list_index_remove = []
set_index_remove = set()

for i in range(distance_matrix.shape[0]):
        # If i has already been marked for removal, skip it so that we do not
        # remove a chain of vectors. E.g. for vectors [3], [3.5], [4] and
        # threshold 0.75: without this check, [4] would also be dropped (it is
        # within 0.75 of [3.5]), although it is not a duplicate of the kept
        # vector [3].
if i not in set_index_remove:
            # The matrix is symmetric, so only the larger indexes j > i need
            # to be checked.
for j in range(i + 1, distance_matrix.shape[0]):
if distance_matrix[i][j] <= threshold:
list_index_remove.append(j)
set_index_remove.add(j)

    # Build a boolean mask that keeps every document not marked for removal.
    s_part_to_be_dropped = s.take(list_index_remove)
    drop_mask = ~s.index.isin(s_part_to_be_dropped.index)
    return s[drop_mask]
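
A minimal standalone sketch (not part of this PR's diff) of the chain-avoidance logic above, run on the toy vectors from the comment; it assumes only numpy and scikit-learn:

import numpy as np
from sklearn.metrics import pairwise_distances

vectors = np.array([[3.0], [3.5], [4.0]])
threshold = 0.75
distances = pairwise_distances(vectors)  # symmetric 3 x 3 distance matrix

to_remove = set()
for i in range(distances.shape[0]):
    if i in to_remove:
        continue  # never compare against an already-dropped vector
    for j in range(i + 1, distances.shape[0]):
        if distances[i][j] <= threshold:
            to_remove.add(j)

print(sorted(to_remove))  # [1]: only [3.5] is dropped; [4.0] survives because
                          # it is compared against the kept [3.0] alone.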