From 0b8a828feb374d3baaa14c531b000496914850b7 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 10 Aug 2020 22:04:34 +0200 Subject: [PATCH 1/3] drop duplicates implemented missing: test and comments Co-authored-by: Henri Froese --- texthero/representation.py | 82 +++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/texthero/representation.py b/texthero/representation.py index ba6ebddb..01462a07 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -10,12 +10,14 @@ from sklearn.decomposition import PCA, NMF from sklearn.cluster import KMeans, DBSCAN, MeanShift from sklearn.metrics.pairwise import cosine_similarity +from sklearn.metrics import pairwise_distances from sklearn.preprocessing import normalize as sklearn_normalize from scipy.sparse import coo_matrix -from typing import Optional, Union, Any +from typing import Optional, Union, Any, List from texthero import preprocessing +from texthero._types import TextSeries, VectorSeries, RepresentationSeries, InputSeries import logging import warnings @@ -1019,3 +1021,81 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: s_result.index = s.index return s_result +@InputSeries(TextSeries) +def drop_duplicates( + s: TextSeries, + s_represented: Union[VectorSeries, RepresentationSeries], + threshold=1, +) -> TextSeries: + """ + Return the most similar vectors in s to the given vector. + + To find the most similar documents to a document, first represent + the Pandas Series with the documents, e.g. with + :meth:`hero.representation.tfidf`_ . Then use this function + to find the most similar documents according to the representation. + Similar vectors are returned sorted by similarity descending. + + Internally, euclidian distance is used to judge similarity. + + Series s can either be a :class:`texthero._types.RepresentationSeries` + or a :class:`texthero._types.VectorSeries`. 
+ + Parameters + ---------- + s : :class:`texthero._types.TextSeries` + The Series in which we want to find similar documents. + + s_represented : :class:`texthero._types.RepresentationSeries` or + :class:`texthero._types.VectorSeries` + The Series by which the similarity is calculated. + + vector : List[float] + The vector to which we want to find the most similar documents. + + max_number: int or None, default 100 + Maximum amount of indexes of similar documents to return. + If None, returns all . + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"]) + >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series + >>> # want to find the two most similar to "I like football", which has index 0 + >>> s_most_similar = hero.most_similar(s, s_pca, s_pca[0], max_number=2) + >>> s_most_similar + 0 I like football + 2 I like sports + dtype: object + + """ + if _check_is_valid_representation(s_represented): + if pd.api.types.is_sparse(s_represented): + s_represented_coo_matrix = s_represented.sparse.to_coo()[0] + else: + s_represented = s_represented.astype("Sparse") + s_represented_coo_matrix = s_represented.sparse.to_coo()[0] + + s_represented_for_vectorization = s_represented_coo_matrix + + else: + s_represented_for_vectorization = list(s_represented) + + distance_matrix = pairwise_distances( + s_represented_for_vectorization + ) + + list_index_remove = [] + set_index_remove = set() + for i in range(distance_matrix.shape[0]): + if i not in set_index_remove: + for j in range(i+1, distance_matrix.shape[0]): + if distance_matrix[i][j] <= threshold: + list_index_remove.append(j) + set_index_remove.add(j) + + s_part_will_be_droped = s.take(list_index_remove) + drop_mask = ~s.index.isin(s_part_will_be_droped.index) + return s[drop_mask] \ No newline at end of file 
From a47bef5d444ddd2f759d9391c27f912726a5c3b8 Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Mon, 10 Aug 2020 22:26:28 +0200 Subject: [PATCH 2/3] added comments missing: tests --- texthero/representation.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/texthero/representation.py b/texthero/representation.py index 01462a07..632e7994 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1028,13 +1028,12 @@ def drop_duplicates( threshold=1, ) -> TextSeries: """ - Return the most similar vectors in s to the given vector. + Remove duplicates in a series. - To find the most similar documents to a document, first represent + To drop the most similar documents from a series, first represent the Pandas Series with the documents, e.g. with :meth:`hero.representation.tfidf`_ . Then use this function - to find the most similar documents according to the representation. - Similar vectors are returned sorted by similarity descending. + to drop the most similar documents according to the representation. Internally, euclidian distance is used to judge similarity. Series s can either be a :class:`texthero._types.RepresentationSeries` or a :class:`texthero._types.VectorSeries`. @@ -1050,12 +1049,8 @@ def drop_duplicates( :class:`texthero._types.VectorSeries` The Series by which the similarity is calculated. - vector : List[float] - The vector to which we want to find the most similar documents. - - max_number: int or None, default 100 - Maximum amount of indexes of similar documents to return. - If None, returns all . + threshold: float + The threshold by which it is judged how similar two documents are Examples -------- >>> import texthero as hero >>> import pandas as pd >>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"]) >>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. 
Representation Series - >>> # want to find the two most similar to "I like football", which has index 0 - >>> s_most_similar = hero.most_similar(s, s_pca, s_pca[0], max_number=2) - >>> s_most_similar - 0 I like football - 2 I like sports + >>> # want to remove a duplicate, in this case "I like sports" and "I like football" are + >>> # considered as one + >>> drop_duplicates = hero.drop_duplicates(s, s_pca, 1) + >>> 0 I like football + 1 Hey, watch out + 3 Cool stuff dtype: object """ if _check_is_valid_representation(s_represented): if pd.api.types.is_sparse(s_represented): s_represented_coo_matrix = s_represented.sparse.to_coo()[0] else: s_represented = s_represented.astype("Sparse") s_represented_coo_matrix = s_represented.sparse.to_coo()[0] s_represented_for_vectorization = s_represented_coo_matrix else: s_represented_for_vectorization = list(s_represented) + # calculating the distance between those vectors and returns the distances saved in a matrix distance_matrix = pairwise_distances( s_represented_for_vectorization ) list_index_remove = [] set_index_remove = set() + for i in range(distance_matrix.shape[0]): + # if i is in the remove set, then we want to ignore it, so we won't remove a chain of vectors; e.g.: + # {[3], [3.5], [4]} and threshold is 0.75, then without it, we would remove the last two, so we just remove + # the second vector. 
if i not in set_index_remove: + # as matrix is symmetric, we just need to take care of the 'bigger' indexes for j in range(i+1, distance_matrix.shape[0]): if distance_matrix[i][j] <= threshold: list_index_remove.append(j) set_index_remove.add(j) + # convert list to pandas series, in order to use the beauty of masks s_part_will_be_droped = s.take(list_index_remove) drop_mask = ~s.index.isin(s_part_will_be_droped.index) return s[drop_mask] \ No newline at end of file From 2af68889859ac53c134881853ed657afe5dbdf6a Mon Sep 17 00:00:00 2001 From: Maximilian Krahn Date: Tue, 11 Aug 2020 21:02:39 +0200 Subject: [PATCH 3/3] added unit tests --- tests/test_representation.py | 33 +++++++++++++++++++++++++++++++++ texthero/representation.py | 15 ++++++++------- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 036775af..c36c93df 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -234,3 +234,36 @@ def test_flatten_missing_row(self): pd.testing.assert_series_equal( representation.flatten(s, index=s_true.index), s_true, check_names=False ) + + """ + Test drop duplicates + """ + + def test_drop_duplicates(self): + s = pd.Series( + ["I like football", "Hey, watch out", "I like sports", "Cool stuff"] + ) + s_pca = ( + s.pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + .pipe(representation.flatten) + .pipe(representation.pca) + ) + pd.testing.assert_series_equal( + representation.drop_duplicates(s, s_pca, 1), + pd.Series( + ["I like football", "Hey, watch out", "Cool stuff"], index=[0, 1, 3] + ), + ) + + def test_keep_duplicates(self): + s = pd.Series( + ["I like football", "Hey, watch out", "I like sports", "Cool stuff"] + ) + s_pca = ( + s.pipe(preprocessing.tokenize) + .pipe(representation.tfidf) + .pipe(representation.flatten) + .pipe(representation.pca) + ) + pd.testing.assert_series_equal(representation.drop_duplicates(s, s_pca, 0), s) diff --git 
a/texthero/representation.py b/texthero/representation.py index 632e7994..86046b57 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -1021,6 +1021,8 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series: s_result.index = s.index return s_result + + @InputSeries(TextSeries) def drop_duplicates( s: TextSeries, @@ -1061,7 +1063,8 @@ def drop_duplicates( >>> # want to remove a duplicate, in this case "I like sports" and "I like football" are >>> # considered as one >>> drop_duplicates = hero.drop_duplicates(s, s_pca, 1) - >>> 0 I like football + >>> drop_duplicates + 0 I like football 1 Hey, watch out 3 Cool stuff dtype: object @@ -1080,10 +1083,8 @@ def drop_duplicates( s_represented_for_vectorization = list(s_represented) # calculating the distance between those vectors and returns the distances saved in a matrix - distance_matrix = pairwise_distances( - s_represented_for_vectorization - ) - + distance_matrix = pairwise_distances(s_represented_for_vectorization) + list_index_remove = [] set_index_remove = set() @@ -1093,7 +1094,7 @@ def drop_duplicates( # the second vector. if i not in set_index_remove: # as matrix is symmetric, we just need to take care of the 'bigger' indexes - for j in range(i+1, distance_matrix.shape[0]): + for j in range(i + 1, distance_matrix.shape[0]): if distance_matrix[i][j] <= threshold: list_index_remove.append(j) set_index_remove.add(j) @@ -1101,4 +1102,4 @@ def drop_duplicates( # convert list to pandas series, in order to use the beauty of masks s_part_will_be_droped = s.take(list_index_remove) drop_mask = ~s.index.isin(s_part_will_be_droped.index) - return s[drop_mask] \ No newline at end of file + return s[drop_mask]