Add drop duplicates; closes #4 #150

Draft · wants to merge 3 commits into master
33 changes: 33 additions & 0 deletions tests/test_representation.py
@@ -234,3 +234,36 @@ def test_flatten_missing_row(self):
pd.testing.assert_series_equal(
representation.flatten(s, index=s_true.index), s_true, check_names=False
)

"""
Test drop duplicates
"""

def test_drop_duplicates(self):
s = pd.Series(
["I like football", "Hey, watch out", "I like sports", "Cool stuff"]
)
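        # Represent the documents as tf-idf vectors reduced with PCA
        # (flatten is needed until pca handles RepresentationSeries directly).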
s_pca = (
s.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
.pipe(representation.flatten)
.pipe(representation.pca)
)
pd.testing.assert_series_equal(
representation.drop_duplicates(s, s_pca, 1),
pd.Series(
["I like football", "Hey, watch out", "Cool stuff"], index=[0, 1, 3]
),
)

def test_keep_duplicates(self):
s = pd.Series(
["I like football", "Hey, watch out", "I like sports", "Cool stuff"]
)
s_pca = (
s.pipe(preprocessing.tokenize)
.pipe(representation.tfidf)
.pipe(representation.flatten)
.pipe(representation.pca)
)
pd.testing.assert_series_equal(representation.drop_duplicates(s, s_pca, 0), s)
86 changes: 85 additions & 1 deletion texthero/representation.py
@@ -10,12 +10,14 @@
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize as sklearn_normalize
from scipy.sparse import coo_matrix

from typing import Optional, Union, Any, List

from texthero import preprocessing
from texthero._types import TextSeries, VectorSeries, RepresentationSeries, InputSeries

import logging
import warnings
@@ -1019,3 +1021,85 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
s_result.index = s.index

return s_result


@InputSeries(TextSeries)
def drop_duplicates(
s: TextSeries,
s_represented: Union[VectorSeries, RepresentationSeries],
    threshold: float = 1,
) -> TextSeries:
"""
Remove duplicates in a series.

    To drop near-duplicate documents from a Series, first represent
    the Series of documents, e.g. with
    :meth:`hero.representation.tfidf`. Then use this function to
    drop the most similar documents according to that representation.

    Internally, Euclidean distance is used to judge similarity.

    The Series s_represented can either be a
    :class:`texthero._types.RepresentationSeries` or a
    :class:`texthero._types.VectorSeries`.

Parameters
----------
s : :class:`texthero._types.TextSeries`
        The Series from which near-duplicate documents are dropped.

s_represented : :class:`texthero._types.RepresentationSeries` or
:class:`texthero._types.VectorSeries`
        The representation by which the similarity between documents is calculated.

    threshold : float, optional, default 1
        Documents whose representation is within this Euclidean distance
        of an already-kept document are dropped as duplicates.

Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"])
>>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series
    >>> # We want to drop duplicates; here "I like sports" and "I like football"
    >>> # are similar enough to be considered one document.
>>> drop_duplicates = hero.drop_duplicates(s, s_pca, 1)
>>> drop_duplicates
0 I like football
1 Hey, watch out
3 Cool stuff
dtype: object

"""
if _check_is_valid_representation(s_represented):
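        # A RepresentationSeries is multiindexed and possibly sparse; convert
        # it to a scipy COO matrix that pairwise_distances can consume.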
        if not pd.api.types.is_sparse(s_represented):
            s_represented = s_represented.astype("Sparse")
        s_represented_for_vectorization = s_represented.sparse.to_coo()[0]

else:
s_represented_for_vectorization = list(s_represented)

    # Compute the pairwise Euclidean distances between all document vectors;
    # the result is a symmetric n x n matrix.
distance_matrix = pairwise_distances(s_represented_for_vectorization)

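    # Collect positions to drop: the list preserves order for s.take below,
    # while the set gives O(1) membership checks inside the loop.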
list_index_remove = []
set_index_remove = set()

for i in range(distance_matrix.shape[0]):
        # If i has already been marked for removal, skip it so that we do not
        # remove a chain of vectors. E.g. for vectors [3], [3.5], [4] and
        # threshold 0.75: without this check, [4] would also be dropped (it is
        # within 0.75 of [3.5]), although it is not a duplicate of the kept
        # vector [3].
if i not in set_index_remove:
            # The matrix is symmetric, so only the larger indexes j > i need
            # to be checked.
for j in range(i + 1, distance_matrix.shape[0]):
if distance_matrix[i][j] <= threshold:
list_index_remove.append(j)
set_index_remove.add(j)

    # Build a boolean mask that keeps every document not marked for removal.
    s_part_to_be_dropped = s.take(list_index_remove)
    drop_mask = ~s.index.isin(s_part_to_be_dropped.index)
    return s[drop_mask]
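
A minimal standalone sketch (not part of this PR's diff) of the chain-avoidance logic above, run on the toy vectors from the comment; it assumes only numpy and scikit-learn:

import numpy as np
from sklearn.metrics import pairwise_distances

vectors = np.array([[3.0], [3.5], [4.0]])
threshold = 0.75
distances = pairwise_distances(vectors)  # symmetric 3 x 3 distance matrix

to_remove = set()
for i in range(distances.shape[0]):
    if i in to_remove:
        continue  # never compare against an already-dropped vector
    for j in range(i + 1, distances.shape[0]):
        if distances[i][j] <= threshold:
            to_remove.add(j)

print(sorted(to_remove))  # [1]: only [3.5] is dropped; [4.0] survives because
                          # it is compared against the kept [3.0] alone.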