Add ClusterSeries to Hero Series Types #170

Open · wants to merge 17 commits into master

Changes from 1 commit
added MultiIndex DF support
support MultiIndex as function parameter

returns a MultiIndex DataFrame where a Representation Series was returned before

* missing: correct test


Co-authored-by: Henri Froese <hf2000510@gmail.com>
mk2510 and henrifroese committed Aug 18, 2020
commit fa342a92d4f007cebfce29f1f22a4e31fedc56c6
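
For orientation: the central data structure this commit introduces is a "Document Term DataFrame" — one row per document, with sparse MultiIndex columns of (function_name, term) pairs — replacing the old MultiIndexed Representation Series. A minimal sketch with made-up terms and values (plain pandas, no texthero required):

import pandas as pd

# Column MultiIndex: first level names the representation function,
# second level is the term itself.
columns = pd.MultiIndex.from_tuples(
    [("count", "Sentence"), ("count", "one"), ("count", "two")]
)
# One row per document; zero where a term does not occur in a document.
df = pd.DataFrame([[1, 1, 0], [1, 0, 1]], index=[0, 1], columns=columns)
print(df)
#   count
#   Sentence one two
# 0        1   1   0
# 1        1   0   1
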
18 changes: 3 additions & 15 deletions tests/test_indexes.py
@@ -56,21 +56,9 @@
]

test_cases_representation = [
[
"count",
lambda x: representation.flatten(representation.count(x)),
(s_tokenized_lists,),
],
[
"term_frequency",
lambda x: representation.flatten(representation.term_frequency(x)),
(s_tokenized_lists,),
],
[
"tfidf",
lambda x: representation.flatten(representation.tfidf(x)),
(s_tokenized_lists,),
],
["count", representation.count, (s_tokenized_lists,),],
["term_frequency", representation.term_frequency, (s_tokenized_lists,),],
["tfidf", representation.tfidf, (s_tokenized_lists,),],
["pca", representation.pca, (s_numeric_lists, 0)],
["nmf", representation.nmf, (s_numeric_lists,)],
["tsne", representation.tsne, (s_numeric_lists,)],
63 changes: 2 additions & 61 deletions tests/test_representation.py
@@ -50,16 +50,9 @@ def _tfidf(term, corpus, document_index):
[["Test", "Test", "TEST", "!"], ["Test", "?", ".", "."]], index=[5, 7]
)

s_tokenized_output_index = pd.MultiIndex.from_tuples(
[(0, "!"), (0, "TEST"), (0, "Test"), (1, "."), (1, "?"), (1, "Test")],
)

s_tokenized_output_noncontinuous_index = pd.MultiIndex.from_tuples(
[(5, "!"), (5, "TEST"), (5, "Test"), (7, "."), (7, "?"), (7, "Test")],
)

s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)
s_tokenized_output_index = [0, 1]

s_tokenized_output_index_noncontinous = [5, 7]

test_cases_vectorization = [
# format: [function_name, function, correct output for tokenized input above, dtype of output]
@@ -182,55 +175,3 @@ def test_tfidf_formula(self):
).astype("Sparse")

self.assertEqual(representation.tfidf(s), s_true)

"""
flatten.
"""

def test_flatten(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0]], index=["doc0", "doc1"],
)

pd.testing.assert_series_equal(
representation.flatten(s), s_true, check_names=False
)

def test_flatten_fill_missing_with(self):
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)

s_true = pd.Series(
[[3.0, "FILLED", np.nan], ["FILLED", 4.0, "FILLED"]],
index=["doc0", "doc1"],
)

pd.testing.assert_series_equal(
representation.flatten(s, fill_missing_with="FILLED"),
s_true,
check_names=False,
)

def test_flatten_missing_row(self):
# Simulating a row with no features, so it's completely missing from
# the representation series.
index = pd.MultiIndex.from_tuples(
[("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")],
)
s = pd.Series([3, np.nan, 4], index=index)

s_true = pd.Series(
[[3.0, 0.0, np.nan], [0.0, 4.0, 0.0], [0.0, 0.0, 0.0]],
index=["doc0", "doc1", "doc2"],
)

pd.testing.assert_series_equal(
representation.flatten(s, index=s_true.index), s_true, check_names=False
)
294 changes: 108 additions & 186 deletions texthero/representation.py
@@ -27,90 +27,14 @@
"""


def flatten(
s: Union[pd.Series, pd.Series.sparse],
index: pd.Index = None,
fill_missing_with: Any = 0.0,
) -> pd.Series:
"""
Transform a Pandas Representation Series to a "normal" (flattened) Pandas Series.
The given Series should have a multiindex with first level being the document
and second level being individual features of that document (e.g. tfidf scores per word).
The flattened Series has one cell per document, with the cell being a list of all
the individual features of that document.
Parameters
----------
s : Sparse Pandas Series or Pandas Series
The multiindexed Pandas Series to flatten.
index : Pandas Index, optional, default to None
The index the flattened Series should have.
fill_missing_with : Any, default to 0.0
Value to fill the NaNs (missing values) with. This _does not_ mean
that existing values that are np.nan are replaced, but rather that
features that are not present in one document but present in others
are filled with fill_missing_with. See example below.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> import numpy as np
>>> index = pd.MultiIndex.from_tuples([("doc0", "Word1"), ("doc0", "Word3"), ("doc1", "Word2")], names=['document', 'word'])
>>> s = pd.Series([3, np.nan, 4], index=index)
>>> s
document word
doc0 Word1 3.0
Word3 NaN
doc1 Word2 4.0
dtype: float64
>>> hero.flatten(s, fill_missing_with=0.0)
document
doc0 [3.0, 0.0, nan]
doc1 [0.0, 4.0, 0.0]
dtype: object
"""
s = s.unstack(fill_value=fill_missing_with)

if index is not None:
s = s.reindex(index, fill_value=fill_missing_with)
# Reindexing makes the documents for which no values
# are present in the Sparse Representation Series
# "reappear" correctly.

s = pd.Series(s.values.tolist(), index=s.index)

return s


def _check_is_valid_representation(s: pd.Series) -> bool:
def _check_is_valid_DocumentTermDF(df: Union[pd.DataFrame, pd.Series]) -> bool:
"""
Check if the given Pandas Series is a Document Representation Series.
Check if the given Pandas Series or DataFrame is a Document Term DF.
Returns true if Series is Document Representation Series, else False.
Returns True if the input is a Document Term DF, else False.
"""

# TODO: in Version 2 when only representation is accepted as input -> change "return False" to "raise ValueError"

if not isinstance(s.index, pd.MultiIndex):
return False
# raise ValueError(
# f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex"
# )

if s.index.nlevels != 2:
return False
# raise ValueError(
# f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2."
# )

return True
return isinstance(df, pd.DataFrame) and isinstance(df.columns, pd.MultiIndex)
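
A quick behavioral sketch of the rewritten check, with hypothetical inputs (assuming the function above is in scope):

import pandas as pd

cols = pd.MultiIndex.from_tuples([("count", "hi"), ("count", "bye")])
df = pd.DataFrame([[1, 0]], columns=cols)
_check_is_valid_DocumentTermDF(df)                   # True: DataFrame with MultiIndex columns
_check_is_valid_DocumentTermDF(pd.Series([1]))       # False: not a DataFrame
_check_is_valid_DocumentTermDF(pd.DataFrame([[1]]))  # False: flat (non-MultiIndex) columns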


# Warning message for not-tokenized inputs
@@ -132,22 +56,18 @@ def count(
min_df=1,
max_df=1.0,
binary=False,
) -> pd.Series:
) -> pd.DataFrame:
"""
Represent a text-based Pandas Series using count.
Return a Document Representation Series with the
Return a Document Term DataFrame with the
number of occurrences of a document's words for every
document.
TODO add tutorial link
The input Series should already be tokenized. If not, it will
be tokenized before count is calculated.
Use :meth:`hero.representation.flatten` on the output to get
a standard Pandas Series with the document vectors
in every cell.
Parameters
----------
s : Pandas Series (tokenized)
@@ -177,15 +97,14 @@ def count(
>>> import pandas as pd
>>> s = pd.Series(["Sentence one", "Sentence two"]).pipe(hero.tokenize)
>>> hero.count(s)
0 Sentence 1
one 1
1 Sentence 1
two 1
dtype: Sparse[int64, 0]
count
Sentence one two
0 1 1 0
1 1 0 1
See Also
--------
Document Representation Series: TODO add tutorial link
Document Term DataFrame: TODO add tutorial link
"""
# TODO. Can be rewritten without sklearn.

@@ -204,37 +123,30 @@ def count(
)

tf_vectors_csr = tf.fit_transform(s)
tf_vectors_coo = coo_matrix(tf_vectors_csr)

s_out = pd.Series.sparse.from_coo(tf_vectors_coo)

features_names = tf.get_feature_names()

# Map word index to word name
s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))
multiindexed_columns = pd.MultiIndex.from_tuples(
[("count", word) for word in tf.get_feature_names()]
)

return s_out
return pd.DataFrame.sparse.from_spmatrix(
tf_vectors_csr, s.index, multiindexed_columns
)
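
The new return path in one standalone snippet, mirroring the diff above: sklearn's CSR matrix feeds pd.DataFrame.sparse.from_spmatrix directly, so the intermediate COO conversion and index remapping are gone. A sketch assuming an already-tokenized Series:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

s = pd.Series([["Sentence", "one"], ["Sentence", "two"]])
# Identity tokenizer/preprocessor: the input is already tokenized.
tf = CountVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
tf_vectors_csr = tf.fit_transform(s)
columns = pd.MultiIndex.from_tuples(
    [("count", word) for word in tf.get_feature_names()]
)
# Sparse DataFrame with the original document index preserved.
df = pd.DataFrame.sparse.from_spmatrix(tf_vectors_csr, s.index, columns)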


def term_frequency(
s: pd.Series, max_features: Optional[int] = None, min_df=1, max_df=1.0,
) -> pd.Series:
) -> pd.DataFrame:
"""
Represent a text-based Pandas Series using term frequency.
Return a Document Representation Series with the
Return a Document Term DataFrame with the
term frequencies of the terms for every
document.
TODO add tutorial link
The input Series should already be tokenized. If not, it will
be tokenized before term_frequency is calculated.
Use :meth:`hero.representation.flatten` on the output to get
a standard Pandas Series with the document vectors
in every cell.
Parameters
----------
s : Pandas Series (tokenized)
@@ -261,16 +173,14 @@ def term_frequency(
>>> import pandas as pd
>>> s = pd.Series(["Sentence one hey", "Sentence two"]).pipe(hero.tokenize)
>>> hero.term_frequency(s)
0 Sentence 0.2
hey 0.2
one 0.2
1 Sentence 0.2
two 0.2
dtype: Sparse[float64, nan]
term_frequency
Sentence hey one two
0 0.2 0.2 0.2 0.0
1 0.2 0.0 0.0 0.2
See Also
--------
Document Representation Series: TODO add tutorial link
Document Term DataFrame: TODO add tutorial link
"""
# Check if input is tokenized. Else, print warning and tokenize.
if not isinstance(s.iloc[0], list):
@@ -291,17 +201,16 @@ def term_frequency(
total_count_coo = np.sum(tf_vectors_coo)
frequency_coo = np.divide(tf_vectors_coo, total_count_coo)

s_out = pd.Series.sparse.from_coo(frequency_coo)

features_names = tf.get_feature_names()

# Map word index to word name
s_out.index = s_out.index.map(lambda x: (s.index[x[0]], features_names[x[1]]))
multiindexed_columns = pd.MultiIndex.from_tuples(
[("term_frequency", word) for word in tf.get_feature_names()]
)

return s_out
return pd.DataFrame.sparse.from_spmatrix(
frequency_coo, s.index, multiindexed_columns
)


def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series:
def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.DataFrame:
"""
Represent a text-based Pandas Series using TF-IDF.
@@ -324,20 +233,13 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series:
so the result is exactly what you
get applying the formula described above.
Return a Document Representation Series with the
Return a Document Term DataFrame with the
tfidf of every word in the document.
TODO add tutorial link
The input Series should already be tokenized. If not, it will
be tokenized before tfidf is calculated.
If working with big pandas Series, you might want to limit
the number of features through the max_features parameter.
Use :meth:`hero.representation.flatten` on the output to get
a standard Pandas Series with the document vectors
in every cell.
Parameters
----------
s : Pandas Series (tokenized)
@@ -365,17 +267,16 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series:
>>> import pandas as pd
>>> s = pd.Series(["Hi Bye", "Test Bye Bye"]).pipe(hero.tokenize)
>>> hero.tfidf(s)
0 Bye 1.000000
Hi 1.405465
1 Bye 2.000000
Test 1.405465
dtype: Sparse[float64, nan]
tfidf
Bye Hi Test
0 1.0 1.405465 0.000000
1 2.0 0.000000 1.405465
See Also
--------
`TF-IDF on Wikipedia <https://en.wikipedia.org/wiki/Tf-idf>`_
Document Representation Series: TODO add tutorial link
Document Term DataFrame: TODO add tutorial link
"""

# Check if input is tokenized. Else, print warning and tokenize.
@@ -395,24 +296,23 @@ def tfidf(s: pd.Series, max_features=None, min_df=1, max_df=1.0,) -> pd.Series:

tfidf_vectors_csr = tfidf.fit_transform(s)

# Result from sklearn is in Compressed Sparse Row format.
# Pandas Sparse Series can only be initialized from Coordinate format.
tfidf_vectors_coo = coo_matrix(tfidf_vectors_csr)
s_out = pd.Series.sparse.from_coo(tfidf_vectors_coo)

# Map word index to word name and keep original index of documents.
feature_names = tfidf.get_feature_names()
s_out.index = s_out.index.map(lambda x: (s.index[x[0]], feature_names[x[1]]))
multiindexed_columns = pd.MultiIndex.from_tuples(
[("tfidf", word) for word in tfidf.get_feature_names()]
)

return s_out
return pd.DataFrame.sparse.from_spmatrix(
tfidf_vectors_csr, s.index, multiindexed_columns
)


"""
Dimensionality reduction
"""


def pca(s, n_components=2, random_state=None) -> pd.Series:
def pca(
s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None
) -> pd.Series:
"""
Perform principal component analysis on the given Pandas Series.
@@ -434,7 +334,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
Parameters
----------
s : Pandas Series
s : Pandas Series or Pandas MultiIndex Sparse DataFrame
n_components : Int. Default is 2.
Number of components to keep (dimensionality of output vectors).
@@ -468,10 +368,18 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
"""
pca = PCA(n_components=n_components, random_state=random_state, copy=False)
return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index)

if _check_is_valid_DocumentTermDF(s):
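# scikit-learn's PCA requires dense input; .values densifies the sparse DataFrame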
values = s.values
else:
values = list(s)

return pd.Series(pca.fit_transform(values).tolist(), index=s.index)


def nmf(s, n_components=2, random_state=None) -> pd.Series:
def nmf(
s: Union[pd.Series, pd.DataFrame], n_components=2, random_state=None
) -> pd.Series:
"""
Performs non-negative matrix factorization.
@@ -491,7 +399,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
Parameters
----------
s : Pandas Series
s : Pandas Series or Pandas MultiIndex Sparse DataFrame
n_components : Int. Default is 2.
Number of components to keep (dimensionality of output vectors).
@@ -527,11 +435,17 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
"""
nmf = NMF(n_components=n_components, init="random", random_state=random_state,)
return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index)

if _check_is_valid_DocumentTermDF(s):
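# scikit-learn's NMF accepts scipy sparse input, so the data stays sparse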
values = s.sparse.to_coo()
else:
values = list(s)

return pd.Series(nmf.fit_transform(values).tolist(), index=s.index)


def tsne(
s: pd.Series,
s: Union[pd.Series, pd.DataFrame],
n_components=2,
perplexity=30.0,
learning_rate=200.0,
@@ -557,7 +471,7 @@ def tsne(
Parameters
----------
s : Pandas Series
s : Pandas Series or Pandas MultiIndex Sparse DataFrame
n_components : int, default is 2.
Number of components to keep (dimensionality of output vectors).
@@ -619,7 +533,13 @@ def tsne(
random_state=random_state,
n_jobs=n_jobs,
)
return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index)

if _check_is_valid_DocumentTermDF(s):
values = s.sparse.to_coo()
else:
values = list(s)

return pd.Series(tsne.fit_transform(values).tolist(), index=s.index)


"""
@@ -628,7 +548,7 @@ def tsne(


def kmeans(
s: pd.Series,
s: Union[pd.Series, pd.DataFrame],
n_clusters=5,
n_init=10,
max_iter=300,
@@ -653,7 +573,7 @@ def kmeans(
Parameters
----------
s: Pandas Series
s: Pandas Series or Pandas MultiIndex Sparse DataFrame
n_clusters: Int, default to 5.
The number of clusters to separate the data into.
@@ -686,7 +606,7 @@ def kmeans(
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"])
>>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten
>>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency)
>>> hero.kmeans(s, n_clusters=2, random_state=42)
0 1
1 0
@@ -702,7 +622,12 @@ def kmeans(
`kmeans on Wikipedia <https://en.wikipedia.org/wiki/K-means_clustering>`_
"""
vectors = list(s)

if _check_is_valid_DocumentTermDF(s):
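# scikit-learn's KMeans accepts scipy sparse input, so the data stays sparse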
vectors = s.sparse.to_coo()
else:
vectors = list(s)

kmeans = KMeans(
n_clusters=n_clusters,
n_init=n_init,
@@ -715,7 +640,7 @@ def kmeans(


def dbscan(
s,
s: Union[pd.Series, pd.DataFrame],
eps=0.5,
min_samples=5,
metric="euclidean",
@@ -743,7 +668,7 @@ def dbscan(
Parameters
----------
s: Pandas Series
s: Pandas Series or Pandas MultiIndex Sparse DataFrame
eps : float, default=0.5
The maximum distance between two samples for one to be considered
@@ -783,7 +708,7 @@ def dbscan(
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, enjoy, guitar"])
>>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten) # TODO: when others get Representation Support: remove flatten
>>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.tfidf)
>>> hero.dbscan(s, min_samples=1, eps=4)
0 0
1 1
@@ -801,6 +726,11 @@ def dbscan(
"""

if _check_is_valid_DocumentTermDF(s):
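# scikit-learn's DBSCAN accepts scipy sparse input, so the data stays sparse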
vectors = s.sparse.to_coo()
else:
vectors = list(s)

return pd.Series(
DBSCAN(
eps=eps,
@@ -809,13 +739,13 @@ def dbscan(
metric_params=metric_params,
leaf_size=leaf_size,
n_jobs=n_jobs,
).fit_predict(list(s)),
).fit_predict(vectors),
index=s.index,
).astype("category")


def meanshift(
s,
s: Union[pd.Series, pd.DataFrame],
bandwidth=None,
bin_seeding=False,
min_bin_freq=1,
@@ -843,7 +773,7 @@ def meanshift(
Parameters
----------
s: Pandas Series
s: Pandas Series or Pandas MultiIndex Sparse DataFrame
bandwidth : float, default=None
Bandwidth used in the RBF kernel.
@@ -901,6 +831,11 @@ def meanshift(
"""

if _check_is_valid_DocumentTermDF(s):
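# scikit-learn's MeanShift does not accept sparse input; .values densifies the DataFrame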
vectors = s.values
else:
vectors = list(s)

return pd.Series(
MeanShift(
bandwidth=bandwidth,
@@ -909,7 +844,7 @@ def meanshift(
cluster_all=cluster_all,
n_jobs=n_jobs,
max_iter=max_iter,
).fit_predict(list(s)),
).fit_predict(vectors),
index=s.index,
).astype("category")

@@ -962,31 +897,18 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
`Norm on Wikipedia <https://en.wikipedia.org/wiki/Norm_(mathematics)>`_
"""
isDocumentTermDF = _check_is_valid_DocumentTermDF(s)

is_valid_representation = (
isinstance(s.index, pd.MultiIndex) and s.index.nlevels == 2
)

if not is_valid_representation:
raise TypeError(
"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex. The given Pandas Series does not appears to have MultiIndex"
)
# TODO after merging representation: use _check_is_valid_representation instead

if pd.api.types.is_sparse(s):
s_coo_matrix = s.sparse.to_coo()[0]
if isDocumentTermDF:
s_for_vectorization = s.sparse.to_coo()
else:
s = s.astype("Sparse")
s_coo_matrix = s.sparse.to_coo()[0]

s_for_vectorization = s_coo_matrix
s_for_vectorization = list(s)

result = sklearn_normalize(
s_for_vectorization, norm=norm
) # Can handle sparse input.

result_coo = coo_matrix(result)
s_result = pd.Series.sparse.from_coo(result_coo)
s_result.index = s.index

return s_result
if isDocumentTermDF:
return pd.DataFrame.sparse.from_spmatrix(result, s.index, s.columns)
else:
return pd.Series(result.tolist(), index=s.index)
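
A short usage sketch of the reworked normalize, with hypothetical data, showing the dual input/output behavior introduced in this diff:

import pandas as pd
from scipy.sparse import coo_matrix
from texthero import representation

# Document Term DataFrame in -> sparse DataFrame out, same MultiIndex columns.
cols = pd.MultiIndex.from_tuples([("tfidf", "a"), ("tfidf", "b")])
df = pd.DataFrame.sparse.from_spmatrix(coo_matrix([[3.0, 4.0]]), [0], cols)
representation.normalize(df)   # row is L2-normalized to [0.6, 0.8]

# Plain Series of document vectors in -> Series out.
s = pd.Series([[3.0, 4.0]])
representation.normalize(s)    # 0    [0.6, 0.8]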
4 changes: 2 additions & 2 deletions texthero/visualization.py
@@ -63,8 +63,8 @@ def scatterplot(
>>> import pandas as pd
>>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"])
>>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize)
>>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten
>>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten
>>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3)
>>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.kmeans, n_clusters=2)
>>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP
"""