# Test show_dataframe.
#
# Smoke test: asking for the HTML instead of a live display must
# hand back something other than None for a small mixed-type frame.
def test_show_dataframe(self):
    frame = pd.DataFrame([["Test", 0.5], ["ja", 0.3]])
    rendered = visualization.show_dataframe(frame, return_HTML=True)
    self.assertIsNotNone(rendered)
def show_dataframe(
    df: pd.DataFrame, notebook=True, ip="127.0.0.1", port=8888, return_HTML=False
):
    """
    Visualize a Pandas DataFrame.

    To embed the visualization inside
    a Jupyter Notebook (e.g. Google Colab, Kaggle),
    set `notebook=True` (default). To visualize
    in a separate browser window, set it to
    False.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to visualize.

    notebook : bool, default to True
        Whether to visualize inside the
        current Jupyter Notebook or in
        a separate browser window.

    ip : string, default = '127.0.0.1'
        The ip address used for the local server.
        Ignored when notebook is set to True.

    port : int, default = 8888
        The port number to use for the local server.
        If already in use,
        a nearby open port will be found.
        Ignored when notebook is set to True.

    return_HTML : bool, default to False
        Whether to return the generated HTML
        instead of visualizing it. Takes precedence
        over `notebook`.

    Returns
    -------
    The result of ``visualization_server.data_to_html(df)`` when
    `return_HTML` is True. When `notebook` is True, the result of
    ``IPython.display.display``, or None if IPython cannot be
    imported. None when the browser view is used.

    Warns
    -----
    RuntimeWarning
        If `notebook` is True but no IPython session is detected,
        or IPython is not installed.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") # doctest: +SKIP
    >>> hero.show_dataframe(df) # doctest: +SKIP

    """

    if return_HTML:
        return visualization_server.data_to_html(df)

    if not notebook:
        # The return value of the browser helper is deliberately not
        # propagated, matching the previous behaviour of this branch.
        _display_df_browser(
            df, ip=ip, port=port,
        )
        return None

    not_in_notebook_message = (
        "You do not appear to be inside"
        " a Jupyter Notebook. Set"
        " notebook=False to show the visualization."
        " If you can already see the visualization, "
        " ignore this warning."
    )

    # Import IPython on its own, before probing for a running session.
    # Previously the import sat behind the `__IPYTHON__` probe inside one
    # try block, so outside IPython the name was never bound and the
    # display call below crashed with UnboundLocalError after the warning.
    try:
        import IPython.display
    except ImportError:
        warnings.warn(not_in_notebook_message, RuntimeWarning)
        return None

    # Best-effort notebook detection (not a safe check): `__IPYTHON__` is
    # expected to be defined only inside an IPython session. Only NameError
    # is caught so that KeyboardInterrupt/SystemExit are never swallowed.
    try:
        __IPYTHON__
    except NameError:
        warnings.warn(not_in_notebook_message, RuntimeWarning)

    # Still attempt to display: the check above can produce false alarms,
    # as the warning text itself tells the user.
    return IPython.display.display(
        IPython.display.HTML(visualization_server.data_to_html(df))
    )
+ + + + + +