From 53632f07f1568e2813e02e6148cdce3565338128 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 4 Sep 2020 09:51:55 +0200 Subject: [PATCH 01/12] Fix travis version. See Issue #171 --- .travis.yml | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f913f183..c76284b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black + - pip3 install black==19.10b0 - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index d6103b02..3f86e7f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. [options.extras_require] dev = - black>=19.10b0 + black==19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 From b329fb3b8821830ee1d8642e3b92fc7cfb8d22d7 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 4 Sep 2020 09:53:29 +0200 Subject: [PATCH 02/12] roll back accidental push to master --- .travis.yml | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index c76284b3..f913f183 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black==19.10b0 + - pip3 install black - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index 3f86e7f3..d6103b02 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ install_requires = # TODO pick the correct version. [options.extras_require] dev = - black==19.10b0 + black>=19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 From 3ec7ee8e0e5976fb9e0a2d0e4f516adbef8458fb Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 19:17:53 +0200 Subject: [PATCH 03/12] Implement show_dataframe. Co-authored-by: Maximilian Krahn --- .travis.yml | 2 +- setup.cfg | 4 +- tests/test_visualization.py | 8 ++ texthero/__init__.py | 3 + texthero/visualization.py | 77 +++++++++++- texthero/visualization_server/__init__.py | 9 ++ texthero/visualization_server/_display.py | 135 ++++++++++++++++++++++ texthero/visualization_server/_server.py | 93 +++++++++++++++ 8 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 texthero/visualization_server/__init__.py create mode 100644 texthero/visualization_server/_display.py create mode 100644 texthero/visualization_server/_server.py diff --git a/.travis.yml b/.travis.yml index f913f183..c76284b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ jobs: env: PATH=/c/Python38:/c/Python38/Scripts:$PATH install: - pip3 install --upgrade pip # all three OSes agree about 'pip3' - - pip3 install black + - pip3 install black==19.10b0 - pip3 install ".[dev]" . # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only diff --git a/setup.cfg b/setup.cfg index d6103b02..1ec34f69 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,10 +38,12 @@ install_requires = unidecode>=1.1.1 gensim>=3.6.0 matplotlib>=3.1.0 + jinja2>=2.11.1 + # TODO pick the correct version. [options.extras_require] dev = - black>=19.10b0 + black==19.10b0 pytest>=4.0.0 Sphinx>=3.0.3 sphinx-markdown-builder>=0.5.4 diff --git a/tests/test_visualization.py b/tests/test_visualization.py index d0075389..77150744 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -79,3 +79,11 @@ def test_top_words_digits_punctuation(self): def test_wordcloud(self): s = pd.Series("one two three") self.assertEqual(visualization.wordcloud(s), None) + + """ + Test show_dataframe. + """ + + def test_show_dataframe(self): + df = pd.DataFrame([["Test", 0.5], ["ja", 0.3]]) + self.assertIsNotNone(visualization.show_dataframe(df, return_HTML=True)) diff --git a/texthero/__init__.py b/texthero/__init__.py index 66e891e9..d5af94f0 100644 --- a/texthero/__init__.py +++ b/texthero/__init__.py @@ -16,3 +16,6 @@ from .nlp import * from . import stopwords + +from . import visualization_server +from .visualization_server import * diff --git a/texthero/visualization.py b/texthero/visualization.py index 73cc57f3..530dbdd7 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -2,20 +2,24 @@ Visualize insights and statistics of a text-based Pandas DataFrame. """ +import os import pandas as pd import numpy as np import plotly.express as px +import warnings from wordcloud import WordCloud from texthero import preprocessing from texthero._types import TextSeries, InputSeries -import string +from texthero.visualization_server import _display_df_notebook, _display_df_browser +from texthero import visualization_server from matplotlib.colors import LinearSegmentedColormap as lsg import matplotlib.pyplot as plt from collections import Counter +import string def scatterplot( @@ -306,3 +310,74 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series: .explode() # one word for each line .value_counts(normalize=normalize) ) + + +def show_dataframe( + df: pd.DataFrame, notebook=True, ip="127.0.0.1", port=8888, return_HTML=False +): + """ + Visualize a Pandas DataFrame. + + To embed the visualization inside + a Jupyter Notebook (e.g. Google Colab, Kaggle), + set `notebook=True` (default). To visualize + in a separate browser window, set it to + False. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to visualize. + + notebook : bool, default to True + Whether to visualize inside the + current Jupyter Notebook or in + a separate browser window. + + ip : string, default = '127.0.0.1' + The ip address used for the local server. + Ignored when notebook is set to True. + + port : int, default = 8888 + The port number to use for the local server. + If already in use, + a nearby open port will be found. + Ignored when notebook is set to True. + + return_HTML : bool, default to False + Whether to return the generated HTML + instead of visualizing it. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") # doctest: +SKIP + >>> hero.show_dataframe(df) # doctest: +SKIP + + """ + + if return_HTML: + return visualization_server.data_to_html(df) + + if notebook: + # Try to check whether the user is in a notebook. + # (Not a safe check.) + try: + __IPYTHON__ + except: + warnings.warn( + "You do not appear do be inside" + " a Jupyter Notebook. Set" + " notebook=False to show the visualization." + " If you can already see the visualization, " + " ignore this warning.", + RuntimeWarning, + ) + + _display_df_notebook(df) + + else: + _display_df_browser( + df, ip=ip, port=port, + ) diff --git a/texthero/visualization_server/__init__.py b/texthero/visualization_server/__init__.py new file mode 100644 index 00000000..f9dbbc61 --- /dev/null +++ b/texthero/visualization_server/__init__.py @@ -0,0 +1,9 @@ +""" +Submodule for our more complex visualizations that +run interactively. +""" + +from ._display import * +from ._display import _display_df_browser, _display_df_notebook + +from ._server import * diff --git a/texthero/visualization_server/_display.py b/texthero/visualization_server/_display.py new file mode 100644 index 00000000..f00a02a0 --- /dev/null +++ b/texthero/visualization_server/_display.py @@ -0,0 +1,135 @@ +""" +Module to display our visualizations interactively +inside a Notebook / Browser. + +This file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_display.py +Copyright (c) 2013, Jake Vanderplas. +It was adapted for pyLDAvis by Ben Mabey. +It was then adapted for Texthero. +""" + +import json +import jinja2 +from ._server import serve + + +# Our HTML template. We use jinja2 +# to programmatically insert the +# data we want to visualize +# in the function data_to_html +# below. +HTML_TEMPLATE = jinja2.Template( + r""" + + + + + + + + +
+
+
+
+ +
+
+
+
+ + + + + + + +""" +) + + +def data_to_html(df): + """ + Output HTML with embedded visualization + of the DataFrame df. + + """ + template = HTML_TEMPLATE + + # Create JSON from DataFrame with correct classes/ID for visualization. + df_json = json.dumps( + df.to_html( + classes='table table-hover" id = "tableID', + index=False, + justify="left", + border=0, + ) + ) + + return template.render(df_json=df_json) + + +def _display_df_notebook(df): + """ + Display visualization of DataFrame `df` + in IPython notebook via the HTML display hook. + + Returns the IPython HTML rich display of the visualization. + + """ + # import here, in case users don't have requirements installed + try: + from IPython.display import HTML + except: + raise ValueError( + "You do not appear do be inside" + " a Jupyter Notebook. Set" + " notebook=False to show the visualization." + ) + + html = data_to_html(df) + + return HTML(html) + + +def _display_df_browser( + df, ip="127.0.0.1", port=8888, +): + """ + Display visualization of DataFrame `df` + in local browser. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to visualize. + + ip : string, default = '127.0.0.1' + The ip address used for the local server + + port : int, default = 8888 + The port number to use for the local server. + If already in use, + a nearby open port will be found. + + """ + + html = data_to_html(df) + + serve( + html, ip=ip, port=port, + ) diff --git a/texthero/visualization_server/_server.py b/texthero/visualization_server/_server.py new file mode 100644 index 00000000..f27e9208 --- /dev/null +++ b/texthero/visualization_server/_server.py @@ -0,0 +1,93 @@ +# this file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_server.py +# Copyright (c) 2013, Jake Vanderplas +""" +Simple server used to serve our visualizations in a web browser. +""" +from http import server +import sys +import threading +import webbrowser +import socket + + +def generate_handler(html): + """ + Generate handler that only + serves our generated html. + """ + + class MyHandler(server.BaseHTTPRequestHandler): + def do_GET(self): + """Respond to a GET request.""" + if self.path == "/": + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + self.wfile.write(html.encode()) + else: + self.send_error(404) + + return MyHandler + + +def find_open_port(ip, port, n=50): + """ + Find an open port near the specified port. + """ + + ports = [port + i for i in range(n)] + + for port in ports: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + result = s.connect_ex((ip, port)) + s.close() + if result != 0: + return port + + raise ValueError("no open ports found") + + +def serve( + html, ip="127.0.0.1", port=8888, open_browser=True, +): + """ + Start a server serving the given HTML, and (optionally) open a + browser. + + Parameters + ---------- + html : string + HTML to serve + + ip : string (default = '127.0.0.1') + ip address at which the HTML will be served. + + port : int (default = 8888) + the port at which to serve the HTML + + open_browser : bool (optional) + if True (default), then open a web browser to the given HTML + """ + + port = find_open_port(ip, port, n=50) + Handler = generate_handler(html) + + srvr = server.HTTPServer((ip, port), Handler) + + # Start the server + print("Serving to http://{0}:{1}/ [Ctrl-C to exit]".format(ip, port)) + sys.stdout.flush() + + if open_browser: + # Use a thread to open a web browser pointing to the server + def b(): + return webbrowser.open("http://{0}:{1}".format(ip, port)) + + threading.Thread(target=b).start() + + try: + srvr.serve_forever() + except (KeyboardInterrupt, SystemExit): + print("\nStopping Server...") + + srvr.server_close() From 4aa066e34df91d71e12298fce9391a30a1e4c832 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 19:38:55 +0200 Subject: [PATCH 04/12] fix HTML-return --- texthero/visualization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 530dbdd7..a697fc40 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -375,7 +375,7 @@ def show_dataframe( RuntimeWarning, ) - _display_df_notebook(df) + return _display_df_notebook(df) else: _display_df_browser( From a78e991a8e4a7b91d7d07bfdae2f66cc6496dd02 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 19:41:22 +0200 Subject: [PATCH 05/12] try again to fix HTML return --- texthero/visualization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index a697fc40..e417da99 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -365,6 +365,7 @@ def show_dataframe( # (Not a safe check.) try: __IPYTHON__ + from IPython.display import HTML except: warnings.warn( "You do not appear do be inside" @@ -375,7 +376,7 @@ def show_dataframe( RuntimeWarning, ) - return _display_df_notebook(df) + return HTML(_display_df_notebook(df)) else: _display_df_browser( From d6671b75681909f1452bb99887ca09d1e31cc938 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 19:49:16 +0200 Subject: [PATCH 06/12] try another fix --- texthero/visualization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index e417da99..759e62ee 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -376,7 +376,9 @@ def show_dataframe( RuntimeWarning, ) - return HTML(_display_df_notebook(df)) + return HTML( + visualization_server.data_to_html(df) + ) else: _display_df_browser( From 7fc5f7d626056910798e44c22fd698617234b47c Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 20:13:16 +0200 Subject: [PATCH 07/12] another approach --- texthero/visualization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 759e62ee..70f600e1 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -365,7 +365,7 @@ def show_dataframe( # (Not a safe check.) try: __IPYTHON__ - from IPython.display import HTML + import IPython except: warnings.warn( "You do not appear do be inside" @@ -376,8 +376,8 @@ def show_dataframe( RuntimeWarning, ) - return HTML( - visualization_server.data_to_html(df) + return IPython.display.display_html( + visualization_server.data_to_html(df), raw=True ) else: From db0c62fd5b1121050d1cc5f31a484ad3356228c3 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 20:23:39 +0200 Subject: [PATCH 08/12] - --- texthero/visualization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 70f600e1..5d922378 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -376,9 +376,9 @@ def show_dataframe( RuntimeWarning, ) - return IPython.display.display_html( + return IPython.display.display(IPython.display.HTML( visualization_server.data_to_html(df), raw=True - ) + )) else: _display_df_browser( From a43d76e5c9b29d5d0acb25287b1b229708588be0 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 20:23:50 +0200 Subject: [PATCH 09/12] -- --- texthero/visualization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 5d922378..3b3052ac 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -376,9 +376,9 @@ def show_dataframe( RuntimeWarning, ) - return IPython.display.display(IPython.display.HTML( - visualization_server.data_to_html(df), raw=True - )) + return IPython.display.display( + IPython.display.HTML(visualization_server.data_to_html(df), raw=True) + ) else: _display_df_browser( From 27dd0811e2be6d066ca8cef2129df3b7477a3af9 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 20:28:44 +0200 Subject: [PATCH 10/12] another try --- texthero/visualization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index 3b3052ac..cc438259 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -377,7 +377,7 @@ def show_dataframe( ) return IPython.display.display( - IPython.display.HTML(visualization_server.data_to_html(df), raw=True) + IPython.display.HTML(visualization_server.data_to_html(df)) ) else: From 2fc46a04a4066a289f027b60f92aaca2e377541f Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 20:32:24 +0200 Subject: [PATCH 11/12] final fix for colab --- texthero/visualization_server/_display.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/texthero/visualization_server/_display.py b/texthero/visualization_server/_display.py index f00a02a0..51c80604 100644 --- a/texthero/visualization_server/_display.py +++ b/texthero/visualization_server/_display.py @@ -83,29 +83,6 @@ def data_to_html(df): return template.render(df_json=df_json) -def _display_df_notebook(df): - """ - Display visualization of DataFrame `df` - in IPython notebook via the HTML display hook. - - Returns the IPython HTML rich display of the visualization. - - """ - # import here, in case users don't have requirements installed - try: - from IPython.display import HTML - except: - raise ValueError( - "You do not appear do be inside" - " a Jupyter Notebook. Set" - " notebook=False to show the visualization." - ) - - html = data_to_html(df) - - return HTML(html) - - def _display_df_browser( df, ip="127.0.0.1", port=8888, ): From 27d78a5e73624290fa24a57acb2b891014e280ae Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 6 Sep 2020 20:41:54 +0200 Subject: [PATCH 12/12] remove import for deleted module --- texthero/visualization.py | 2 +- texthero/visualization_server/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/texthero/visualization.py b/texthero/visualization.py index cc438259..9abb02c6 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -12,7 +12,7 @@ from texthero import preprocessing from texthero._types import TextSeries, InputSeries -from texthero.visualization_server import _display_df_notebook, _display_df_browser +from texthero.visualization_server import _display_df_browser from texthero import visualization_server from matplotlib.colors import LinearSegmentedColormap as lsg diff --git a/texthero/visualization_server/__init__.py b/texthero/visualization_server/__init__.py index f9dbbc61..66be52c4 100644 --- a/texthero/visualization_server/__init__.py +++ b/texthero/visualization_server/__init__.py @@ -4,6 +4,6 @@ """ from ._display import * -from ._display import _display_df_browser, _display_df_notebook +from ._display import _display_df_browser from ._server import *