Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement show_dataframe #177

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
env: PATH=/c/Python38:/c/Python38/Scripts:$PATH
install:
- pip3 install --upgrade pip # all three OSes agree about 'pip3'
- pip3 install black
- pip3 install black==19.10b0
- pip3 install ".[dev]" .
# 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows
# 'python3' is a 'command not found' error on Windows but 'py' works on Windows only
Expand Down
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,12 @@ install_requires =
unidecode>=1.1.1
gensim>=3.6.0
matplotlib>=3.1.0
jinja2>=2.11.1

# TODO pick the correct version.
[options.extras_require]
dev =
black>=19.10b0
black==19.10b0
pytest>=4.0.0
Sphinx>=3.0.3
sphinx-markdown-builder>=0.5.4
Expand Down
8 changes: 8 additions & 0 deletions tests/test_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,11 @@ def test_top_words_digits_punctuation(self):
def test_wordcloud(self):
    """wordcloud is expected to return None for a one-element Series."""
    text = pd.Series("one two three")
    result = visualization.wordcloud(text)
    self.assertEqual(result, None)

"""
Test show_dataframe.
"""

def test_show_dataframe(self):
    """show_dataframe with return_HTML=True must return a non-None value."""
    frame = pd.DataFrame([["Test", 0.5], ["ja", 0.3]])
    rendered = visualization.show_dataframe(frame, return_HTML=True)
    self.assertIsNotNone(rendered)
3 changes: 3 additions & 0 deletions texthero/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@
from .nlp import *

from . import stopwords

from . import visualization_server
from .visualization_server import *
80 changes: 79 additions & 1 deletion texthero/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,24 @@
Visualize insights and statistics of a text-based Pandas DataFrame.
"""

import os
import pandas as pd
import numpy as np
import plotly.express as px
import warnings

from wordcloud import WordCloud

from texthero import preprocessing
from texthero._types import TextSeries, InputSeries
import string
from texthero.visualization_server import _display_df_browser
from texthero import visualization_server

from matplotlib.colors import LinearSegmentedColormap as lsg
import matplotlib.pyplot as plt

from collections import Counter
import string


def scatterplot(
Expand Down Expand Up @@ -306,3 +310,77 @@ def top_words(s: TextSeries, normalize=False) -> pd.Series:
.explode() # one word for each line
.value_counts(normalize=normalize)
)


def show_dataframe(
    df: pd.DataFrame, notebook=True, ip="127.0.0.1", port=8888, return_HTML=False
):
    """
    Visualize a Pandas DataFrame.

    To embed the visualization inside
    a Jupyter Notebook (e.g. Google Colab, Kaggle),
    set `notebook=True` (default). To visualize
    in a separate browser window, set it to
    False.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to visualize.

    notebook : bool, default to True
        Whether to visualize inside the
        current Jupyter Notebook or in
        a separate browser window.

    ip : string, default = '127.0.0.1'
        The ip address used for the local server.
        Ignored when notebook is set to True.

    port : int, default = 8888
        The port number to use for the local server.
        If already in use,
        a nearby open port will be found.
        Ignored when notebook is set to True.

    return_HTML : bool, default to False
        Whether to return the generated HTML
        instead of visualizing it.

    Returns
    -------
    The generated HTML string when `return_HTML` is True.
    Otherwise the result of `IPython.display.display` in
    notebook mode, or None (also None when IPython cannot
    be imported or when serving in a browser).

    Warns
    -----
    RuntimeWarning
        When `notebook=True` but no IPython session could be
        detected, or IPython is not importable.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> df = pd.read_csv("https://raw.githubusercontent.com/jbesomi/texthero/master/dataset/bbcsport.csv") # doctest: +SKIP
    >>> hero.show_dataframe(df) # doctest: +SKIP

    """

    if return_HTML:
        return visualization_server.data_to_html(df)

    if not notebook:
        _display_df_browser(
            df, ip=ip, port=port,
        )
        return None

    # Import IPython independently of the session check below, so that a
    # failed check can never leave the display helpers undefined.
    try:
        from IPython.display import HTML, display
    except ImportError:
        HTML = display = None

    # Try to check whether the user is in a notebook.
    # (Not a safe check: `__IPYTHON__` is only defined inside IPython
    # sessions, which is a heuristic and not a guarantee.)
    try:
        __IPYTHON__
        in_ipython = True
    except NameError:
        in_ipython = False

    if display is None or not in_ipython:
        warnings.warn(
            "You do not appear to be inside"
            " a Jupyter Notebook. Set"
            " notebook=False to show the visualization."
            " If you can already see the visualization,"
            " ignore this warning.",
            RuntimeWarning,
        )

    if display is None:
        # Nothing can be rendered without IPython; the warning above
        # already tells the user how to proceed.
        return None

    # The check above is only a heuristic, so still attempt to display.
    return display(HTML(visualization_server.data_to_html(df)))
9 changes: 9 additions & 0 deletions texthero/visualization_server/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Submodule for our more complex visualizations that
run interactively.
"""

from ._display import *
from ._display import _display_df_browser

from ._server import *
112 changes: 112 additions & 0 deletions texthero/visualization_server/_display.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
Module to display our visualizations interactively
inside a Notebook / Browser.

This file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_display.py
Copyright (c) 2013, Jake Vanderplas.
It was adapted for pyLDAvis by Ben Mabey.
It was then adapted for Texthero.
"""

import json
import jinja2
from ._server import serve


# Our HTML template. jinja2 substitutes `df_json` (a JSON-encoded
# string holding the DataFrame's HTML table, produced in
# data_to_html below) into the script section. The script injects
# that table into #tablediv and turns the element with id "tableID"
# into an interactive DataTable.
HTML_TEMPLATE = jinja2.Template(
    r"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
<link href="https://cdn.datatables.net/1.10.21/css/jquery.dataTables.min.css" rel="stylesheet">
</head>

<body>
<div class="container">
<div class="header">
<h5 class="text-muted"></h5>
</div>

<div>
<div id="tablediv"></div>
</div>
</div>
<script src="https://code.jquery.com/jquery-3.5.1.js" type="text/javascript"></script>
<script src="https://cdn.datatables.net/1.10.21/js/jquery.dataTables.min.js" type="text/javascript"></script>
<script src="https://cdn.datatables.net/plug-ins/1.10.21/dataRender/ellipsis.js" type="text/javascript"></script>
<script type="text/javascript">



$(document).ready(function () {
$("#tablediv").html({{ df_json }});
var table = $("#tableID").DataTable({
columnDefs: [ {
targets: 0,
render: $.fn.dataTable.render.ellipsis(260, true, true)
} ]
});
});

</script>
</body>

</html>
"""
)


def data_to_html(df):
    """
    Output HTML with embedded visualization
    of the DataFrame df.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to render.

    Returns
    -------
    str
        The rendered HTML_TEMPLATE with the DataFrame's
        HTML table embedded as a JSON string literal.

    """
    # Render the table with the Bootstrap classes and the element id
    # ("tableID") that the template's script looks up. `table_id` is the
    # supported way to set the id; it replaces injecting the attribute
    # through `classes` with a stray quote.
    table_html = df.to_html(
        classes="table table-hover",
        table_id="tableID",
        index=False,
        justify="left",
        border=0,
    )

    # JSON-encode so the markup becomes a valid JavaScript string literal
    # inside the template's <script> block.
    df_json = json.dumps(table_html)

    return HTML_TEMPLATE.render(df_json=df_json)


def _display_df_browser(
    df, ip="127.0.0.1", port=8888,
):
    """
    Render DataFrame `df` to HTML and hand it to
    the local server so it shows up in a browser.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to visualize.

    ip : string, default = '127.0.0.1'
        The ip address used for the local server

    port : int, default = 8888
        The port number to use for the local server.
        If already in use,
        a nearby open port will be found.

    """
    # Build the page first, then pass it to the server helper.
    serve(data_to_html(df), ip=ip, port=port)
93 changes: 93 additions & 0 deletions texthero/visualization_server/_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# this file is largely based on https://github.com/jakevdp/mpld3/blob/master/mpld3/_server.py
# Copyright (c) 2013, Jake Vanderplas
"""
Simple server used to serve our visualizations in a web browser.
"""
from http import server
import sys
import threading
import webbrowser
import socket


def generate_handler(html):
    """
    Generate handler that only
    serves our generated html.

    Parameters
    ----------
    html : string
        The HTML document to serve at the root path "/".

    Returns
    -------
    A `BaseHTTPRequestHandler` subclass that answers GET "/"
    with the document and every other path with a 404 error.
    """
    # Encode once up front instead of on every request; the charset sent
    # in the header below must match this encoding.
    payload = html.encode("utf-8")

    class MyHandler(server.BaseHTTPRequestHandler):
        def do_GET(self):
            """Respond to a GET request."""
            if self.path != "/":
                self.send_error(404)
                return

            self.send_response(200)
            # Declare the charset explicitly so non-ASCII text in the
            # DataFrame is decoded correctly by the browser.
            self.send_header("Content-type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)

    return MyHandler


def find_open_port(ip, port, n=50):
    """
    Find an open port near the specified port.

    Probes `port`, `port + 1`, ..., `port + n - 1` in order and returns
    the first one on which a TCP connection attempt to `ip` fails, which
    is taken to mean nothing is listening there.

    Parameters
    ----------
    ip : string
        The ip address to probe.

    port : int
        The first port number to try.

    n : int, default = 50
        How many consecutive ports to try.

    Raises
    ------
    ValueError
        If every probed port accepted the connection (or `n` is 0).
    """
    for candidate in range(port, port + n):
        # The context manager closes the probe socket even if
        # connect_ex raises (e.g. on an unresolvable address).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            if probe.connect_ex((ip, candidate)) != 0:
                return candidate

    raise ValueError("no open ports found")


def serve(
    html, ip="127.0.0.1", port=8888, open_browser=True,
):
    """
    Start a server serving the given HTML, and (optionally) open a
    browser.

    Blocks until the server is interrupted (Ctrl-C / SystemExit).
    The listening socket is always closed on the way out.

    Parameters
    ----------
    html : string
        HTML to serve

    ip : string (default = '127.0.0.1')
        ip address at which the HTML will be served.

    port : int (default = 8888)
        the port at which to serve the HTML. If it is already
        in use, a nearby open port is used instead.

    open_browser : bool (optional)
        if True (default), then open a web browser to the given HTML
    """

    port = find_open_port(ip, port, n=50)
    handler_cls = generate_handler(html)

    srvr = server.HTTPServer((ip, port), handler_cls)

    # From here on the socket is bound, so release it whatever happens.
    try:
        # Start the server
        print("Serving to http://{0}:{1}/ [Ctrl-C to exit]".format(ip, port))
        sys.stdout.flush()

        if open_browser:
            # Use a thread to open a web browser pointing to the server,
            # because serve_forever() below blocks this thread.
            def b():
                return webbrowser.open("http://{0}:{1}".format(ip, port))

            threading.Thread(target=b).start()

        try:
            srvr.serve_forever()
        except (KeyboardInterrupt, SystemExit):
            print("\nStopping Server...")
    finally:
        srvr.server_close()