diff --git a/.gitignore b/.gitignore
index 12f2dbf..e484a1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,7 @@
**/__pycache__
**/.ipynb_checkpoints/
**/*.swp
+**/dist/
+**/build/
+**/facets_overview.egg-info/
+
diff --git a/README.md b/README.md
index b1ffaa9..fdcf938 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,8 @@ Key aspects of the visualization are outlier detection and distribution comparis
Interesting values (such as a high proportion of missing data, or very different distributions of a feature across multiple datasets) are highlighted in red.
Features can be sorted by values of interest such as the number of missing values or the skew between the different datasets.
+The Python code that generates the statistics for the visualization can be installed with `pip install facets-overview`.
+
Details about Overview usage can be found in its [README](./facets_overview/README.md).
## Facets Dive
diff --git a/colab_facets.ipynb b/colab_facets.ipynb
index f581cda..c9a1094 100644
--- a/colab_facets.ipynb
+++ b/colab_facets.ipynb
@@ -1,218 +1,132 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
"colab": {
- "autoexec": {
- "startup": false,
- "wait_interval": 0
- }
+ "name": "Facets Dive and Overview Colab Example",
+ "version": "0.3.2",
+ "provenance": []
},
- "colab_type": "code",
- "id": "blPpZw5R3Bb4"
- },
- "outputs": [],
- "source": [
- "# Load UCI census train and test data into dataframes.\n",
- "import pandas as pd\n",
- "features = [\"Age\", \"Workclass\", \"fnlwgt\", \"Education\", \"Education-Num\", \"Marital Status\",\n",
- " \"Occupation\", \"Relationship\", \"Race\", \"Sex\", \"Capital Gain\", \"Capital Loss\",\n",
- " \"Hours per week\", \"Country\", \"Target\"]\n",
- "train_data = pd.read_csv(\n",
- " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
- " names=features,\n",
- " sep=r'\\s*,\\s*',\n",
- " engine='python',\n",
- " na_values=\"?\")\n",
- "test_data = pd.read_csv(\n",
- " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
- " names=features,\n",
- " sep=r'\\s*,\\s*',\n",
- " skiprows=[0],\n",
- " engine='python',\n",
- " na_values=\"?\")"
- ]
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ }
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "autoexec": {
- "startup": false,
- "wait_interval": 0
- },
- "base_uri": "https://localhost:8080/",
- "height": 617
- },
- "colab_type": "code",
- "executionInfo": {
- "elapsed": 4749,
- "status": "ok",
- "timestamp": 1532523415979,
- "user": {
- "displayName": "James Wexler",
- "photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg",
- "userId": "104529426628068202733"
- },
- "user_tz": 240
- },
- "id": "XtOzRy8Z3M36",
- "outputId": "9efa442d-1e11-416e-d57f-e57b6e7e16e4"
- },
- "outputs": [],
- "source": [
- "\n",
- "# Display the Dive visualization for the training data.\n",
- "from IPython.core.display import display, HTML\n",
- "\n",
- "jsonstr = train_data.to_json(orient='records')\n",
- "HTML_TEMPLATE = \"\"\"\n",
- " \n",
- " \n",
- " \n",
- " \"\"\"\n",
- "html = HTML_TEMPLATE.format(jsonstr=jsonstr)\n",
- "display(HTML(html))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "autoexec": {
- "startup": false,
- "wait_interval": 0
- },
- "base_uri": "https://localhost:8080/",
- "height": 125
- },
- "colab_type": "code",
- "executionInfo": {
- "elapsed": 3967,
- "status": "ok",
- "timestamp": 1532522957138,
- "user": {
- "displayName": "James Wexler",
- "photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg",
- "userId": "104529426628068202733"
- },
- "user_tz": 240
+ "cells": [
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "M7JcESAhpKG-",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "#@title Install the facets_overview pip package.\n",
+ "!pip install facets-overview"
+ ],
+ "execution_count": 0,
+ "outputs": []
},
- "id": "B22HH9kyeyQd",
- "outputId": "323a4d74-8d40-480b-ac9f-58ccf7a4f990"
- },
- "outputs": [],
- "source": [
- "# Clone the facets github repo to get access to the python feature stats generation code\n",
- "!git clone https://github.com/pair-code/facets.git"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "autoexec": {
- "startup": false,
- "wait_interval": 0
- }
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "blPpZw5R3Bb4",
+ "colab": {}
+ },
+ "source": [
+ "# Load UCI census train and test data into dataframes.\n",
+ "import pandas as pd\n",
+ "features = [\"Age\", \"Workclass\", \"fnlwgt\", \"Education\", \"Education-Num\", \"Marital Status\",\n",
+ " \"Occupation\", \"Relationship\", \"Race\", \"Sex\", \"Capital Gain\", \"Capital Loss\",\n",
+ " \"Hours per week\", \"Country\", \"Target\"]\n",
+ "train_data = pd.read_csv(\n",
+ " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
+ " names=features,\n",
+ " sep=r'\\s*,\\s*',\n",
+ " engine='python',\n",
+ " na_values=\"?\")\n",
+ "test_data = pd.read_csv(\n",
+ " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
+ " names=features,\n",
+ " sep=r'\\s*,\\s*',\n",
+ " skiprows=[0],\n",
+ " engine='python',\n",
+ " na_values=\"?\")"
+ ],
+ "execution_count": 0,
+ "outputs": []
},
- "colab_type": "code",
- "id": "mjv5Kr1Mflq7"
- },
- "outputs": [],
- "source": [
- "# Add the path to the feature stats generation code.\n",
- "import sys\n",
- "sys.path.insert(0, '/content/facets/facets_overview/python/')\n",
- "\n",
- "# Create the feature stats for the datasets and stringify it.\n",
- "import base64\n",
- "from generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n",
- "\n",
- "gfsg = GenericFeatureStatisticsGenerator()\n",
- "proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': train_data},\n",
- " {'name': 'test', 'table': test_data}])\n",
- "protostr = base64.b64encode(proto.SerializeToString()).decode(\"utf-8\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "autoexec": {
- "startup": false,
- "wait_interval": 0
- },
- "base_uri": "https://localhost:8080/",
- "height": 1028
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "XtOzRy8Z3M36",
+ "colab": {}
+ },
+ "source": [
+ "\n",
+ "# Display the Dive visualization for the training data.\n",
+ "from IPython.core.display import display, HTML\n",
+ "\n",
+ "jsonstr = train_data.to_json(orient='records')\n",
+ "HTML_TEMPLATE = \"\"\"\n",
+ " \n",
+ " \n",
+ " \n",
+ " \"\"\"\n",
+ "html = HTML_TEMPLATE.format(jsonstr=jsonstr)\n",
+ "display(HTML(html))"
+ ],
+ "execution_count": 0,
+ "outputs": []
},
- "colab_type": "code",
- "executionInfo": {
- "elapsed": 369,
- "status": "ok",
- "timestamp": 1532523370507,
- "user": {
- "displayName": "James Wexler",
- "photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg",
- "userId": "104529426628068202733"
- },
- "user_tz": 240
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "mjv5Kr1Mflq7",
+ "colab": {}
+ },
+ "source": [
+ "# Create the feature stats for the datasets and stringify it.\n",
+ "import base64\n",
+ "from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n",
+ "\n",
+ "gfsg = GenericFeatureStatisticsGenerator()\n",
+ "proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': train_data},\n",
+ " {'name': 'test', 'table': test_data}])\n",
+ "protostr = base64.b64encode(proto.SerializeToString()).decode(\"utf-8\")"
+ ],
+ "execution_count": 0,
+ "outputs": []
},
- "id": "b7zs2p2_goJa",
- "outputId": "22e211df-972f-49b9-f271-75e0d4ba68ee"
- },
- "outputs": [],
- "source": [
- "# Display the facets overview visualization for this data\n",
- "from IPython.core.display import display, HTML\n",
- "\n",
- "HTML_TEMPLATE = \"\"\"\n",
- " \n",
- " \n",
- " \n",
- " \"\"\"\n",
- "html = HTML_TEMPLATE.format(protostr=protostr)\n",
- "display(HTML(html))"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "default_view": {},
- "name": "Facets Dive and Overview Colab Example",
- "provenance": [],
- "version": "0.3.2",
- "views": {}
- },
- "kernelspec": {
- "display_name": "Python 2",
- "language": "python",
- "name": "python2"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 2
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython2",
- "version": "2.7.16"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab_type": "code",
+ "id": "b7zs2p2_goJa",
+ "colab": {}
+ },
+ "source": [
+ "# Display the facets overview visualization for this data\n",
+ "from IPython.core.display import display, HTML\n",
+ "\n",
+ "HTML_TEMPLATE = \"\"\"\n",
+ " \n",
+ " \n",
+ " \n",
+ " \"\"\"\n",
+ "html = HTML_TEMPLATE.format(protostr=protostr)\n",
+ "display(HTML(html))"
+ ],
+ "execution_count": 0,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/facets_dive/Dive_demo.ipynb b/facets_dive/Dive_demo.ipynb
index b44a8a1..da751b8 100644
--- a/facets_dive/Dive_demo.ipynb
+++ b/facets_dive/Dive_demo.ipynb
@@ -37,7 +37,7 @@
{
"data": {
"text/html": [
- "\n",
+ "\n",
" \n",
" \n",
- " \n",
+ " \n",
" \n",
"