diff --git a/.gitignore b/.gitignore index 12f2dbf..e484a1b 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,7 @@ **/__pycache__ **/.ipynb_checkpoints/ **/*.swp +**/dist/ +**/build/ +**/facets_overview.egg-info/ + diff --git a/README.md b/README.md index b1ffaa9..fdcf938 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,8 @@ Key aspects of the visualization are outlier detection and distribution comparis Interesting values (such as a high proportion of missing data, or very different distributions of a feature across multiple datasets) are highlighted in red. Features can be sorted by values of interest such as the number of missing values or the skew between the different datasets. +The python code to generate the statistics for visualization can be installed through `pip install facets-overview`. + Details about Overview usage can be found in its [README](./facets_overview/README.md). ## Facets Dive diff --git a/colab_facets.ipynb b/colab_facets.ipynb index f581cda..c9a1094 100644 --- a/colab_facets.ipynb +++ b/colab_facets.ipynb @@ -1,218 +1,132 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } + "name": "Facets Dive and Overview Colab Example", + "version": "0.3.2", + "provenance": [] }, - "colab_type": "code", - "id": "blPpZw5R3Bb4" - }, - "outputs": [], - "source": [ - "# Load UCI census train and test data into dataframes.\n", - "import pandas as pd\n", - "features = [\"Age\", \"Workclass\", \"fnlwgt\", \"Education\", \"Education-Num\", \"Marital Status\",\n", - " \"Occupation\", \"Relationship\", \"Race\", \"Sex\", \"Capital Gain\", \"Capital Loss\",\n", - " \"Hours per week\", \"Country\", \"Target\"]\n", - "train_data = pd.read_csv(\n", - " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n", - " names=features,\n", - " sep=r'\\s*,\\s*',\n", - " engine='python',\n", - " na_values=\"?\")\n", - "test_data = pd.read_csv(\n", - " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n", - " names=features,\n", - " sep=r'\\s*,\\s*',\n", - " skiprows=[0],\n", - " engine='python',\n", - " na_values=\"?\")" - ] + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "base_uri": "https://localhost:8080/", - "height": 617 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 4749, - "status": "ok", - "timestamp": 1532523415979, - "user": { - "displayName": "James Wexler", - "photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg", - "userId": "104529426628068202733" - }, - "user_tz": 240 - }, - "id": "XtOzRy8Z3M36", - "outputId": "9efa442d-1e11-416e-d57f-e57b6e7e16e4" - }, - "outputs": [], - "source": [ - "\n", - "# Display the Dive visualization for the training data.\n", - "from IPython.core.display import display, HTML\n", - "\n", - "jsonstr = train_data.to_json(orient='records')\n", - "HTML_TEMPLATE = \"\"\"\n", - " \n", - " \n", - " \n", - " \"\"\"\n", - "html = HTML_TEMPLATE.format(jsonstr=jsonstr)\n", - "display(HTML(html))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "base_uri": "https://localhost:8080/", - "height": 125 - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 3967, - "status": "ok", - "timestamp": 1532522957138, - "user": { - "displayName": "James Wexler", - "photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg", - "userId": "104529426628068202733" - }, - "user_tz": 240 + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "M7JcESAhpKG-", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#@title Install the facets_overview pip package.\n", + "!pip install facets-overview" + ], + "execution_count": 0, + "outputs": [] }, - "id": "B22HH9kyeyQd", - "outputId": "323a4d74-8d40-480b-ac9f-58ccf7a4f990" - }, - "outputs": [], - "source": [ - "# Clone the facets github repo to get access to the python feature stats generation code\n", - "!git clone https://github.com/pair-code/facets.git" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "blPpZw5R3Bb4", + "colab": {} + }, + "source": [ + "# Load UCI census train and test data into dataframes.\n", + "import pandas as pd\n", + "features = [\"Age\", \"Workclass\", \"fnlwgt\", \"Education\", \"Education-Num\", \"Marital Status\",\n", + " \"Occupation\", \"Relationship\", \"Race\", \"Sex\", \"Capital Gain\", \"Capital Loss\",\n", + " \"Hours per week\", \"Country\", \"Target\"]\n", + "train_data = pd.read_csv(\n", + " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n", + " names=features,\n", + " sep=r'\\s*,\\s*',\n", + " engine='python',\n", + " na_values=\"?\")\n", + "test_data = pd.read_csv(\n", + " \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n", + " names=features,\n", + " sep=r'\\s*,\\s*',\n", + " skiprows=[0],\n", + " engine='python',\n", + " na_values=\"?\")" + ], + "execution_count": 0, + "outputs": [] }, - "colab_type": "code", - "id": "mjv5Kr1Mflq7" - }, - "outputs": [], - "source": [ - "# Add the path to the feature stats generation code.\n", - "import sys\n", - "sys.path.insert(0, '/content/facets/facets_overview/python/')\n", - "\n", - "# Create the feature stats for the datasets and stringify it.\n", - "import base64\n", - "from generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n", - "\n", - "gfsg = GenericFeatureStatisticsGenerator()\n", - "proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': train_data},\n", - " {'name': 'test', 'table': test_data}])\n", - "protostr = base64.b64encode(proto.SerializeToString()).decode(\"utf-8\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "base_uri": "https://localhost:8080/", - "height": 1028 + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "XtOzRy8Z3M36", + "colab": {} + }, + "source": [ + "\n", + "# Display the Dive visualization for the training data.\n", + "from IPython.core.display import display, HTML\n", + "\n", + "jsonstr = train_data.to_json(orient='records')\n", + "HTML_TEMPLATE = \"\"\"\n", + " \n", + " \n", + " \n", + " \"\"\"\n", + "html = HTML_TEMPLATE.format(jsonstr=jsonstr)\n", + "display(HTML(html))" + ], + "execution_count": 0, + "outputs": [] }, - "colab_type": "code", - "executionInfo": { - "elapsed": 369, - "status": "ok", - "timestamp": 1532523370507, - "user": { - "displayName": "James Wexler", - "photoUrl": "//lh4.googleusercontent.com/-TJBPojJ2kd8/AAAAAAAAAAI/AAAAAAAAABE/YrSFlsiqR80/s50-c-k-no/photo.jpg", - "userId": "104529426628068202733" - }, - "user_tz": 240 + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "mjv5Kr1Mflq7", + "colab": {} + }, + "source": [ + "# Create the feature stats for the datasets and stringify it.\n", + "import base64\n", + "from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator\n", + "\n", + "gfsg = GenericFeatureStatisticsGenerator()\n", + "proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': train_data},\n", + " {'name': 'test', 'table': test_data}])\n", + "protostr = base64.b64encode(proto.SerializeToString()).decode(\"utf-8\")" + ], + "execution_count": 0, + "outputs": [] }, - "id": "b7zs2p2_goJa", - "outputId": "22e211df-972f-49b9-f271-75e0d4ba68ee" - }, - "outputs": [], - "source": [ - "# Display the facets overview visualization for this data\n", - "from IPython.core.display import display, HTML\n", - "\n", - "HTML_TEMPLATE = \"\"\"\n", - " \n", - " \n", - " \n", - " \"\"\"\n", - "html = HTML_TEMPLATE.format(protostr=protostr)\n", - "display(HTML(html))" - ] - } - ], - "metadata": { - "colab": { - "default_view": {}, - "name": "Facets Dive and Overview Colab Example", - "provenance": [], - "version": "0.3.2", - "views": {} - }, - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.16" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "b7zs2p2_goJa", + "colab": {} + }, + "source": [ + "# Display the facets overview visualization for this data\n", + "from IPython.core.display import display, HTML\n", + "\n", + "HTML_TEMPLATE = \"\"\"\n", + " \n", + " \n", + " \n", + " \"\"\"\n", + "html = HTML_TEMPLATE.format(protostr=protostr)\n", + "display(HTML(html))" + ], + "execution_count": 0, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/facets_dive/Dive_demo.ipynb b/facets_dive/Dive_demo.ipynb index b44a8a1..da751b8 100644 --- a/facets_dive/Dive_demo.ipynb +++ b/facets_dive/Dive_demo.ipynb @@ -37,7 +37,7 @@ { "data": { "text/html": [ - "\n", + "\n", " \n", " \n", - " \n", + " \n", " \n", "