diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d830497 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +venv/ +*.vec +*.vec.zip +*.ann +.ipynb_checkpoints/* + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..753e05a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,54 @@ +annoy==1.16.3 +appnope==0.1.0 +attrs==19.3.0 +backcall==0.1.0 +bleach==3.1.5 +certifi==2020.4.5.2 +chardet==3.0.4 +decorator==4.4.2 +defusedxml==0.6.0 +entrypoints==0.3 +idna==2.9 +importlib-metadata==1.6.1 +ipykernel==5.3.0 +ipython==7.15.0 +ipython-genutils==0.2.0 +jedi==0.17.0 +Jinja2==2.11.2 +json5==0.9.5 +jsonschema==3.2.0 +jupyter-client==6.1.3 +jupyter-core==4.6.3 +jupyterlab==2.1.4 +jupyterlab-server==1.1.5 +MarkupSafe==1.1.1 +mistune==0.8.4 +nbconvert==5.6.1 +nbformat==5.0.6 +notebook==6.0.3 +numpy==1.18.5 +packaging==20.4 +pandocfilters==1.4.2 +parso==0.7.0 +pexpect==4.8.0 +pickleshare==0.7.5 +prometheus-client==0.8.0 +prompt-toolkit==3.0.5 +ptyprocess==0.6.0 +Pygments==2.6.1 +pyparsing==2.4.7 +pyrsistent==0.16.0 +python-dateutil==2.8.1 +pyzmq==19.0.1 +requests==2.23.0 +Send2Trash==1.5.0 +six==1.15.0 +terminado==0.8.3 +testpath==0.4.4 +tornado==6.0.4 +tqdm==4.46.1 +traitlets==4.3.3 +urllib3==1.25.9 +wcwidth==0.2.3 +webencodings==0.5.1 +zipp==3.1.0 diff --git a/thesaurus-demo.ipynb b/thesaurus-demo.ipynb new file mode 100644 index 0000000..83a4cb2 --- /dev/null +++ b/thesaurus-demo.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building a thesaurus with Data Science\n", + "\n", + "We're going to build an intelligent thesaurus using a publicly available word embedding dataset. \n", + "\n", + "Approximate Nearest Neighbors will be used to cluster and neighborhood the word embeddings for realtime lookup on our pre-trained corpus."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Dict, IO\n", + "\n", + "from annoy import AnnoyIndex\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training New Embeddings\n", + "\n", + "First we have to get our word vectors from [FastText](https://fasttext.cc/docs/en/english-vectors.html). They have a few pretrained corpora available. \n", + "\n", + "If you have a few million lines of unstructured text sitting around, you can [train your own as well](https://fasttext.cc/docs/en/unsupervised-tutorial.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip\n", + "! unzip wiki-news-300d-1M.vec.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Let's take a peek at one file just so we know what we're working with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! head -n 2 wiki-news-300d-1M.vec" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a standard format for embeddings. The first line contains two pieces of metadata: \n", + "* The number of embeddings, or the length\n", + "* The length of each embedding, or the dimensionality\n", + "\n", + "Subsequent lines contain a word, and space-separated floats comprising its corresponding vector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Indexing Embeddings\n", + "\n", + "Now that we \"trained\" our vectors, let's neighborhood them using [`Annoy`](https://github.com/spotify/annoy).\n", + "\n", + "For a minimal demo, we'll need the embedding file, model dimensionality, and a mapping of integer IDs to words."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_file_name: str = \"wiki-news-300d-1M.vec\"\n", + "index_map: Dict[int,str] = {}\n", + "word_map: Dict[str, int] = {}\n", + "\n", + "with open(embedding_file_name, \"r\") as embedding_file:\n", + " embedding_length, embedding_dimensions = map(\n", + " int, \n", + " embedding_file.readline().strip().split()\n", + " )\n", + " incomplete_neighborhood = AnnoyIndex(embedding_dimensions, \"euclidean\")\n", + " for num, line in tqdm(enumerate(embedding_file), total=embedding_length):\n", + " tokens = line.strip().split()\n", + " word = tokens[0]\n", + " vector = [float(el) for el in tokens[1:]]\n", + " index_map[num] = word\n", + " word_map[word] = num\n", + " incomplete_neighborhood.add_item(num, vector)\n", + " incomplete_neighborhood.build(10)\n", + " incomplete_neighborhood.save(\"wiki_news_neighbors.ann\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Searching Embeddings\n", + "\n", + "Now that we have a neighborhood of embeddings, we can search for similar embeddings by word."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_neighborhood = AnnoyIndex(embedding_dimensions, \"euclidean\")\n", + "new_neighborhood.load(\"wiki_news_neighbors.ann\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def search_neighborhood(\n", + " query: str, \n", + " n_neighbors: int = 7, \n", + " neighborhood=new_neighborhood, \n", + " idx_map: Dict[int, str] = index_map, \n", + " wrd_map: Dict[str, int] = word_map,\n", + " verbose: bool = False\n", + ") -> List[str]:\n", + " if verbose:\n", + " print(f\"query string: {query}\")\n", + " query_idx = wrd_map[query]\n", + " if verbose:\n", + " print(f\"query index: {query_idx}\")\n", + " neighbor_ids = neighborhood.get_nns_by_item(query_idx, n_neighbors)\n", + " if verbose:\n", + " print(f\"neighbor ids: {neighbor_ids}\")\n", + " neighbors = [idx_map.get(n_id, \"NOT FOUND!\") for n_id in neighbor_ids]\n", + " if verbose:\n", + " print(f\"neighbors: {neighbors}\")\n", + " return neighbors\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search_neighborhood(\"dog\", verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🎆🎊🎆\n", + "\n", + "There you have it: a digital thesaurus!\n", + "\n", + "### Links:\n", + "\n", + "* [Spotify/Annoy](https://github.com/spotify/annoy)\n", + "* [FastText](https://fasttext.cc/)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}