Skip to content

Commit 0008c13

Browse files
committed
added additional snippets
1 parent 513228f commit 0008c13

26 files changed

+7612
-341
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
{
2+
"metadata": {
3+
"name": "",
4+
"signature": "sha256:6605f03c0c5c2e7a97fa8e4748b40b3a86d97c85a0ae1c56a944c66dc27ef69e"
5+
},
6+
"nbformat": 3,
7+
"nbformat_minor": 0,
8+
"worksheets": [
9+
{
10+
"cells": [
11+
{
12+
"cell_type": "markdown",
13+
"metadata": {},
14+
"source": [
15+
"# Cleaning Text\n",
16+
"\n",
17+
"This snippet was written by [Chris R. Albon](http://www.chrisralbon.com/) and is part of his collection of [well-documented Python snippets](https://github.com/chrisalbon/code_py). All code is written in Python 3 in an IPython notebook and offered under the [Creative Commons Attribution-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-sa/4.0/).\n",
18+
"\n",
19+
"- Based on [http://nbviewer.ipython.org/gist/rjweiss/7577004](http://nbviewer.ipython.org/gist/rjweiss/7577004)"
20+
]
21+
},
22+
{
23+
"cell_type": "markdown",
24+
"metadata": {},
25+
"source": [
26+
"## Create some raw text"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"collapsed": false,
32+
"input": [
33+
"# Create a list of three strings.\n",
34+
"incoming_reports = [\"We are attacking on their left flank but are losing many men.\", \n",
35+
" \"We cannot see the enemy army. Nothing else to report.\", \n",
36+
" \"We are ready to attack but are waiting for your orders.\"]"
37+
],
38+
"language": "python",
39+
"metadata": {},
40+
"outputs": [],
41+
"prompt_number": 3
42+
},
43+
{
44+
"cell_type": "markdown",
45+
"metadata": {},
46+
"source": [
47+
"## Separate by word"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"collapsed": false,
53+
"input": [
54+
"# import word tokenizer\n",
55+
"from nltk.tokenize import word_tokenize\n",
56+
"\n",
57+
"# Apply word_tokenize to each element of the list called incoming_reports\n",
58+
"tokenized_reports = [word_tokenize(report) for report in incoming_reports]\n",
59+
"\n",
60+
"# View tokenized_reports\n",
61+
"tokenized_reports"
62+
],
63+
"language": "python",
64+
"metadata": {},
65+
"outputs": [
66+
{
67+
"metadata": {},
68+
"output_type": "pyout",
69+
"prompt_number": 9,
70+
"text": [
71+
"[['We',\n",
72+
" 'are',\n",
73+
" 'attacking',\n",
74+
" 'on',\n",
75+
" 'their',\n",
76+
" 'left',\n",
77+
" 'flank',\n",
78+
" 'but',\n",
79+
" 'are',\n",
80+
" 'losing',\n",
81+
" 'many',\n",
82+
" 'men',\n",
83+
" '.'],\n",
84+
" ['We',\n",
85+
" 'can',\n",
86+
" 'not',\n",
87+
" 'see',\n",
88+
" 'the',\n",
89+
" 'enemy',\n",
90+
" 'army',\n",
91+
" '.',\n",
92+
" 'Nothing',\n",
93+
" 'else',\n",
94+
" 'to',\n",
95+
" 'report',\n",
96+
" '.'],\n",
97+
" ['We',\n",
98+
" 'are',\n",
99+
" 'ready',\n",
100+
" 'to',\n",
101+
" 'attack',\n",
102+
" 'but',\n",
103+
" 'are',\n",
104+
" 'waiting',\n",
105+
" 'for',\n",
106+
" 'your',\n",
107+
" 'orders',\n",
108+
" '.']]"
109+
]
110+
}
111+
],
112+
"prompt_number": 9
113+
},
114+
{
115+
"cell_type": "code",
116+
"collapsed": false,
117+
"input": [
118+
"# Import regex\n",
119+
"import re\n",
120+
"\n",
121+
"# Import string\n",
122+
"import string\n",
123+
"\n",
124+
"\n",
125+
"regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n",
126+
"\n",
127+
"tokenized_reports_no_punctuation = []\n",
128+
"\n",
129+
"for review in tokenized_reports:\n",
130+
" \n",
131+
" new_review = []\n",
132+
" for token in review: \n",
133+
" new_token = regex.sub(u'', token)\n",
134+
" if not new_token == u'':\n",
135+
" new_review.append(new_token)\n",
136+
" \n",
137+
" tokenized_reports_no_punctuation.append(new_review)\n",
138+
" \n",
139+
"tokenized_reports_no_punctuation"
140+
],
141+
"language": "python",
142+
"metadata": {},
143+
"outputs": [
144+
{
145+
"metadata": {},
146+
"output_type": "pyout",
147+
"prompt_number": 12,
148+
"text": [
149+
"[['We',\n",
150+
" 'are',\n",
151+
" 'attacking',\n",
152+
" 'on',\n",
153+
" 'their',\n",
154+
" 'left',\n",
155+
" 'flank',\n",
156+
" 'but',\n",
157+
" 'are',\n",
158+
" 'losing',\n",
159+
" 'many',\n",
160+
" 'men'],\n",
161+
" ['We',\n",
162+
" 'can',\n",
163+
" 'not',\n",
164+
" 'see',\n",
165+
" 'the',\n",
166+
" 'enemy',\n",
167+
" 'army',\n",
168+
" 'Nothing',\n",
169+
" 'else',\n",
170+
" 'to',\n",
171+
" 'report'],\n",
172+
" ['We',\n",
173+
" 'are',\n",
174+
" 'ready',\n",
175+
" 'to',\n",
176+
" 'attack',\n",
177+
" 'but',\n",
178+
" 'are',\n",
179+
" 'waiting',\n",
180+
" 'for',\n",
181+
" 'your',\n",
182+
" 'orders']]"
183+
]
184+
}
185+
],
186+
"prompt_number": 12
187+
},
188+
{
189+
"cell_type": "markdown",
190+
"metadata": {},
191+
"source": [
192+
"## Remove filler words"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"collapsed": false,
198+
"input": [
199+
"from nltk.corpus import stopwords\n",
200+
"\n",
201+
"tokenized_reports_no_stopwords = []\n",
202+
"for report in tokenized_reports_no_punctuation:\n",
203+
" new_term_vector = []\n",
204+
" for word in report:\n",
205+
" if not word in stopwords.words('english'):\n",
206+
" new_term_vector.append(word)\n",
207+
" tokenized_reports_no_stopwords.append(new_term_vector)\n",
208+
" \n",
209+
"tokenized_reports_no_stopwords"
210+
],
211+
"language": "python",
212+
"metadata": {},
213+
"outputs": [
214+
{
215+
"metadata": {},
216+
"output_type": "pyout",
217+
"prompt_number": 14,
218+
"text": [
219+
"[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],\n",
220+
" ['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],\n",
221+
" ['We', 'ready', 'attack', 'waiting', 'orders']]"
222+
]
223+
}
224+
],
225+
"prompt_number": 14
226+
},
227+
{
228+
"cell_type": "code",
229+
"collapsed": false,
230+
"input": [],
231+
"language": "python",
232+
"metadata": {},
233+
"outputs": []
234+
}
235+
],
236+
"metadata": {}
237+
}
238+
]
239+
}

0 commit comments

Comments
 (0)