Skip to content

Commit 0008c13

Browse files
committed
added additional snippets
1 parent 513228f commit 0008c13

26 files changed

+7612
-341
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
{
2+
"metadata": {
3+
"name": "",
4+
"signature": "sha256:6605f03c0c5c2e7a97fa8e4748b40b3a86d97c85a0ae1c56a944c66dc27ef69e"
5+
},
6+
"nbformat": 3,
7+
"nbformat_minor": 0,
8+
"worksheets": [
9+
{
10+
"cells": [
11+
{
12+
"cell_type": "markdown",
13+
"metadata": {},
14+
"source": [
15+
"# Cleaning Text\n",
16+
"\n",
17+
"This snippet was written by [Chris R. Albon](http://www.chrisralbon.com/) and is part of his collection of [well-documented Python snippets](https://github.com/chrisalbon/code_py). All code is written in Python 3 in an IPython notebook and offered under the [Creative Commons Attribution-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-sa/4.0/).\n",
18+
"\n",
19+
"- Based on [http://nbviewer.ipython.org/gist/rjweiss/7577004](http://nbviewer.ipython.org/gist/rjweiss/7577004)"
20+
]
21+
},
22+
{
23+
"cell_type": "markdown",
24+
"metadata": {},
25+
"source": [
26+
"## Create some raw text"
27+
]
28+
},
29+
{
30+
"cell_type": "code",
31+
"collapsed": false,
32+
"input": [
33+
"# Create a list of three strings.\n",
34+
"incoming_reports = [\"We are attacking on their left flank but are losing many men.\", \n",
35+
" \"We cannot see the enemy army. Nothing else to report.\", \n",
36+
" \"We are ready to attack but are waiting for your orders.\"]"
37+
],
38+
"language": "python",
39+
"metadata": {},
40+
"outputs": [],
41+
"prompt_number": 3
42+
},
43+
{
44+
"cell_type": "markdown",
45+
"metadata": {},
46+
"source": [
47+
"## Separate by word"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"collapsed": false,
53+
"input": [
54+
"# import word tokenizer\n",
55+
"from nltk.tokenize import word_tokenize\n",
56+
"\n",
57+
"# Apply word_tokenize to each element of the list called incoming_reports\n",
58+
"tokenized_reports = [word_tokenize(report) for report in incoming_reports]\n",
59+
"\n",
60+
"# View tokenized_reports\n",
61+
"tokenized_reports"
62+
],
63+
"language": "python",
64+
"metadata": {},
65+
"outputs": [
66+
{
67+
"metadata": {},
68+
"output_type": "pyout",
69+
"prompt_number": 9,
70+
"text": [
71+
"[['We',\n",
72+
" 'are',\n",
73+
" 'attacking',\n",
74+
" 'on',\n",
75+
" 'their',\n",
76+
" 'left',\n",
77+
" 'flank',\n",
78+
" 'but',\n",
79+
" 'are',\n",
80+
" 'losing',\n",
81+
" 'many',\n",
82+
" 'men',\n",
83+
" '.'],\n",
84+
" ['We',\n",
85+
" 'can',\n",
86+
" 'not',\n",
87+
" 'see',\n",
88+
" 'the',\n",
89+
" 'enemy',\n",
90+
" 'army',\n",
91+
" '.',\n",
92+
" 'Nothing',\n",
93+
" 'else',\n",
94+
" 'to',\n",
95+
" 'report',\n",
96+
" '.'],\n",
97+
" ['We',\n",
98+
" 'are',\n",
99+
" 'ready',\n",
100+
" 'to',\n",
101+
" 'attack',\n",
102+
" 'but',\n",
103+
" 'are',\n",
104+
" 'waiting',\n",
105+
" 'for',\n",
106+
" 'your',\n",
107+
" 'orders',\n",
108+
" '.']]"
109+
]
110+
}
111+
],
112+
"prompt_number": 9
113+
},
114+
{
115+
"cell_type": "code",
116+
"collapsed": false,
117+
"input": [
118+
"# Import regex\n",
119+
"import re\n",
120+
"\n",
121+
"# Import string\n",
122+
"import string\n",
123+
"\n",
124+
"\n",
125+
"regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n",
126+
"\n",
127+
"tokenized_reports_no_punctuation = []\n",
128+
"\n",
129+
"for review in tokenized_reports:\n",
130+
" \n",
131+
" new_review = []\n",
132+
" for token in review: \n",
133+
" new_token = regex.sub(u'', token)\n",
134+
" if not new_token == u'':\n",
135+
" new_review.append(new_token)\n",
136+
" \n",
137+
" tokenized_reports_no_punctuation.append(new_review)\n",
138+
" \n",
139+
"tokenized_reports_no_punctuation"
140+
],
141+
"language": "python",
142+
"metadata": {},
143+
"outputs": [
144+
{
145+
"metadata": {},
146+
"output_type": "pyout",
147+
"prompt_number": 12,
148+
"text": [
149+
"[['We',\n",
150+
" 'are',\n",
151+
" 'attacking',\n",
152+
" 'on',\n",
153+
" 'their',\n",
154+
" 'left',\n",
155+
" 'flank',\n",
156+
" 'but',\n",
157+
" 'are',\n",
158+
" 'losing',\n",
159+
" 'many',\n",
160+
" 'men'],\n",
161+
" ['We',\n",
162+
" 'can',\n",
163+
" 'not',\n",
164+
" 'see',\n",
165+
" 'the',\n",
166+
" 'enemy',\n",
167+
" 'army',\n",
168+
" 'Nothing',\n",
169+
" 'else',\n",
170+
" 'to',\n",
171+
" 'report'],\n",
172+
" ['We',\n",
173+
" 'are',\n",
174+
" 'ready',\n",
175+
" 'to',\n",
176+
" 'attack',\n",
177+
" 'but',\n",
178+
" 'are',\n",
179+
" 'waiting',\n",
180+
" 'for',\n",
181+
" 'your',\n",
182+
" 'orders']]"
183+
]
184+
}
185+
],
186+
"prompt_number": 12
187+
},
188+
{
189+
"cell_type": "markdown",
190+
"metadata": {},
191+
"source": [
192+
"## Remove filler words"
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"collapsed": false,
198+
"input": [
199+
"from nltk.corpus import stopwords\n",
200+
"\n",
201+
"tokenized_reports_no_stopwords = []\n",
202+
"for report in tokenized_reports_no_punctuation:\n",
203+
" new_term_vector = []\n",
204+
" for word in report:\n",
205+
" if not word in stopwords.words('english'):\n",
206+
" new_term_vector.append(word)\n",
207+
" tokenized_reports_no_stopwords.append(new_term_vector)\n",
208+
" \n",
209+
"tokenized_reports_no_stopwords"
210+
],
211+
"language": "python",
212+
"metadata": {},
213+
"outputs": [
214+
{
215+
"metadata": {},
216+
"output_type": "pyout",
217+
"prompt_number": 14,
218+
"text": [
219+
"[['We', 'attacking', 'left', 'flank', 'losing', 'many', 'men'],\n",
220+
" ['We', 'see', 'enemy', 'army', 'Nothing', 'else', 'report'],\n",
221+
" ['We', 'ready', 'attack', 'waiting', 'orders']]"
222+
]
223+
}
224+
],
225+
"prompt_number": 14
226+
},
227+
{
228+
"cell_type": "code",
229+
"collapsed": false,
230+
"input": [],
231+
"language": "python",
232+
"metadata": {},
233+
"outputs": []
234+
}
235+
],
236+
"metadata": {}
237+
}
238+
]
239+
}

0 commit comments

Comments
 (0)