-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1
34 lines (24 loc) · 824 Bytes
/
1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Import the modules used to crawl and clean the page
import codecs
import nltk
import urllib2
import re
from bs4 import BeautifulSoup
import string
from bs4 import SoupStrainer
# Fetch a Marathi Wikipedia page, extract its text with BeautifulSoup, and
# print only the characters whose code points lie in U+002E..U+097F.
PAGE_URL = "https://mr.wikipedia.org/s/2cv"

# Matches runs of characters OUTSIDE U+002E..U+097F (the upper bound is the
# last code point of the Devanagari block). Everything below U+002E --
# spaces, newlines and most ASCII punctuation -- is therefore stripped, while
# ASCII digits and Latin letters fall inside the range and are kept.
#
# A plain u"" literal replaces the original ur"" prefix: the pattern has no
# backslashes other than the \u escapes, so it yields the same regex on
# Python 2, and unlike ur"" it is also accepted by the Python 3 parser.
regex = re.compile(u"[^\u002E-\u097F]+")

f = urllib2.urlopen(PAGE_URL)
try:
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed (and bs4 stops
    # warning about an unspecified parser).
    soup = BeautifulSoup(f, "html.parser")
finally:
    # The response is fully consumed by the parse; release the connection.
    f.close()

text = soup.getText()

# Drop every run of out-of-range characters; what remains is concatenated
# with no separators because whitespace itself is out of range.
cleanest = regex.sub("", text)
print(cleanest)