webCrawl.py
# -*- coding: utf-8 -*-
# Crawl the URLs listed in hindiurls.txt and append cleaned Hindi
# sentences (Devanagari text only) to hindi.txt. Python 2 script.
import codecs
import re
import urllib2

from bs4 import BeautifulSoup

# Match anything that is not a Devanagari character or whitespace.
regex = re.compile(ur"[^\u0900-\u097F\s]+")

with open("hindiurls.txt") as f:
    for line in f:
        # Fetch the page (strip the trailing newline from the URL)
        # and extract its visible text.
        htmlFile = urllib2.urlopen(line.strip())
        soup = BeautifulSoup(htmlFile, "html.parser")
        text = soup.getText()

        # Treat the Devanagari danda as a sentence terminator, collapse
        # whitespace, and split the page text into sentences.
        test = text.replace(ur"।", ".")
        x = re.sub(r"\s+", " ", test)
        y = x.split(".")

        for i in y:
            clean1 = regex.sub("", i)
            if not clean1 == "" and not clean1.isspace():
                # Append each non-empty cleaned sentence to hindi.txt;
                # the with-block closes the file after every write.
                with codecs.open('hindi.txt', 'a', encoding='utf-8') as out:
                    out.write(re.sub(r"\s+", " ", clean1) + '\n')
        print(line)
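
The script above is Python 2 only (urllib2, ur"..." literals, codecs.open). A minimal sketch of the same crawl-and-clean loop under Python 3, assuming the same hindiurls.txt input and hindi.txt output files, might look like this:

# -*- coding: utf-8 -*-
# Python 3 sketch of the same crawl; urllib.request replaces urllib2.
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Keep only Devanagari characters and whitespace.
devanagari_only = re.compile(r"[^\u0900-\u097F\s]+")

with open("hindiurls.txt") as urls, \
        open("hindi.txt", "a", encoding="utf-8") as out:
    for url in urls:
        url = url.strip()
        if not url:
            continue
        # Fetch the page and extract its visible text.
        html = urlopen(url)
        text = BeautifulSoup(html, "html.parser").get_text()
        # Treat the danda as a sentence terminator and split into sentences.
        for sentence in re.sub(r"\s+", " ", text.replace("।", ".")).split("."):
            cleaned = devanagari_only.sub("", sentence).strip()
            if cleaned:
                out.write(re.sub(r"\s+", " ", cleaned) + "\n")
        print(url)

Opening the output file once, outside the loop, avoids reopening hindi.txt for every sentence as the original does; otherwise the logic is the same.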