-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblog_scrapper.py
25 lines (24 loc) · 914 Bytes
/
blog_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import requests
from bs4 import BeautifulSoup as soup
import json
import re
from nltk.corpus import stopwords
import copy
def get_blog_text(url, timeout=10):
    """Fetch a blog page and return its paragraph text with basic word stats.

    The page is downloaded and every ``<script>`` tag whose text contains
    ``window.__APOLLO_STATE__`` is parsed as JSON (the part following
    ``"__ = "``).  For each top-level key containing ``'Paragraph'``, the
    entry's ``'text'`` field is appended to the article text.

    Args:
        url: Address of the page to fetch.
        timeout: Passed to ``requests.get`` so that an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        A 3-tuple ``(words, stats, original_text)``:

        * ``words`` -- alphabetic tokens longer than two characters that are
          not NLTK stop words (matched case-insensitively); original casing
          is kept.
        * ``stats`` -- dict with ``'total words'`` (whitespace-separated
          token count of the raw text) and ``'stop words %'``.
          NOTE(review): despite its name, ``'stop words %'`` holds the
          integer percentage of words that were *kept* after filtering;
          the key and formula are preserved for existing callers.
        * ``original_text`` -- the raw concatenated paragraph text, each
          paragraph prefixed with a single space.
    """
    page = requests.get(url, timeout=timeout)
    html = soup(page.content, 'html.parser')

    # Collect paragraph fragments and join once instead of repeated `+=`.
    pieces = []
    for script in html.find_all('script'):
        if 'window.__APOLLO_STATE__' not in script.text:
            continue
        data = json.loads(script.text.split("__ = ")[1])
        for key, value in data.items():
            if 'Paragraph' in key:
                pieces.append(' {}'.format(value['text']))
    original_text = ''.join(pieces)

    # str.split() without an argument ignores the leading space and any
    # repeated whitespace, so empty strings are not counted as words.
    total_words = len(original_text.split())
    stats = {'total words': total_words}

    letters_only = re.sub(r'[^a-zA-Z]+', ' ', original_text)

    # Load the stop-word list once; a set gives O(1) membership tests.
    stop_set = set(stopwords.words())
    words = [
        word
        for word in letters_only.split()
        # Compare in lower case so capitalised stop words are removed too.
        if len(word) > 2 and word.lower() not in stop_set
    ]

    # Guard against pages where no paragraph text was found.
    kept_percent = (100 * len(words)) // total_words if total_words else 0
    stats['stop words %'] = kept_percent
    return words, stats, original_text