thread_main.py
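"""Scrape CERN Courier review articles in parallel, guess each reviewer's
gender from the author's first name, and save the results to CSV for
plotting. Threads are used because the work is I/O-bound (HTTP requests)."""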
from bs4 import BeautifulSoup
import requests
from requests import Session
import gender_guesser.detector as gender
import pandas as pd
from beaupy.spinners import DOTS, Spinner
from concurrent.futures import ThreadPoolExecutor, as_completed

from plot import process_and_save_plot


def fetch_articles_links(session: Session, page: int, headers: dict) -> set:
    """Return the set of article URLs found on one page of the reviews listing."""
    root = 'https://cerncourier.com'
    url_rev = f'{root}/l/reviews'
    url = f'{url_rev}/page/{page}'
    req = session.get(url, headers=headers)  # use the shared session so TCP connections are reused
    soup = BeautifulSoup(req.text, "html.parser")
    all_links = {link['href'] for link in soup.find_all('a', href=True)}
    specific_links = {link for link in all_links if link.startswith('https://cerncourier.com/a/')}
    return specific_links
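
# Illustrative usage (actual links depend on the live site):
#   with requests.Session() as s:
#       links = fetch_articles_links(s, 1, headers)
#   # -> {'https://cerncourier.com/a/<some-review-slug>/', ...}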


def fetch_article_info(link: str, headers: dict) -> dict:
    """Scrape title, publication date, and author byline from one article page."""
    req = requests.get(link, headers=headers)
    soup = BeautifulSoup(req.text, "html.parser")
    article_info: dict = {}
    try:
        article_info['title'] = soup.find("h1", class_="single-header__heading").get_text(separator=" ").strip()
    except AttributeError:  # element missing on the page
        article_info['title'] = "Not Found"
    try:
        article_info['date'] = soup.find("div", class_="single-header__meta").get_text(separator=" ").strip()
    except AttributeError:
        article_info['date'] = "Not Found"
    author_div = soup.find("div", class_="author-byline")
    if author_div is not None:
        article_info['author'] = author_div.get_text(separator=" ").strip()
    else:
        article_info['author'] = "Not Found"
    print(article_info['author'])  # debug: shows progress while threads run
    return article_info


def determine_gender(name: str, detector: gender.Detector) -> str:
    """Guess a gender from the author's first name; 'unknown' if no author was found."""
    if name == "Not Found":
        return "unknown"
    return detector.get_gender(name.split()[0])
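
# determine_gender() passes through gender_guesser's labels: besides
# 'male', 'female', and 'unknown', get_gender() can return 'andy'
# (androgynous), 'mostly_male', or 'mostly_female' for ambiguous names, e.g.
#   determine_gender('Marie Curie', detector)  # -> 'female'
#   determine_gender('Not Found', detector)    # -> 'unknown'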


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Encoding': 'utf-8'
    }
    articles_urls = set()
    articles_info: list = []
    detector = gender.Detector()

    with requests.Session() as session:
        spinner = Spinner(DOTS, '[green]Waiting for HTTP replies...[/green]')
        spinner.start()
        # Fetch the article links from all 81 listing pages in parallel
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(fetch_articles_links, session, page, headers) for page in range(1, 82)]
            for future in as_completed(futures):
                articles_urls |= future.result()  # merge each page's links into one set
        spinner.stop()

        # Fetch each article's metadata in parallel
        spinner = Spinner(DOTS, '[green]Waiting for article info retrieval...[/green]')
        spinner.start()
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(fetch_article_info, link, headers) for link in articles_urls]
            for future in as_completed(futures):
                articles_info.append(future.result())
        spinner.stop()

        # Guess a gender for every author found
        spinner = Spinner(DOTS, '[green]Waiting for gender analysis... Hold on...[/green]')
        spinner.start()
        for article_info in articles_info:
            article_info['gender'] = determine_gender(article_info['author'], detector)
        spinner.stop()

        df = pd.DataFrame(articles_info)
        df['date'] = pd.to_datetime(df['date'], format='%d %B %Y')
        # Sort chronologically, then drop the title and date columns before saving
        df_sorted = df.sort_values(by='date')
        df_sorted = df_sorted.drop(df_sorted.columns[[0, 1]], axis=1)
        df_sorted.to_csv("reviewers_gender.csv")
        process_and_save_plot(input_csv="reviewers_gender.csv")
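
# Assumed dependencies (PyPI names): beautifulsoup4, requests,
# gender-guesser, pandas, beaupy. `plot` is a local module expected to
# provide process_and_save_plot(input_csv=...).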