scrape_users.py
"""
Scrape linkedin URLs by using selenium, to simulate the navigation
(click, scroll) and BeautifulSoup to parse the HTML code of the page
Perform a number of queries and log a number of files
for each scraped user.
Write dataset to mongoDB with the scraped data
"""
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException
from utils import init_driver, get_profile_urls, login, \
    print_scraped_data, load_config, \
    get_unseen_urls, connect_mongo
from time import sleep
from classes.UserScraper import UserScraper
import argparse
import sys
parser = argparse.ArgumentParser(
    description=("Scrape linkedin profiles based on the " +
                 "queries specified in the conf file")
)
parser.add_argument(
    '-c', '--conf',
    type=str,
    metavar='',
    required=True,
    help='Specify the path of the configuration file'
)
args = parser.parse_args()
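# Example invocation (the config path below is a placeholder, not part of the repo):
#   python scrape_users.py --conf conf.json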
conf = load_config(args.conf)
parameters = conf["parameters"]
credentials = conf["credentials"]
CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
QUERIES = parameters["USER_QUERIES"]
N_PAGES = parameters["N_PAGES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
MONGOUSER = credentials["MONGOUSER"]
MONGOPWD = credentials["MONGOPWD"]
HOST = parameters["HOST"]
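# Rough shape of the configuration file, inferred from the keys read above.
# This is only a sketch: the exact format depends on load_config in utils.py,
# and every value shown here is a placeholder.
#
# {
#     "parameters": {
#         "CHROME_PATH": "/path/to/chrome",
#         "CHROMEDRIVER_PATH": "/path/to/chromedriver",
#         "USER_QUERIES": ["site:linkedin.com/in data scientist"],
#         "N_PAGES": 1,
#         "HOST": "localhost"
#     },
#     "credentials": {
#         "LINUSERNAME": "you@example.com",
#         "LINPWD": "your-linkedin-password",
#         "MONGOUSER": "mongo-user",
#         "MONGOPWD": "mongo-password"
#     }
# }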
client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
db = client["linkedin"]
users = db["users"]
driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)
us = UserScraper(driver)
for query in QUERIES:
    # Run the query through Google search and collect result URLs
    driver.get("https://www.google.com")
    sleep(2)
    search_query = driver.find_element_by_name('q')
    try:
        search_query.send_keys(query)
    except ElementNotInteractableException:
        print("ERROR :: Cannot send query. Google might be blocking")
        sys.exit(1)
    sleep(0.5)
    search_query.send_keys(Keys.RETURN)
    profile_urls = get_profile_urls(driver, N_PAGES)
    if len(profile_urls) == 0:
        print()
        print("WARNING :: " +
              "Could not get any URLs for the query\n" + query)
        print("Please double-check that Google is not " +
              "blocking the query")
        continue
    # Keep only URLs that are not already stored in the users collection
    unseen_urls = get_unseen_urls(users, profile_urls)
    if len(unseen_urls) != 0:
        print("INFO :: Resuming from URL", unseen_urls[0])
    else:
        print("INFO :: All URLs from " + str(N_PAGES) +
              " Google-search page(s) for the query " + query +
              " have already been scraped. " +
              "Moving onto the next query if any.")
        continue
    for url in unseen_urls:
        # Scrape each unseen profile and insert it only if it is not a duplicate
        user_data = us.scrape_user(query, url)
        if user_data and \
                not db["users"].count_documents(user_data, limit=1):
            print_scraped_data(user_data)
            users.insert_one(user_data)
driver.quit()
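# A quick post-run sanity check (a sketch, assuming the same Mongo credentials;
# count_documents({}) returns the total number of scraped users stored so far):
#   client = connect_mongo(HOST, MONGOUSER, MONGOPWD)
#   print(client["linkedin"]["users"].count_documents({}))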