From 13b72f9203a9dd8917a0b626d9ba9cc970a60284 Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Thu, 16 May 2024 16:24:47 +0800
Subject: [PATCH 1/9] merge fixes (#1)

* fix person.py

* Update person.py

* Update person.py

* Update person.py

* Update person.py

---------

Co-authored-by: Sachin Shankar
---
 linkedin_scraper/person.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index 86d169e..7893688 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -11,7 +11,7 @@


 class Person(Scraper):
-    __TOP_CARD = "pv-top-card"
+    __TOP_CARD = "scaffold-layout__main"
     __WAIT_FOR_ELEMENT_TIMEOUT = 5

     def __init__(
@@ -115,7 +115,7 @@ def get_experiences(self):
         self.scroll_to_bottom()
         main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
         for position in main_list.find_elements(By.XPATH,"li"):
-            position = position.find_element(By.CLASS_NAME,"pvs-entity")
+            position = position.find_element(By.CLASS_NAME,"pvs-entity--padded")
             company_logo_elem, position_details = position.find_elements(By.XPATH,"*")

             # company elem
@@ -240,9 +240,9 @@ def get_educations(self):
             self.add_education(education)

     def get_name_and_location(self):
-        top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
-        self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
-        self.location = top_panels[1].find_element(By.TAG_NAME,"span").text
+        top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
+        self.name = top_panel.find_element(By.TAG_NAME, "h1").text
+        self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']")

     def get_about(self):

From c67544d4bd2843936d22ce3ab6a40566b6bee415 Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Thu, 16 May 2024 16:27:19 +0800
Subject: [PATCH 2/9] Update person.py

---
 linkedin_scraper/person.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index 7893688..6290932 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -242,7 +242,7 @@ def get_educations(self):
     def get_name_and_location(self):
         top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
         self.name = top_panel.find_element(By.TAG_NAME, "h1").text
-        self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']")
+        self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text

     def get_about(self):
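A note on the selector style these two patches switch to: By.XPATH with @class='...' compares the whole class attribute for equality, so "//*[@class='mt2 relative']" silently stops matching if LinkedIn adds, drops, or reorders a single class on the element. A token-based lookup is more tolerant; a minimal sketch (illustrative only, not part of the patches, assuming a logged-in driver as in person.py):

    from selenium.webdriver.common.by import By

    # Exact-match selector, as used in the patch: the class attribute must be
    # exactly the string "mt2 relative", nothing more and nothing less.
    top_panel = driver.find_element(By.XPATH, "//*[@class='mt2 relative']")

    # Token-based alternative: matches any element whose class list contains
    # both tokens, regardless of order or of extra classes being present.
    token = "contains(concat(' ', normalize-space(@class), ' '), ' %s ')"
    top_panel = driver.find_element(
        By.XPATH, "//*[%s and %s]" % (token % "mt2", token % "relative")
    )
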
From 5fc601424331b03ff8b3de337650a61e5078c279 Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Fri, 17 May 2024 01:38:51 +0800
Subject: [PATCH 3/9] fixed person.py working as of 17 May

---
 linkedin_scraper/person.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index 6290932..d2ccc47 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -113,10 +113,10 @@ def get_experiences(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.XPATH,"li"):
-            position = position.find_element(By.CLASS_NAME,"pvs-entity--padded")
-            company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
+            position = position.find_element(By.XPATH, "//div[@data-view-name='profile-component-entity']")
+            company_logo_elem, position_details = position.find_elements(By.XPATH, "*")

             # company elem
             company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
@@ -150,8 +150,8 @@ def get_experiences(self):
             from_date = " ".join(times.split(" ")[:2]) if times else ""
             to_date = " ".join(times.split(" ")[3:]) if times else ""

-            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
-                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
+            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")) > 1:
+                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")
             for description in descriptions:
                 res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
                 position_title_elem = res[0] if len(res) > 0 else None
@@ -200,8 +200,9 @@ def get_educations(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
+            position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
             institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")

             # company elem
@@ -211,7 +212,7 @@ def get_educations(self):
             position_details_list = position_details.find_elements(By.XPATH,"*")
             position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
             position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
-            outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
+            outer_positions = position_summary_details.find_element(By.XPATH,"*")

             institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
             degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
@@ -303,7 +304,7 @@ def scrape_logged_in(self, close_on_complete=True):
             interestContainer = driver.find_element(By.XPATH,
                 "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
             )
-            for interestElement in interestContainer.find_elements(By.XPATH, 
+            for interestElement in interestContainer.find_elements(By.XPATH,
                 "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
             ):
                 interest = Interest(
@@ -326,11 +327,11 @@ def scrape_logged_in(self, close_on_complete=True):
             acc = driver.find_element(By.XPATH,
                 "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
             )
-            for block in acc.find_elements(By.XPATH, 
+            for block in acc.find_elements(By.XPATH,
                "//div[@class='pv-accomplishments-block__content break-words']"
            ):
                category = block.find_element(By.TAG_NAME, "h3")
-                for title in block.find_element(By.TAG_NAME, 
+                for title in block.find_element(By.TAG_NAME,
                    "ul"
                ).find_elements(By.TAG_NAME, "li"):
                    accomplishment = Accomplishment(category.text, title.text)

From 98cde777c0a7b8667d3325440cfe32925a356e6a Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Fri, 17 May 2024 01:56:22 +0800
Subject: [PATCH 4/9] fixed person.py working as of 17 May

---
 linkedin_scraper/person.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index d2ccc47..a0e4378 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -212,7 +212,7 @@ def get_educations(self):
             position_details_list = position_details.find_elements(By.XPATH,"*")
             position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
             position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
-            outer_positions = position_summary_details.find_element(By.XPATH,"*")
+            outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

             institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
             degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
@@ -220,8 +220,9 @@ def get_educations(self):
             if len(outer_positions) > 2:
                 times = outer_positions[2].find_element(By.TAG_NAME,"span").text

-                from_date = " ".join(times.split(" ")[:2])
-                to_date = " ".join(times.split(" ")[3:])
+                if times != "":
+                    from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
+                    to_date = times.split(" ")[-1]
             else:
                 from_date = None
                 to_date = None
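The date logic PATCH 4/9 introduces is easier to see on concrete inputs. A standalone sketch of the same expressions (the helper name is illustrative, not part of the patch), assuming education date strings as LinkedIn renders them:

    def parse_education_dates(times):
        # Same logic as the patch: take the token just before the dash as the
        # start year and the last token as the end year; month names are dropped.
        if times != "":
            tokens = times.split(" ")
            from_date = tokens[tokens.index("-") - 1] if len(tokens) > 3 else tokens[0]
            to_date = tokens[-1]
            return from_date, to_date
        return None, None

    print(parse_education_dates("2019 - 2023"))          # ('2019', '2023')
    print(parse_education_dates("Jan 2019 - Dec 2023"))  # ('2019', '2023')

Note that it assumes a dash is present whenever the string has more than three tokens; a dash-free string of four or more tokens would raise ValueError on .index("-").
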
From 37bdaeb39442b84946ae2f3ca0ebbb7bfa81809e Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Sat, 18 May 2024 00:00:43 +0800
Subject: [PATCH 5/9] fixed person.py working as of 18 May

---
 linkedin_scraper/person.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index a0e4378..1c297a5 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -215,7 +215,10 @@ def get_educations(self):
             outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

             institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
-            degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            if len(outer_positions) > 1:
+                degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            else:
+                degree = None

             if len(outer_positions) > 2:
                 times = outer_positions[2].find_element(By.TAG_NAME,"span").text

From d942d69b430a7951a7e31f61b49812343a476c9a Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Sat, 18 May 2024 01:24:48 +0800
Subject: [PATCH 6/9] create input loop and remove unneeded flow

---
 linkedin_scraper/person.py | 70 --------------------------------------
 samples/scrape_person.py   | 34 +++++++++++++++---
 2 files changed, 30 insertions(+), 74 deletions(-)

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index 1c297a5..a0f3157 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -249,7 +249,6 @@ def get_name_and_location(self):
         self.name = top_panel.find_element(By.TAG_NAME, "h1").text
         self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text
-

     def get_about(self):
         try:
             about = self.driver.find_element(By.ID,"about").find_element(By.XPATH,"..").find_element(By.CLASS_NAME,"display-flex").text
@@ -293,75 +292,6 @@ def scrape_logged_in(self, close_on_complete=True):
             self.get_educations()

         driver.get(self.linkedin_url)
-
-        # get interest
-        try:
-
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            interestContainer = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for interestElement in interestContainer.find_elements(By.XPATH,
-                "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
-            ):
-                interest = Interest(
-                    interestElement.find_element(By.TAG_NAME, "h3").text.strip()
-                )
-                self.add_interest(interest)
-        except:
-            pass
-
-        # get accomplishment
-        try:
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            acc = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for block in acc.find_elements(By.XPATH,
-                "//div[@class='pv-accomplishments-block__content break-words']"
-            ):
-                category = block.find_element(By.TAG_NAME, "h3")
-                for title in block.find_element(By.TAG_NAME,
-                    "ul"
-                ).find_elements(By.TAG_NAME, "li"):
-                    accomplishment = Accomplishment(category.text, title.text)
-                    self.add_accomplishment(accomplishment)
-        except:
-            pass
-
-        # get connections
-        try:
-            driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
-            )
-            connections = driver.find_element(By.CLASS_NAME, "mn-connections")
-            if connections is not None:
-                for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
-                    anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
-                    url = anchor.get_attribute("href")
-                    name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
-                    occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
-                    contact = Contact(name=name, occupation=occupation, url=url)
-                    self.add_contact(contact)
-        except:
-            connections = None
-
         if close_on_complete:
             driver.quit()

diff --git a/samples/scrape_person.py b/samples/scrape_person.py
index 7d4e93f..8578d2a 100644
--- a/samples/scrape_person.py
+++ b/samples/scrape_person.py
@@ -1,9 +1,35 @@
 import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
 from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()
 email = os.getenv("LINKEDIN_USER")
 password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isn't given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password)  # if email and password isn't given, it'll prompt in terminal
+user_input = []
+urls = []
+while True:
+    user_input = input("Enter a comma-separated list of strings: ")
+    if user_input == "exit":
+        break
+    urls = user_input.split(",")
+    results = []
+    for url in urls:
+        print(f'scraping {url}')
+        person = Person(url, driver=driver, close_on_complete=False)
+        company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+        results.append((person, company))
+
+    print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+    for person, company in results:
+        experience = person.experiences[0]
+        print(f'"{person.name}", '
+              f'"{person.location}", '
+              f'"{experience.position_title}", '
+              f'"{experience.institution_name}", '
+              f'"{experience.linkedin_url}", '
+              f'"{company.industry}", '
+              f'"{company.website}", '
+              f'"{company.company_size}", '
+              )
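One design note on the sample PATCH 6/9 adds: the output rows are assembled by hand inside f-strings, so a name or company containing a double quote will produce a malformed row. A csv.writer handles the quoting automatically; a minimal equivalent sketch for the printing step, assuming the same `results` list of (person, company) tuples built in the loop above:

    import csv
    import sys

    # Emit the same header and rows as the hand-rolled f-strings, letting
    # csv.writer quote and escape any fields that need it.
    writer = csv.writer(sys.stdout)
    writer.writerow(["name", "location", "exp_title", "exp_company", "exp_linkedin",
                     "company_industry", "company_website", "company_size"])
    for person, company in results:
        experience = person.experiences[0]
        writer.writerow([person.name, person.location,
                         experience.position_title, experience.institution_name,
                         experience.linkedin_url, company.industry,
                         company.website, company.company_size])
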
From 31612bf9f9080536060d929b4ca4fe1e3397f66c Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Sun, 19 May 2024 18:42:07 +0800
Subject: [PATCH 7/9] create loop program, log errors, and fix scraper

---
 linkedin_scraper/company.py |  6 +++---
 linkedin_scraper/person.py  | 23 +++++++++++++++++++----
 samples/scrape_person.py    |  3 ++-
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py
index 77900eb..9293597 100644
--- a/linkedin_scraper/company.py
+++ b/linkedin_scraper/company.py
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):

         driver.get(self.linkedin_url)

-        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))

         navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")

-        self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+        self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()

         # Click About Tab or View All Link
         try:
@@ -360,6 +360,6 @@ def __repr__(self):
         _output['affiliated_companies'] = self.affiliated_companies
         _output['employees'] = self.employees
         _output['headcount'] = self.headcount
-        
+
         return json.dumps(_output).replace('\n', '')

diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index a0f3157..9c83217 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -1,3 +1,5 @@
+import time
+
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
@@ -115,11 +117,13 @@ def get_experiences(self):
         self.scroll_to_bottom()
         main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
         for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
-            position = position.find_element(By.XPATH, "//div[@data-view-name='profile-component-entity']")
+            position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
             company_logo_elem, position_details = position.find_elements(By.XPATH, "*")

             # company elem
             company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
+            if not company_linkedin_url:
+                continue

             # position details
             position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@ def get_experiences(self):
                company = outer_positions[0].find_element(By.TAG_NAME,"span").text
                work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
                location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+            else:
+                position_title = ""
+                company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+                work_times = ""
+                location = ""
+

            times = work_times.split("·")[0].strip() if work_times else ""
            duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None

            from_date = " ".join(times.split(" ")[:2]) if times else ""
            to_date = " ".join(times.split(" ")[3:]) if times else ""
-
-            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")) > 1:
-                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container").find_element(By.CLASS_NAME,"pvs-list__container").find_elements(By.XPATH,"li")
+            if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
+                inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+                                   .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+                                   .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+            else:
+                inner_positions = []
+            if len(inner_positions) > 1:
+                descriptions = inner_positions
            for description in descriptions:
                res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
                position_title_elem = res[0] if len(res) > 0 else None
diff --git a/samples/scrape_person.py b/samples/scrape_person.py
index 8578d2a..c34f70e 100644
--- a/samples/scrape_person.py
+++ b/samples/scrape_person.py
@@ -10,7 +10,7 @@
 user_input = []
 urls = []
 while True:
-    user_input = input("Enter a comma-separated list of strings: ")
+    user_input = input("Enter a comma-separated list of LinkedIn URLs: ")
     if user_input == "exit":
         break
     urls = user_input.split(",")
@@ -21,6 +21,7 @@
         company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
         results.append((person, company))

+    print('RESULTS:')
     print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
     for person, company in results:
         experience = person.experiences[0]

From 9ec8d1017c860badf1a87d303a6063d0ec20f705 Mon Sep 17 00:00:00 2001
From: Nguyen Quoc Viet
Date: Fri, 2 Aug 2024 22:45:09 +0800
Subject: [PATCH 8/9] Update README.md

---
 README.md | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/README.md b/README.md
index d8f3988..0e6150a 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Linkedin Scraper
-
+Forked from https://github.com/joeyism/linkedin_scraper
 Scrapes Linkedin User Data

 [Linkedin Scraper](#linkedin-scraper)
@@ -59,21 +59,6 @@ First, you must set your chromedriver location by
 export CHROMEDRIVER=~/chromedriver
 ```

-## Sponsor
-[![rds-cost](https://raw.githubusercontent.com/joeyism/linkedin_scraper/master/docs/proxycurl.png)](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism)
-
-Scrape public LinkedIn profile data at scale with [Proxycurl APIs](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism).
-
-• Scraping Public profiles are battle tested in court in HiQ VS LinkedIn case.<br>
-• GDPR, CCPA, SOC2 compliant
-• High rate limit - 300 requests/minute
-• Fast - APIs respond in ~2s
-• Fresh data - 88% of data is scraped real-time, other 12% are not older than 29 days
-• High accuracy
-• Tons of data points returned per profile - -Built for developers, by developers. - ## Usage To use it, just create the class. @@ -283,7 +268,3 @@ company = Company("https://ca.linkedin.com/company/google", driver=driver) #### `scrape(close_on_complete=True)` This is the meat of the code, where execution of this function scrapes the company. If *close_on_complete* is True (which it is by default), then the browser will close upon completion. If scraping of other companies are desired, then you might want to set that to false so you can keep using the same driver. - -## Contribution - -Buy Me A Coffee From 896c5f3e575654c5a566aa404e1f04af0f128ca2 Mon Sep 17 00:00:00 2001 From: Nguyen Quoc Viet Date: Fri, 2 Aug 2024 22:45:56 +0800 Subject: [PATCH 9/9] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0e6150a..a1ef7d3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Linkedin Scraper Forked from https://github.com/joeyism/linkedin_scraper + Scrapes Linkedin User Data [Linkedin Scraper](#linkedin-scraper)