Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 126 additions & 88 deletions linkedin_scraper/person.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
self.accomplishments = accomplishments or []
self.also_viewed_urls = []
self.contacts = contacts or []
self.scraped_education_keys = set() # (institution_name, degree, from_date, to_date) tuples already added; get_educations checks this to skip duplicate entries

if driver is None:
try:
Expand Down Expand Up @@ -116,17 +117,28 @@ def get_experiences(self):
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
company_logo_elem, position_details = position.find_elements(By.XPATH, "*")

# company elem
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
if not company_linkedin_url:
elements = position.find_elements(By.XPATH, "*")
if len(elements) < 2:
continue

company_logo_elem = elements[0]
position_details = elements[1]

try:
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
if not company_linkedin_url:
continue
except NoSuchElementException:
continue

# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None

if not position_summary_details:
continue

outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

if len(outer_positions) == 4:
Expand All @@ -147,50 +159,68 @@ def get_experiences(self):
location = outer_positions[2].find_element(By.TAG_NAME,"span").text
else:
position_title = ""
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
work_times = ""
company = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else ""
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else ""
location = ""


times = work_times.split("·")[0].strip() if work_times else ""
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
if work_times:
parts = work_times.split("·")
times = parts[0].strip() if parts else ""
duration = parts[1].strip() if len(parts) > 1 else None
else:
times = ""
duration = None

from_date = " ".join(times.split(" ")[:2]) if times else ""
to_date = " ".join(times.split(" ")[3:]) if times else ""
if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
.find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else ""

if position_summary_text and any(element.get_attribute("class") == "pvs-list__container" for element in position_summary_text.find_elements(By.XPATH, "*")):
try:
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
.find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
except NoSuchElementException:
inner_positions = []
else:
inner_positions = []

if len(inner_positions) > 1:
descriptions = inner_positions
for description in descriptions:
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
position_title_elem = res[0] if len(res) > 0 else None
work_times_elem = res[1] if len(res) > 1 else None
location_elem = res[2] if len(res) > 2 else None


location = location_elem.find_element(By.XPATH,"*").text if location_elem else None
position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else ""
work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else ""
times = work_times.split("·")[0].strip() if work_times else ""
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
from_date = " ".join(times.split(" ")[:2]) if times else ""
to_date = " ".join(times.split(" ")[3:]) if times else ""

experience = Experience(
position_title=position_title,
from_date=from_date,
to_date=to_date,
duration=duration,
location=location,
description=description,
institution_name=company,
linkedin_url=company_linkedin_url
)
self.add_experience(experience)
try:
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
position_title_elem = res[0] if len(res) > 0 else None
work_times_elem = res[1] if len(res) > 1 else None
location_elem = res[2] if len(res) > 2 else None

location = location_elem.find_element(By.XPATH,"*").text if location_elem else None
position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else ""
work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else ""

if work_times:
parts = work_times.split("·")
times = parts[0].strip() if parts else ""
duration = parts[1].strip() if len(parts) > 1 else None
else:
times = ""
duration = None

from_date = " ".join(times.split(" ")[:2]) if times else ""
to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else ""

experience = Experience(
position_title=position_title,
from_date=from_date,
to_date=to_date,
duration=duration,
location=location,
description=description,
institution_name=company,
linkedin_url=company_linkedin_url
)
self.add_experience(experience)
except (NoSuchElementException, IndexError) as e:
continue
else:
description = position_summary_text.text if position_summary_text else ""

Expand All @@ -215,47 +245,62 @@ def get_educations(self):
self.scroll_to_bottom()
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")

# company elem
institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href")

# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
if len(outer_positions) > 1:
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
else:
degree = None

if len(outer_positions) > 2:
times = outer_positions[2].find_element(By.TAG_NAME,"span").text

if times != "":
from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
to_date = times.split(" ")[-1]
else:
from_date = None
to_date = None



description = position_summary_text.text if position_summary_text else ""
try:
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")

elements = position.find_elements(By.XPATH,"*")
if len(elements) < 2:
continue

institution_logo_elem = elements[0]
position_details = elements[1]

try:
institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
except NoSuchElementException:
institution_linkedin_url = None

position_details_list = position_details.find_elements(By.XPATH,"*")
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None

if not position_summary_details:
continue

outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else ""
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else None

from_date, to_date = None, None

if len(outer_positions) > 2:
try:
times = outer_positions[2].find_element(By.TAG_NAME,"span").text
if times and "-" in times:
parts = [p.strip() for p in times.split("-")]
from_date = parts[0]
to_date = parts[1]
except (NoSuchElementException, ValueError):
pass

description = position_details_list[1].text if len(position_details_list) > 1 else ""

education_key = (institution_name, degree, from_date, to_date)

if education_key not in self.scraped_education_keys:
education = Education(
from_date=from_date,
to_date=to_date,
description=description,
degree=degree,
institution_name=institution_name,
linkedin_url=institution_linkedin_url
)
self.add_education(education)
self.scraped_education_keys.add(education_key)

education = Education(
from_date=from_date,
to_date=to_date,
description=description,
degree=degree,
institution_name=institution_name,
linkedin_url=institution_linkedin_url
)
self.add_education(education)
except (NoSuchElementException, IndexError) as e:
continue

def get_name_and_location(self):
top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
Expand Down Expand Up @@ -284,12 +329,10 @@ def scrape_logged_in(self, close_on_complete=True):
self.focus()
self.wait(5)

# get name and location
self.get_name_and_location()

self.open_to_work = self.is_open_to_work()

# get about
self.get_about()
driver.execute_script(
"window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));"
Expand All @@ -298,15 +341,12 @@ def scrape_logged_in(self, close_on_complete=True):
"window.scrollTo(0, Math.ceil(document.body.scrollHeight/1.5));"
)

# get experience
self.get_experiences()

# get education
self.get_educations()

driver.get(self.linkedin_url)

# get interest
try:

_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
Expand All @@ -330,7 +370,6 @@ def scrape_logged_in(self, close_on_complete=True):
except:
pass

# get accomplishment
try:
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
EC.presence_of_element_located(
Expand All @@ -355,7 +394,6 @@ def scrape_logged_in(self, close_on_complete=True):
except:
pass

# get connections
try:
driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
Expand Down Expand Up @@ -408,4 +446,4 @@ def __repr__(self):
int=self.interests,
acc=self.accomplishments,
conn=self.contacts,
)
)