diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py index fb077f1..c777352 100644 --- a/linkedin_scraper/person.py +++ b/linkedin_scraper/person.py @@ -41,6 +41,7 @@ def __init__( self.accomplishments = accomplishments or [] self.also_viewed_urls = [] self.contacts = contacts or [] + self.scraped_education_keys = set() # Includes the fix for duplicates if driver is None: try: @@ -116,17 +117,28 @@ def get_experiences(self): main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main) for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"): position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']") - company_logo_elem, position_details = position.find_elements(By.XPATH, "*") - # company elem - company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href") - if not company_linkedin_url: + elements = position.find_elements(By.XPATH, "*") + if len(elements) < 2: + continue + + company_logo_elem = elements[0] + position_details = elements[1] + + try: + company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href") + if not company_linkedin_url: + continue + except NoSuchElementException: continue - # position details position_details_list = position_details.find_elements(By.XPATH,"*") position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None + + if not position_summary_details: + continue + outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*") if len(outer_positions) == 4: @@ -147,50 +159,68 @@ def get_experiences(self): location = outer_positions[2].find_element(By.TAG_NAME,"span").text else: position_title = "" - company = outer_positions[0].find_element(By.TAG_NAME,"span").text - work_times = "" + company = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else "" + work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else "" location = "" - - times = work_times.split("·")[0].strip() if work_times else "" - duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None + if work_times: + parts = work_times.split("·") + times = parts[0].strip() if parts else "" + duration = parts[1].strip() if len(parts) > 1 else None + else: + times = "" + duration = None from_date = " ".join(times.split(" ")[:2]) if times else "" - to_date = " ".join(times.split(" ")[3:]) if times else "" - if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")): - inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container") - .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*") - .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item")) + to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else "" + + if position_summary_text and any(element.get_attribute("class") == "pvs-list__container" for element in position_summary_text.find_elements(By.XPATH, "*")): + try: + inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container") + .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*") + .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item")) + except NoSuchElementException: + inner_positions = [] else: inner_positions = [] + if len(inner_positions) > 1: descriptions = inner_positions for description in descriptions: - res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*") - position_title_elem = res[0] if len(res) > 0 else None - work_times_elem = res[1] if len(res) > 1 else None - location_elem = res[2] if len(res) > 2 else None - - - location = location_elem.find_element(By.XPATH,"*").text if location_elem else None - position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else "" - work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else "" - times = work_times.split("·")[0].strip() if work_times else "" - duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None - from_date = " ".join(times.split(" ")[:2]) if times else "" - to_date = " ".join(times.split(" ")[3:]) if times else "" - - experience = Experience( - position_title=position_title, - from_date=from_date, - to_date=to_date, - duration=duration, - location=location, - description=description, - institution_name=company, - linkedin_url=company_linkedin_url - ) - self.add_experience(experience) + try: + res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*") + position_title_elem = res[0] if len(res) > 0 else None + work_times_elem = res[1] if len(res) > 1 else None + location_elem = res[2] if len(res) > 2 else None + + location = location_elem.find_element(By.XPATH,"*").text if location_elem else None + position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else "" + work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else "" + + if work_times: + parts = work_times.split("·") + times = parts[0].strip() if parts else "" + duration = parts[1].strip() if len(parts) > 1 else None + else: + times = "" + duration = None + + from_date = " ".join(times.split(" ")[:2]) if times else "" + to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else "" + + experience = Experience( + position_title=position_title, + from_date=from_date, + to_date=to_date, + duration=duration, + location=location, + description=description, + institution_name=company, + linkedin_url=company_linkedin_url + ) + self.add_experience(experience) + except (NoSuchElementException, IndexError) as e: + continue else: description = position_summary_text.text if position_summary_text else "" @@ -215,47 +245,62 @@ def get_educations(self): self.scroll_to_bottom() main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main) for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"): - position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']") - institution_logo_elem, position_details = position.find_elements(By.XPATH,"*") - - # company elem - institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href") - - # position details - position_details_list = position_details.find_elements(By.XPATH,"*") - position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None - position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None - outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*") - - institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text - if len(outer_positions) > 1: - degree = outer_positions[1].find_element(By.TAG_NAME,"span").text - else: - degree = None - - if len(outer_positions) > 2: - times = outer_positions[2].find_element(By.TAG_NAME,"span").text - - if times != "": - from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0] - to_date = times.split(" ")[-1] - else: - from_date = None - to_date = None - - - - description = position_summary_text.text if position_summary_text else "" + try: + position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']") + + elements = position.find_elements(By.XPATH,"*") + if len(elements) < 2: + continue + + institution_logo_elem = elements[0] + position_details = elements[1] + + try: + institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href") + except NoSuchElementException: + institution_linkedin_url = None + + position_details_list = position_details.find_elements(By.XPATH,"*") + position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None + + if not position_summary_details: + continue + + outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*") + + institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else "" + degree = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else None + + from_date, to_date = None, None + + if len(outer_positions) > 2: + try: + times = outer_positions[2].find_element(By.TAG_NAME,"span").text + if times and "-" in times: + parts = [p.strip() for p in times.split("-")] + from_date = parts[0] + to_date = parts[1] + except (NoSuchElementException, ValueError): + pass + + description = position_details_list[1].text if len(position_details_list) > 1 else "" + + education_key = (institution_name, degree, from_date, to_date) + + if education_key not in self.scraped_education_keys: + education = Education( + from_date=from_date, + to_date=to_date, + description=description, + degree=degree, + institution_name=institution_name, + linkedin_url=institution_linkedin_url + ) + self.add_education(education) + self.scraped_education_keys.add(education_key) - education = Education( - from_date=from_date, - to_date=to_date, - description=description, - degree=degree, - institution_name=institution_name, - linkedin_url=institution_linkedin_url - ) - self.add_education(education) + except (NoSuchElementException, IndexError) as e: + continue def get_name_and_location(self): top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']") @@ -284,12 +329,10 @@ def scrape_logged_in(self, close_on_complete=True): self.focus() self.wait(5) - # get name and location self.get_name_and_location() self.open_to_work = self.is_open_to_work() - # get about self.get_about() driver.execute_script( "window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));" @@ -298,15 +341,12 @@ def scrape_logged_in(self, close_on_complete=True): "window.scrollTo(0, Math.ceil(document.body.scrollHeight/1.5));" ) - # get experience self.get_experiences() - # get education self.get_educations() driver.get(self.linkedin_url) - # get interest try: _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until( @@ -330,7 +370,6 @@ def scrape_logged_in(self, close_on_complete=True): except: pass - # get accomplishment try: _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until( EC.presence_of_element_located( @@ -355,7 +394,6 @@ def scrape_logged_in(self, close_on_complete=True): except: pass - # get connections try: driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/") _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until( @@ -408,4 +446,4 @@ def __repr__(self): int=self.interests, acc=self.accomplishments, conn=self.contacts, - ) + ) \ No newline at end of file