FraneCal · eugenrmain · Mar 25, 2025 · Mar 25, 2025 · FraneCal · Mar 26, 2025
diff --git a/google_maps_scraper.py b/google_maps_scraper.py
@@ -1,6 +1,5 @@
 import subprocess
 import pandas as pd
-from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.keys import Keys
@@ -13,8 +12,8 @@
 
 # Setup and initial configurations
 URL = "https://www.google.com/maps"
-service = "ENTER A SERVICE OR A NAME"  # e.g. catering, events, etc. OR starbucks, mcdonalds, etc.
-location = "ENTER LOCATION"  # e.g. London, Germany, etc.
+service = "SERVICE"  # e.g. catering, events, etc.
+location = "LOCATION"  # e.g. London, Germany, etc.
 
 print("Starting the web scraping script...")
 
@@ -27,112 +26,120 @@
 # Accept cookies
 try:
     print("Looking for accept cookies button...")
-    accept_cookies = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="yDmH0d"]/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[2]/div/div/button')))
+    accept_cookies = WebDriverWait(driver, 10).until(
+        EC.element_to_be_clickable((By.XPATH, '//*[@id="yDmH0d"]/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/div[1]/form[2]/div/div/button'))
+    )
     accept_cookies.click()
     print("Accepted cookies.")
-except NoSuchElementException:
-    print("No accept cookies button found.")
+except Exception:
+    print("No accept cookies button found or already accepted.")
 
 # Search for results
 print(f"Searching for: {service} in {location}")
-input_field = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]')))
-input_field.send_keys(service.lower() + ' ' + location.lower())
+input_field = WebDriverWait(driver, 10).until(
+    EC.element_to_be_clickable((By.XPATH, '//*[@id="searchboxinput"]'))
+)
+input_field.send_keys(f"{service} {location}")
 input_field.send_keys(Keys.ENTER)
 print("Search submitted.")
 
-# Wait for the sidebar to load
-print("Waiting for the sidebar to load...")
-divSideBar = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"div[aria-label*='{service.lower()} {location.lower()}']")))
+# Wait for results to load
+time.sleep(5)
 
-# Scroll through the results
-print("Scrolling the sidebar to load all of the results...")
-previous_scroll_height = driver.execute_script("return arguments[0].scrollHeight", divSideBar)
+# Scroll and collect clickable result cards
+print("Scrolling to load all business listings...")
+scrollable_div_xpath = '//div[@role="feed"]'
+scrollable_div = WebDriverWait(driver, 10).until(
+    EC.presence_of_element_located((By.XPATH, scrollable_div_xpath))
+)
+
+last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_div)
 while True:
-    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", divSideBar)
-    time.sleep(3)
-    new_scroll_height = driver.execute_script("return arguments[0].scrollHeight", divSideBar)
-    if new_scroll_height == previous_scroll_height:
+    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scrollable_div)
+    time.sleep(2)
+    new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_div)
+    if new_height == last_height:
         break
-    previous_scroll_height = new_scroll_height
-print("Finished scrolling.")
-
-# Parse the page source
-print("Parsing the page source...")
-page_source = driver.page_source
-driver.quit()
+    last_height = new_height
 
-soup = BeautifulSoup(page_source, "html.parser")
-boxes = soup.find_all('div', class_='Nv2PK')
+print("Finished scrolling. Collecting business profiles...")
+business_cards = driver.find_elements(By.CLASS_NAME, "Nv2PK")
 
-# Collect data
-print("Collecting data...")
 data = []
 
-for box in boxes:
-    # Business name
-    try:
-        business_name = box.find('div', class_='qBF1Pd').getText()
-    except AttributeError:
-        business_name = "N/A"
-
-    # Address
-    try:
-        inner_div = box.find_all('div', class_='W4Efsd')[1].find('div', class_='W4Efsd')
-        address = [span.text for span in inner_div.find_all('span') if span.text and not span.find('span')][-1]
-    except (IndexError, AttributeError):
-        address = "N/A"
-
-    # Stars
-    try:
-        stars = box.find('span', class_='MW4etd').getText()
-    except AttributeError:
-        stars = "N/A"
-
-    # Number of reviews
-    try:
-        number_of_reviews = box.find('span', class_='UY7F9').getText().strip('()')
-    except AttributeError:
-        number_of_reviews = "N/A"
-
-    # Phone number
-    try:
-        phone_number = box.find('span', class_='UsdlK').getText()
-    except AttributeError:
-        phone_number = "N/A"
-
-    # Website
+for i, card in enumerate(business_cards):
     try:
-        website = box.find('a', class_='lcr4fd').get('href')
-    except AttributeError:
-        website = "N/A"
-
-    # Append to data list
-    data.append({
-        'Business Name': business_name,
-        'Address': address,
-        'Stars': stars,
-        'Number of Reviews': number_of_reviews,
-        'Phone Number': phone_number,
-        'Website': website,
-        'Email': ' ',
-    })
-
-# Create a DataFrame and save to Excel
+        print(f"Processing business {i+1}/{len(business_cards)}")
+        driver.execute_script("arguments[0].scrollIntoView();", card)
+        time.sleep(1)
+        card.click()
+        time.sleep(3)  # Let the profile panel load
+
+        # Collect business data from the profile panel; a field whose element is missing becomes "N/A"
+        try:
+            name = driver.find_element(By.CLASS_NAME, "DUwDvf").text
+        except Exception:  # not a bare except: Ctrl+C / SystemExit must still propagate
+            name = "N/A"
+
+        try:
+            address = driver.find_element(By.XPATH, "//button[contains(@data-item-id, 'address')]//div[2]/div[1]").text
+        except Exception:
+            address = "N/A"
+
+        try:
+            phone = driver.find_element(By.XPATH, "//button[contains(@data-item-id, 'phone')]//div[2]/div[1]").text
+        except Exception:
+            phone = "N/A"
+
+        try:
+            website = driver.find_element(By.XPATH, "//a[contains(@data-item-id, 'authority')]" ).get_attribute("href")
+        except Exception:
+            website = "N/A"
+
+        try:
+            stars = driver.find_element(By.CLASS_NAME, "F7nice").text
+        except Exception:
+            stars = "N/A"
+
+        try:
+            reviews = driver.find_element(By.CLASS_NAME, "UY7F9").text.strip("()")
+        except Exception:
+            reviews = "N/A"
+
+        data.append({
+            'Business Name': name,
+            'Address': address,
+            'Stars': stars,
+            'Number of Reviews': reviews,
+            'Phone Number': phone,
+            'Website': website,
+            'Email': ' ',
+        })
+
+        # Back to the results panel; find_elements returns [] when the button is absent, so the guard is real
+        time.sleep(2)
+        back_buttons = driver.find_elements(By.CLASS_NAME, "RVQdVd")
+        if back_buttons:
+            back_buttons[0].click()
+            time.sleep(2)
+
+    except Exception as e:
+        print(f"Error processing card {i+1}: {e}")
+        continue
+
+# Save to Excel
 excel_file = f'{location}_{service}.xlsx'
 df = pd.DataFrame(data)
 df.to_excel(excel_file, index=False)
 
 print(f"Data has been saved to {excel_file}")
 
-# Create a configuration file
-config = {
-    'excel_file': excel_file
-}
+# Save config
 with open('config.json', 'w') as config_file:
-    json.dump(config, config_file)
+    json.dump({ 'excel_file': excel_file }, config_file)
 print("Configuration file created: config.json")
 
-# Call the email extraction script
+# Run the email extraction script
 print("Calling the email extraction script...")
 subprocess.run(['python', 'email_extraction_script.py'])
 print("Email extraction script completed.")
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
-selenium==4.16.1
-pandas==2.0.2
-beautifulsoup4==4.12.2
+selenium>=4.16.1
+pandas>=2.0.2
+beautifulsoup4>=4.12.2
+openpyxl