-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconcurrent_script.py
More file actions
89 lines (74 loc) · 2.85 KB
/
concurrent_script.py
File metadata and controls
89 lines (74 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import asyncio
import csv
import re
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
# Input CSV whose first column holds the product-page URLs to scrape.
INPUT_CSV = "collected_thumbnails.csv"
# Destination CSV for the scraped product rows.
OUTPUT_CSV = "extracted_full_data.csv"
# Base URL used to absolutize relative document links found on product pages.
DOCUMENT_BASE_URL = "https://www.bfgsupply.com/"
# Maximum number of browser pages open/scraping at the same time.
CONCURRENT_TASKS = 25
PAGE_LOAD_WAIT = 5000  # in milliseconds; fixed wait for client-side rendering
def extract_ids(url):
    """Pull (categoryID, productID, slug) out of a ``/product/`` URL.

    Returns a 3-tuple of strings; three empty strings when the URL
    does not contain a ``/product/<num>/<num>/<slug>`` segment.
    """
    found = re.search(r"/product/(\d+)/(\d+)/(.*)", url)
    return found.groups() if found else ("", "", "")
def read_urls(file_path):
    """Read URLs from the first column of a CSV file.

    Parameters:
        file_path: path to a CSV whose first column contains URLs.

    Returns:
        A list of stripped, non-empty URL strings (NaN rows dropped).
    """
    df = pd.read_csv(file_path)
    col = df.columns[0]
    # Coerce with str() BEFORE stripping: pandas may parse a numeric-looking
    # column as ints/floats, and calling .strip() on those would raise
    # AttributeError.
    return [str(url).strip() for url in df[col].dropna() if str(url).strip()]
async def scrape_page(page, url):
    """Scrape a single product page and return its extracted fields.

    Parameters:
        page: an open Playwright page used to navigate.
        url: the product-page URL to scrape.

    Returns:
        A dict with keys URL, categoryID, productID, slugID, Name, Image
        and Documents (empty values become "NULL"), or None when any
        error occurs while loading or parsing the page.
    """
    from urllib.parse import urljoin

    try:
        await page.goto(url, timeout=60000)
        # Fixed wait to let client-side rendering finish before reading the DOM.
        await page.wait_for_timeout(PAGE_LOAD_WAIT)
        content = await page.content()
        soup = BeautifulSoup(content, "html.parser")

        categoryID, productID, slugID = extract_ids(url)

        name_tag = soup.select_one("#productDetailPage > div.product-detail > div.main h3")
        name = name_tag.get_text(strip=True) if name_tag else ""

        image_tag = soup.select_one("div.top div.main-image img")
        image_url = image_tag["src"] if image_tag and image_tag.has_attr("src") else ""

        doc_links = []
        for a in soup.select("div.details table.document-list-table td:nth-child(2) ul li a"):
            href = a.get("href")
            if href:
                # urljoin handles absolute, root-relative and protocol-relative
                # hrefs correctly; the previous base + lstrip("/") concatenation
                # mangled protocol-relative links such as //cdn.example.com/x.
                doc_links.append(urljoin(DOCUMENT_BASE_URL, href))

        data = {
            "URL": url,
            "categoryID": categoryID,
            "productID": productID,
            "slugID": slugID,
            "Name": name or "NULL",
            "Image": image_url or "NULL",
            "Documents": ", ".join(doc_links) or "NULL",
        }
        print(data)
        return data
    except Exception as e:
        # Best-effort scraping: log and skip this URL rather than abort the run.
        print(f"Error scraping {url}: {e}")
        return None
async def run():
    """Scrape every URL from INPUT_CSV concurrently and write OUTPUT_CSV.

    Launches one headless Chromium browser, fans out page scrapes limited
    by CONCURRENT_TASKS, then writes all successful rows to OUTPUT_CSV.
    """
    urls = read_urls(INPUT_CSV)
    results = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Bound how many pages are open/scraping at once.
        sem = asyncio.Semaphore(CONCURRENT_TASKS)

        async def process_url(url):
            async with sem:
                page = await browser.new_page()
                try:
                    data = await scrape_page(page, url)
                finally:
                    # Always close the page, even if scrape_page raises
                    # (e.g. task cancellation), so failures cannot leak
                    # browser pages.
                    await page.close()
                if data:
                    results.append(data)

        await asyncio.gather(*(process_url(url) for url in urls))
        await browser.close()

    if results:
        keys = results[0].keys()
        with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(results)
        print(f"[✓] Scraped and saved {len(results)} rows to {OUTPUT_CSV}")
# Script entry point: drive the whole scrape inside one asyncio event loop.
if __name__ == "__main__":
    asyncio.run(run())