-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript_playwright.py
More file actions
86 lines (69 loc) · 2.53 KB
/
script_playwright.py
File metadata and controls
86 lines (69 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import asyncio
import csv
import pandas as pd
from playwright.async_api import async_playwright
# Configuration
INPUT_CSV = 'collected_thumbnails.csv'   # first column is read as the URL list
OUTPUT_CSV = 'extracted_tables.csv'      # one row per successfully scraped URL
WAIT_TIME = 5 # seconds to wait after page load before querying the DOM
# CSS class names of the table rows to extract. The dots in the class names
# are backslash-escaped so they are treated as literal characters (not as a
# class separator) inside the f'tr.{row_class}' selector below.
escaped_classes = [
"tablescraper-selected-row",
"tablescraper-selected-row\\.2",
"tablescraper-selected-row\\.3",
"tablescraper-selected-row\\.4",
]
def read_urls(file_path):
    """Read the first column of a CSV file and return its non-blank values.

    Args:
        file_path: Path to a CSV file; the first column is treated as a
            list of URLs (the header row is consumed by pandas).

    Returns:
        list[str]: Stripped, non-empty cell values in file order.
    """
    df = pd.read_csv(file_path)
    column_name = df.columns[0]
    # BUG FIX: the original filtered with str(url).strip() but yielded
    # url.strip(), so a non-string cell (e.g. a numeric URL-like value)
    # passed the filter and then raised AttributeError. Coerce once.
    return [s for s in (str(url).strip() for url in df[column_name].dropna()) if s]
async def scrape_table(page, url):
    """Navigate *page* to *url* and extract matching table rows.

    Args:
        page: An open Playwright page object.
        url: The URL to load.

    Returns:
        dict | None: {"URL": url, "<header>_COL<j>": value, ...} where each
        row's first cell is the header and the remaining cells are values,
        or None when no matching rows were found.
    """
    await page.goto(url)
    # Fixed settle delay for dynamically rendered tables.
    await page.wait_for_timeout(WAIT_TIME * 1000)
    table_data = {"URL": url}
    rows = []
    for row_class in escaped_classes:
        elements = await page.query_selector_all(f'tr.{row_class}')
        for element in elements:
            cells = await element.query_selector_all("th, td")
            cell_texts = [await cell.inner_text() for cell in cells]
            rows.append(cell_texts)
    if not rows:
        print(f"❌ No table found at {url}")
        return None
    # BUG FIX: the original built `headers` by filtering out empty rows,
    # then indexed the UNFILTERED `rows` list with the filtered index
    # (rows[i][1:]), misaligning header/value pairs — or raising
    # IndexError — whenever any row was empty. Iterate rows directly.
    # NOTE(review): duplicate headers still overwrite each other, as in
    # the original; confirm that is acceptable for the scraped sites.
    for row in rows:
        if not row:
            continue
        header, *values = row
        for j, value in enumerate(values, start=1):
            table_data[f"{header}_COL{j}"] = value
    print(f"✅ Extracted table from {url}:")
    for row in rows:
        print(row)
    return table_data
async def main():
    """Scrape every URL listed in INPUT_CSV and write results to OUTPUT_CSV.

    Launches one headless Chromium browser, reuses a single page for all
    URLs, collects per-URL dicts from scrape_table(), and writes them as
    one CSV with the union of all keys as columns.
    """
    print("[START] Crawlee-style Playwright scraper starting.")
    urls = read_urls(INPUT_CSV)
    print(f"Found {len(urls)} URLs in {INPUT_CSV}")
    results = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # BUG FIX: close the browser even if context/page creation or an
        # unexpected error escapes the loop; the original skipped
        # browser.close() on any exception raised before it.
        try:
            context = await browser.new_context()
            page = await context.new_page()
            for i, url in enumerate(urls):
                print(f"[{i+1}/{len(urls)}] Scraping {url}")
                try:
                    data = await scrape_table(page, url)
                    if data:
                        results.append(data)
                except Exception as e:
                    # Best-effort per-URL: log and continue with the rest.
                    print(f"❌ Failed to scrape {url}: {e}")
        finally:
            await browser.close()
    if results:
        # Union of all keys so rows with differing headers share one schema.
        keys = sorted({k for r in results for k in r.keys()})
        with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(results)
        print(f"\n[FINISH] Wrote {len(results)} results to {OUTPUT_CSV}")
    else:
        print("\n[FINISH] No data to write.")
# Script entry point: run the async scraper when executed directly.
if __name__ == '__main__':
    asyncio.run(main())