-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawlee_script.py
More file actions
67 lines (57 loc) · 2.15 KB
/
crawlee_script.py
File metadata and controls
67 lines (57 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import { PlaywrightCrawler, DefaultFingerprintGenerator } from 'crawlee';
import fs from 'fs';
// Load URLs from the input CSV, skipping the header row.
// NOTE(review): assumes each data row is a single column holding the URL —
// confirm against the producer of collected_thumbnails.csv.
const inputCsvPath = 'collected_thumbnails.csv';
const urls = fs.readFileSync(inputCsvPath, 'utf8')
  .split(/\r?\n/)               // tolerate Windows CRLF; '\n' alone leaves a trailing '\r' on every URL
  .map(line => line.trim())     // strip stray whitespace so URLs are clean
  .filter((line, idx) => idx > 0 && line !== '');
// Setup fingerprint generator for browser fingerprint spoofing.
// NOTE(review): verify that `DefaultFingerprintGenerator` is actually exported
// by 'crawlee' in the installed version — newer releases expose fingerprinting
// via the `fingerprint-generator` package / browserPoolOptions instead.
const fingerprintGenerator = new DefaultFingerprintGenerator({
  browsers: [{ name: 'chrome', minVersion: 96 }],
  devices: ['desktop'],
  locales: ['en-US'],
});
// Quote a cell for CSV output when it contains a delimiter, quote, or newline
// (RFC 4180); otherwise emit it as-is. Without this, cells containing commas
// silently shift columns in the printed table.
const csvEscape = (cell) =>
  /[",\n]/.test(cell) ? `"${cell.replace(/"/g, '""')}"` : cell;

const crawler = new PlaywrightCrawler({
  launchContext: {
    launchOptions: {
      headless: true,           // run without a visible browser window
    },
  },
  // Apply a fresh spoofed fingerprint before each navigation.
  // NOTE(review): crawlee's preNavigationHooks normally receive the crawling
  // context ({ page, request, ... }); confirm that `launchContext` with a
  // `useFingerprint` method is available here in the installed crawlee
  // version — recent versions manage fingerprints via
  // browserPoolOptions.useFingerprints instead.
  preNavigationHooks: [
    async ({ launchContext }) => {
      const fingerprint = fingerprintGenerator.getFingerprint({
        browserName: 'chrome',
      });
      launchContext.useFingerprint(fingerprint);
    },
  ],
  // Scrape every matching table row from the page and print it as CSV.
  requestHandler: async ({ page, request, log }) => {
    log.info(`Scraping: ${request.url}`);
    try {
      // Collect the text of every cell in each selected table row.
      const rows = await page.$$eval('tr.tablescraper-selected-row, tr.tablescraper-selected-row2, tr.tablescraper-selected-row3, tr.tablescraper-selected-row4', trs => {
        return trs.map(tr => {
          const cells = Array.from(tr.querySelectorAll('th, td'));
          return cells.map(td => td.innerText.trim());
        });
      });
      // Build a CSV-style output. Header width follows the first row;
      // `rows[0]?.length` is undefined for an empty result, so the loop
      // simply doesn't run and only 'Label' is emitted.
      const headers = ['Label'];
      for (let i = 1; i < rows[0]?.length; i++) {
        headers.push(`tablescraper-selected-row ${i}`);
      }
      // Pad short rows so every printed line has the same column count.
      const formattedRows = rows.map(row => {
        while (row.length < headers.length) row.push('');
        return row;
      });
      console.log(`\n🧾 [Extracted Table from ${request.url}]`);
      console.log(headers.map(csvEscape).join(','));
      formattedRows.forEach(row => {
        console.log(row.map(csvEscape).join(','));
      });
    } catch (err) {
      // Include the error detail — previously `err` was dropped, making
      // failures impossible to diagnose from the logs.
      log.warning(`Failed to extract from: ${request.url} — ${err.message}`);
    }
  },
  maxRequestsPerCrawl: urls.length,   // one visit per input URL, no re-crawling
});
await crawler.run(urls.map(url => ({ url })));