crawler.ts
import puppeteer from 'puppeteer';
import * as path from 'path';
import * as fs from 'fs';
import * as os from 'os';
import {
  Config, Parser, GenericParser, isSelectorString, ParsedFieldsMap
} from './configTypes';
import { Database } from './database';
export default async function (config: Config) {
  /**
   * The sandbox is disabled because our Docker image runs into the
   * "No usable sandbox!" issue. This should be acceptable because
   * blogsearch-crawler crawls trusted websites (i.e., your own static
   * websites).
   */
  const browser = await puppeteer.launch({
    ignoreHTTPSErrors: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
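  // Open the output database; it gets one column per configured field.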
  const db = await Database.create({
    filename: config.output,
    columns: config.fields
  });
  let rowid = 0;
  const numCores = os.cpus().length;
  console.log(`${config.entries.length} entries found.`);
  console.log(`${numCores} cores detected. Running ${numCores} workers.`);
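  // Run one worker per CPU core; all workers share the browser and the
  // database handle, and drain the same entry queue.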
  await Promise.all([...Array(numCores).keys()]
    .map(taskNumber => crawlerWorker(config, taskNumber)));
  await browser.close();
  await db.close();
  return;
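  /**
   * A single crawler task: pops entries off the shared queue, loads each one
   * in a fresh page, runs every field parser, and writes the resulting row
   * to the database.
   */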
  async function crawlerWorker ({ fields, entries }: Config, workerNumber: number) {
    console.log(`Worker ${workerNumber} starting...`);
    const context = await browser.createIncognitoBrowserContext();
    // Iterate with Array.pop() because multiple async crawler tasks consume the same array.
    while (entries.length !== 0) {
      // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
      const entry = entries.pop()!;
      console.log(`Worker ${workerNumber}: ${entry}`);
      const page = await context.newPage();
      try {
        await page.goto(resolveEntry(entry), {
          waitUntil: 'networkidle2'
        });
      } catch (error) {
        throw new Error(`Failed to open the page for ${entry}: ${error}`);
      }
      const parsedFields: ParsedFieldsMap = new Map();
      for (const [field, fieldConfig] of fields) {
        const parser = checkParser(fieldConfig.parser);
        try {
          const parsed = await parser(entry, page);
          if (parsed === null) {
            throw new Error('Parsing result is null.');
          }
          parsedFields.set(field, { config: fieldConfig, value: parsed.toString() });
        } catch (error) {
          const msg = error instanceof Error ? error.message : error;
          throw new Error(`Failed to parse '${field}' field in ${entry}: ${msg}`);
        }
      }
      await db.insert(rowid++, parsedFields);
      await page.close();
    }
    await context.close();
    console.log(`Worker ${workerNumber} finished.`);
    return;
  }
}
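/**
 * Resolves an entry to a browsable URL: http(s):// and file:// URLs pass
 * through unchanged, and an existing local path becomes a file:// URL.
 */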
function resolveEntry (entry: string): string {
  if (entry.startsWith('http://') || entry.startsWith('https://') ||
      entry.startsWith('file://')) {
    return entry;
  } else if (fs.existsSync(entry)) {
    return `file://${path.resolve(entry)}`;
  } else {
    throw new Error(`Failed to resolve the entry: ${entry}`);
  }
}
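/**
 * Normalizes a field parser: a CSS selector string becomes a textContent
 * extractor, a missing parser always yields an empty string, and a custom
 * parser function is used as-is.
 */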
function checkParser (parser: Parser): GenericParser<string | null> {
  if (isSelectorString(parser)) {
    const selector = parser;
    return (_, page) => page.$eval(selector, el => el.textContent);
  } else if (!parser) {
    return () => '';
  } else {
    return parser;
  }
}
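
/*
 * Usage sketch, assuming a Config whose `fields` maps each field name to an
 * object with a `parser` (the exact shapes are defined in './configTypes',
 * so the field names and options below are illustrative assumptions):
 *
 *   import crawl from './crawler';
 *
 *   await crawl({
 *     output: 'blogsearch.db',                   // hypothetical output path
 *     entries: ['https://example.com/post-1/'],  // URLs or local file paths
 *     fields: new Map([
 *       // A selector string is turned into a textContent extractor.
 *       ['title', { parser: 'h1' }],
 *       // A custom parser receives the entry and the Puppeteer page.
 *       ['body', { parser: async (_entry, page) =>
 *         page.$eval('article', el => el.textContent) }],
 *     ]),
 *   });
 */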