Skip to content

Commit

Permalink
Merge branch 'master' of github.com:typesense/showcase-xkcd-search
Browse files Browse the repository at this point in the history
  • Loading branch information
jasonbosco committed Jun 7, 2024
2 parents 5360287 + b326366 commit 4d27518
Show file tree
Hide file tree
Showing 7 changed files with 228 additions and 95 deletions.
42 changes: 24 additions & 18 deletions .eslintrc.js
Original file line number Diff line number Diff line change
@@ -1,27 +1,33 @@
module.exports = {
"extends": "eslint:recommended",
extends: 'eslint:recommended',
plugins: ['prettier'],
rules: {
'prettier/prettier': 'error',
//https://stackoverflow.com/a/53769213
'prettier/prettier': [
'error',
{
endOfLine: 'auto',
},
],
},
parser: 'babel-eslint',
"parserOptions": {
"ecmaFeatures": {
"jsx": true,
"modules": true
parserOptions: {
ecmaFeatures: {
jsx: true,
modules: true,
},
"ecmaVersion": 2020,
"sourceType": "module",
"useJSXTextNode": true
ecmaVersion: 2020,
sourceType: 'module',
useJSXTextNode: true,
},
"root": true,
"env": {
"browser": true,
"es6": true,
"node": true,
"commonjs": true
root: true,
env: {
browser: true,
es6: true,
node: true,
commonjs: true,
},
globals: {
$: true,
},
"globals": {
"$": true
}
};
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"fetchData": "node scripts/fetchExplainXkcdData.mjs && node scripts/fetchXkcdData.mjs",
"transformData": "node scripts/transformData.mjs",
"indexData": "node scripts/indexData.js",
"refreshData": "yarn fetchData && yarn transformData && yarn indexData",
"refreshData": "yarn transformData && yarn indexData",
"build": "parcel build index.html --public-url https://findxkcd.com"
},
"engines": {
Expand Down
62 changes: 42 additions & 20 deletions scripts/fetchExplainXkcdData.mjs
Original file line number Diff line number Diff line change
@@ -1,32 +1,54 @@
import 'dotenv/config'
import 'dotenv/config';
import fetch from 'node-fetch';
import fs from 'fs';
import { fileURLToPath } from 'url';
import path from 'path';
import { DATA_DIR } from './utils/path.mjs';
import { exponentialBackoffRetry, BatchAPICall } from './utils/network.mjs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const DATA_DIR = path.resolve(__dirname, '../data/raw')

let response
let response;

response = await fetch('https://xkcd.com/info.0.json');
const latestComicId = (await response.json())['num'];
const comicIds = [...Array(latestComicId + 1).keys()].slice(1);
const batchAPICall = new BatchAPICall();

for await (const comicId of comicIds) {
const filePath = `${DATA_DIR}/${comicId}.html`
if(fs.existsSync(filePath)) {
console.log(`Comic ${comicId} already exists. Skipping.`)
const filePath = `${DATA_DIR}/${comicId}.html`;
if (fs.existsSync(filePath)) {
console.log(`Explanation for comic ${comicId} already exists. Skipping.`);
} else {
console.log(`Fetching explanation for comic ${comicId}.`)
response = await fetch(`https://www.explainxkcd.com/wiki/index.php/${comicId}`);
// 🙏 https://stackoverflow.com/a/51302466/123545
const fileStream = fs.createWriteStream(filePath);
await new Promise((resolve, reject) => {
response.body.pipe(fileStream);
response.body.on("error", reject);
fileStream.on("finish", resolve);
});
const request = async () => {
console.log(`Fetching explanation for comic ${comicId}.`);

const fetchExplanation = async () => {
const res = await fetch(
`https://www.explainxkcd.com/wiki/index.php/${comicId}`
);
if (!res.ok) throw new Error('Request failed!');
return res;
};

try {
const response = await exponentialBackoffRetry(fetchExplanation, {
callback: ({ attempt, delayMs }) =>
console.log(
`Retry fetching explanation for comic ${comicId}: attempt ${attempt} after ${delayMs}ms`
),
});
// 🙏 https://stackoverflow.com/a/51302466/123545
const fileStream = fs.createWriteStream(filePath);
await new Promise((resolve, reject) => {
response.body.pipe(fileStream);
response.body.on('error', reject);
fileStream.on('finish', resolve);
});
return `Explanation ${comicId} success`;
} catch (error) {
console.warn(`Error fetching explanation for comic ${comicId}`);
return `Explanation ${comicId} failed`;
}
};
batchAPICall.requestList.push(request);
}
}

batchAPICall.makeRequests();
70 changes: 49 additions & 21 deletions scripts/fetchXkcdData.mjs
Original file line number Diff line number Diff line change
@@ -1,32 +1,60 @@
import 'dotenv/config'
import 'dotenv/config';
import fetch from 'node-fetch';
import fs from 'fs';
import { fileURLToPath } from 'url';
import path from 'path';
import { DATA_DIR } from './utils/path.mjs';
import { exponentialBackoffRetry, BatchAPICall } from './utils/network.mjs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const DATA_DIR = path.resolve(__dirname, '../data/raw')

let response
let response;

response = await fetch('https://xkcd.com/info.0.json');
const latestComicId = (await response.json())['num'];
const comicIds = [...Array(latestComicId + 1).keys()].slice(1);

const batchAPICall = new BatchAPICall();

for await (const comicId of comicIds) {
const filePath = `${DATA_DIR}/${comicId}.json`
if(fs.existsSync(filePath)) {
console.log(`Comic ${comicId} already exists. Skipping.`)
} else {
console.log(`Fetching info for comic ${comicId}.`)
response = await fetch(`https://xkcd.com/${comicId}/info.0.json`);
// 🙏 https://stackoverflow.com/a/51302466/123545
const fileStream = fs.createWriteStream(filePath);
await new Promise((resolve, reject) => {
response.body.pipe(fileStream);
response.body.on("error", reject);
fileStream.on("finish", resolve);
});
const filePath = `${DATA_DIR}/${comicId}.json`;
if (fs.existsSync(filePath)) {
console.log(`Comic ${comicId} already exists. Skipping.`);
continue;
}
if (comicId === 404) {
// id 404 is an April fools joke
const writer = fs.createWriteStream(filePath);
writer.write('{}');
continue;
}
const request = async () => {
const fetchInfo = async () => {
console.log(`Fetching info for comic ${comicId}.`);
const res = await fetch(`https://xkcd.com/${comicId}/info.0.json`);

if (!res.ok) throw new Error('Request failed!');
return res;
};

try {
const response = await exponentialBackoffRetry(fetchInfo, {
callback: ({ attempt, delayMs }) =>
console.log(
`Retry fetching info for comic ${comicId}: attempt ${attempt} after ${delayMs}ms`
),
});
// 🙏 https://stackoverflow.com/a/51302466/123545
const fileStream = fs.createWriteStream(filePath);
await new Promise((resolve, reject) => {
response.body.pipe(fileStream);
response.body.on('error', reject);
fileStream.on('finish', resolve);
});
return `Comic info ${comicId} success`;
} catch (error) {
console.warn(`Error fetching explanation for comic ${comicId}`);
return `Comic info ${comicId} failed`;
}
};

batchAPICall.requestList.push(request);
}

batchAPICall.makeRequests();
88 changes: 53 additions & 35 deletions scripts/transformData.mjs
Original file line number Diff line number Diff line change
@@ -1,60 +1,79 @@
import * as cheerio from 'cheerio';
import fs from 'fs';
import {fileURLToPath} from 'url';
import { fileURLToPath } from 'url';
import path from 'path';
import {DateTime} from "luxon";
import { DateTime } from 'luxon';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const DATA_DIR = path.resolve(__dirname, '../data/raw')
const transformedDataWriteStream = fs.createWriteStream(path.resolve(DATA_DIR, '..', 'transformed_dataset.jsonl'));
const DATA_DIR = path.resolve(__dirname, '../data/raw');
const transformedDataWriteStream = fs.createWriteStream(
path.resolve(DATA_DIR, '..', 'transformed_dataset.jsonl')
);

const dir = fs.opendirSync(DATA_DIR)
let dirent
const dir = fs.opendirSync(DATA_DIR);
let dirent;
while ((dirent = dir.readSync()) !== null) {
if (!dirent.name.endsWith('.html')) {
continue
continue;
}
console.log(`Transforming ${dirent.name}`)
const explainXkcdFileContents = fs.readFileSync(path.resolve(DATA_DIR, dirent.name)).toString();
console.log(`Transforming ${dirent.name}`);
const explainXkcdFileContents = fs
.readFileSync(path.resolve(DATA_DIR, dirent.name))
.toString();
const $ = cheerio.load(explainXkcdFileContents);
const [id, title] = $('#firstHeading').text().split(': ')
const [id, title] = $('#firstHeading').text().split(': ');

const xkcdInfoContents = fs.readFileSync(path.resolve(DATA_DIR, `${id}.json`)).toString();
let transcript = ''
const xkcdInfoContents = fs
.readFileSync(path.resolve(DATA_DIR, `${id}.json`))
.toString();
let transcript = '';

// Read all text in <dl> elements after h1#Transcript
let currentDomElement = $('#Transcript').parent().next();
while(currentDomElement.length > 0 && currentDomElement.prop('tagName') === 'DL') {
transcript += currentDomElement.text()
.replace(/^|\n\b.*?\b: /g, " ") // Remove Speaker Names that have the pattern "Speaker: " since it throws off relevancy
.replace(/\s*\[.*?\]\s*/g, '') // Remove explainers within [...] since it throws off relevancy
.trim()
+ ' ';
while (
currentDomElement.length > 0 &&
currentDomElement.prop('tagName') === 'DL'
) {
transcript +=
currentDomElement
.text()
.replace(/^|\n\b.*?\b: /g, ' ') // Remove Speaker Names that have the pattern "Speaker: " since it throws off relevancy
.replace(/\s*\[.*?\]\s*/g, '') // Remove explainers within [...] since it throws off relevancy
.trim() + ' ';
currentDomElement = currentDomElement.next();
}

let xkcdInfo
let xkcdInfo;
if (id === '404') {
xkcdInfo = {
img: 'https://www.explainxkcd.com/wiki/images/9/92/not_found.png',
month: '4',
year: '2008',
day: '1',
alt: '404 Not Found',
}
};
} else {
xkcdInfo = JSON.parse(xkcdInfoContents)
xkcdInfo = JSON.parse(xkcdInfoContents);
}

const altTitle = xkcdInfo['alt']
const publishDateObject = DateTime.local(parseInt(xkcdInfo['year']), parseInt(xkcdInfo['month']), parseInt(xkcdInfo['day']));
const publishDateYear = publishDateObject.year
const publishDateMonth = publishDateObject.month
const publishDateDay = publishDateObject.day
const publishDateTimestamp = publishDateObject.toSeconds()
const topics = $('#catlinks ul li a').toArray().map(e => e.firstChild.nodeValue).slice(4) // First 4 are not topics
const normalizedTopics = topics.map(t => t.replace(/^Comics featuring /g, ''))
const altTitle = xkcdInfo['alt'];
const publishDateObject = DateTime.local(
parseInt(xkcdInfo['year']),
parseInt(xkcdInfo['month']),
parseInt(xkcdInfo['day'])
);
const publishDateYear = publishDateObject.year;
const publishDateMonth = publishDateObject.month;
const publishDateDay = publishDateObject.day;
const publishDateTimestamp = publishDateObject.toSeconds();
const topics = $('#catlinks ul li a')
.toArray()
.map((e) => e.firstChild.nodeValue)
.slice(4); // First 4 are not topics
const normalizedTopics = topics.map((t) =>
t.replace(/^Comics featuring /g, '')
);

const record = {
id,
Expand All @@ -66,11 +85,10 @@ while ((dirent = dir.readSync()) !== null) {
publishDateDay,
publishDateTimestamp,
topics: normalizedTopics,
imageUrl: xkcdInfo['img']
}
transformedDataWriteStream.write(JSON.stringify(record) + "\n")
imageUrl: xkcdInfo['img'],
};
transformedDataWriteStream.write(JSON.stringify(record) + '\n');
}

transformedDataWriteStream.end()
dir.closeSync()

transformedDataWriteStream.end();
dir.closeSync();
52 changes: 52 additions & 0 deletions scripts/utils/network.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import 'dotenv/config';

// Number of scrape requests to run concurrently per batch.
// Overridable via the SCRAPE_REQUEST_BATCH_SIZE env var (loaded by dotenv);
// explicit radix 10 so values like '052' cannot be misparsed.
const SCRAPE_REQUEST_BATCH_SIZE = Number.parseInt(
  process.env.SCRAPE_REQUEST_BATCH_SIZE || '52',
  10
);

// https://www.codewithyou.com/blog/how-to-implement-retry-with-exponential-backoff-in-nodejs
// https://www.codewithyou.com/blog/how-to-implement-retry-with-exponential-backoff-in-nodejs
/**
 * Run `fn`, retrying with exponential backoff on failure.
 *
 * Delay before retry N is `baseDelayMs * 2 ** N` (so the first retry waits
 * twice the base delay). `callback` is invoked before each wait with
 * `{ attempt, delayMs }` so callers can log retries.
 *
 * @param {() => Promise<*>} fn - async operation to attempt.
 * @param {object} [options]
 * @param {number} [options.maxAttempts=5] - total attempts before giving up.
 * @param {number} [options.baseDelayMs=1000] - base backoff delay in ms.
 * @param {(info: {attempt: number, delayMs: number}) => void} [options.callback]
 * @returns {Promise<*>} resolves with `fn`'s value; rejects with the last
 *   error once `maxAttempts` attempts have all failed.
 */
export function exponentialBackoffRetry(
  fn,
  // `= {}` so the options object is genuinely optional; without it,
  // exponentialBackoffRetry(fn) throws a TypeError on destructuring.
  { maxAttempts = 5, baseDelayMs = 1000, callback = () => {} } = {}
) {
  let attempt = 1;

  const execute = async () => {
    try {
      return await fn();
    } catch (error) {
      if (attempt >= maxAttempts) {
        // Out of attempts: surface the final error to the caller.
        throw error;
      }

      const delayMs = baseDelayMs * 2 ** attempt;
      callback({ attempt, delayMs });
      await new Promise((resolve) => setTimeout(resolve, delayMs));

      attempt++;
      return execute();
    }
  };

  return execute();
}

/**
 * Runs queued async requests in sequential batches of `batchSize`
 * so we never hammer the remote host with everything at once.
 *
 * Usage: push zero-arg async functions onto `requestList`, then call
 * `makeRequests()`.
 */
export class BatchAPICall {
  /**
   * @param {number} [batchSize=SCRAPE_REQUEST_BATCH_SIZE] - max requests
   *   in flight at once.
   */
  constructor(batchSize = SCRAPE_REQUEST_BATCH_SIZE) {
    this.batchSize = batchSize;
  }

  // Queue of zero-arg async functions; callers push onto this directly.
  requestList = [];

  /**
   * Execute every queued request, `batchSize` at a time. Each batch is
   * awaited (and its results logged) before the next batch starts.
   * Previously the loop bound `i <= length / batchSize + 1` combined with
   * `slice((i===0?0:i-1)*batchSize, i*batchSize)` always ran an empty
   * first batch and trailing empty batches; plain chunked iteration
   * avoids that.
   *
   * @returns {Promise<void>}
   */
  async makeRequests() {
    if (this.requestList.length === 0) return console.log('No requests!');

    for (
      let start = 0;
      start < this.requestList.length;
      start += this.batchSize
    ) {
      const batch = this.requestList.slice(start, start + this.batchSize);
      const result = await Promise.all(batch.map((fn) => fn()));
      console.log(result);
    }
  }
}
Loading

0 comments on commit 4d27518

Please sign in to comment.