// utils.mjs
/*
Various utilities for blog to ebook converter software
Copyright (C) 2022 Akshay S Dinesh
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Find Akshay's contact details at https://asd.learnlearn.in/about/#contact
*/
import crypto from "crypto";
import xmlserializer from "xmlserializer";
import fs from "fs";
import { JSDOM } from "jsdom";
// Directory (relative to the process working directory) that holds cache files.
const CACHEDIR = ".cache";
// Make sure the cache directory exists as soon as this module is loaded;
// with `recursive: true` the call does not throw when it already exists.
fs.mkdirSync(CACHEDIR, { recursive: true });
/**
 * Compute the MD5 digest of a key.
 *
 * @param {string} key - Value to hash.
 * @returns {string} The digest encoded as a hexadecimal string.
 */
export const keyToHash = (key) => {
  const hasher = crypto.createHash("md5");
  hasher.update(key);
  return hasher.digest("hex");
};
/**
 * Map a cache key to the path of its file inside the cache directory.
 *
 * @param {string} key - Cache key.
 * @returns {string} `CACHEDIR`, a slash, and the hash of `key`.
 */
const keyToFilePath = (key) => {
  const fileName = keyToHash(key);
  return [CACHEDIR, fileName].join("/");
};
/**
 * Look up a value in the on-disk cache.
 *
 * @param {string} key - Cache key.
 * @returns {*} The JSON-decoded content of the cache file, or `undefined`
 *   when the file cannot be read or does not contain valid JSON.
 */
export const getCache = (key) => {
  let value;
  try {
    const raw = fs.readFileSync(keyToFilePath(key));
    value = JSON.parse(raw);
  } catch {
    // Any read or parse failure is reported to the caller as a cache miss.
    value = undefined;
  }
  return value;
};
/**
 * Store a value in the on-disk cache, replacing any previous entry for `key`.
 *
 * @param {string} key - Cache key.
 * @param {*} data - Value to store; it is serialized with `JSON.stringify`.
 * @returns {void}
 */
export const setCache = (key, data) => {
  const target = keyToFilePath(key);
  const serialized = JSON.stringify(data);
  fs.writeFileSync(target, serialized);
};
/**
 * Derive an identifier from a seed string.
 *
 * @param {string} seed - Text the identifier is derived from.
 * @returns {string} `id-` followed by the hash of `seed`.
 */
export const createId = (seed) => {
  const digest = keyToHash(seed);
  return "id-".concat(digest);
};
/**
 * Escape text so it can be embedded as HTML/XML character data.
 *
 * Only `&` and `<` are replaced: https://stackoverflow.com/a/1091953 points
 * out that `>`, `'` and `"` are allowed in text content. Because quotes are
 * left alone, the result is NOT safe inside a quoted attribute value.
 *
 * @param {string} html - Raw text.
 * @returns {string} The text with `&` turned into `&amp;` and `<` into `&lt;`.
 */
export const escapeHtml = (html) =>
  html
    // `&` has to go first; otherwise the `&` of a freshly inserted `&lt;`
    // would itself be escaped again.
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;");
/**
 * Return the body of `url` as text, using the on-disk cache when possible.
 *
 * On a cache miss the URL is downloaded with `fetch`, the body is stored in
 * the cache under the URL, and then returned. The response status is not
 * inspected, so whatever body the server sends is cached.
 *
 * @param {string} url - Address to fetch; also used as the cache key.
 * @returns {Promise<string>} The cached or freshly downloaded text.
 */
const fetchUrlText = async (url) => {
  const cached = getCache(url);
  // getCache reports a miss as `undefined`. Compare explicitly so a cached
  // empty body ("") counts as a hit instead of being downloaded every time.
  if (cached != null) {
    console.log(`Got ${url} from cache`);
    return cached;
  }
  console.log(`Downloading text ${url} ...`);
  const res = await fetch(url);
  const text = await res.text();
  setCache(url, text);
  return text;
};
/**
 * Load a URL (through the text cache) and parse it with JSDOM.
 *
 * @param {string} url - Page address; also passed to JSDOM as its `url` option.
 * @returns {Promise<JSDOM>} The JSDOM instance built from the page text.
 */
export const urlToDom = async (url) => {
  const text = await fetchUrlText(url);
  return new JSDOM(text, { url });
};
/**
 * Point every `<img>` under `dom` at a local resource and collect the image data.
 *
 * For each image the original `src` is either decoded (base64 `data:image`
 * URIs) or downloaded. On success the node's `src` becomes `./<id>` and its
 * `srcset` is cleared. Images that cannot be decoded or downloaded are left
 * untouched.
 *
 * @param {{querySelectorAll: Function}} dom - Node to search; it is mutated in
 *   place. Only `querySelectorAll("img")` is used here.
 * @returns {Promise<{dom: object, resources: Array<{id: string, mediaType: string, content: Buffer}>}>}
 *   The same `dom` plus one resource per distinct image source, in first-seen order.
 */
const swapResources = async (dom) => {
  // Keyed by the original src so an image that appears several times is
  // decoded/downloaded only once.
  const resources = new Map();
  for (const node of dom.querySelectorAll("img")) {
    const href = node.src;
    if (!resources.has(href)) {
      if (href.startsWith("data:image")) {
        // https://github.com/DiegoZoracKy/image-data-uri/blob/c4e7fb976283362cd3b8f309d413c99ebef167bd/lib/image-data-uri.js#L24
        const matches = href.match("data:(image/.*);base64,(.*)");
        if (matches) {
          resources.set(href, {
            id: createId(href),
            mediaType: matches[1],
            content: Buffer.from(matches[2], "base64"),
          });
        } else {
          // e.g. a data URI that is not base64-encoded
          console.error(`Unsupported image data URI. Skipping...`);
        }
      } else {
        console.log(`Downloading image: ${href}`);
        try {
          const res = await fetch(href);
          // An error page is not image data; treat it like a failed download.
          if (!res.ok) throw new Error(`HTTP status ${res.status}`);
          const blob = await res.arrayBuffer();
          const mediaType = res.headers.get("content-type");
          resources.set(href, {
            id: createId(href),
            mediaType,
            content: Buffer.from(blob),
          });
        } catch (err) {
          console.error(`Failed to download. Skipping...`, err?.message ?? err);
        }
      }
    }
    const resource = resources.get(href);
    if (resource) {
      node.src = `./${resource.id}`;
      // srcset would still reference the remote candidates.
      node.srcset = "";
    }
  }
  return { dom, resources: [...resources.values()] };
};
/**
 * Turn blog posts into chapter records, one at a time.
 *
 * For each post the title is escaped, the images in `bodyDom` are swapped for
 * local resources, and the body is serialized and wrapped in a small chapter
 * template that also links back to the original post.
 *
 * @param {Iterable<object>|AsyncIterable<object>} blogPosts - Posts with
 *   `title`, `bodyDom`, `id` and `url` properties.
 * @yields {{id: *, title: string, content: string, resources: Array<object>}}
 *   `title` is the escaped title, `content` the chapter markup, and
 *   `resources` the image data collected for the post.
 */
export async function* processBlogPosts(blogPosts) {
  for await (const { title, bodyDom, id, url } of blogPosts) {
    const escapedTitle = escapeHtml(title);
    // The URL is placed inside a double-quoted attribute, so `"` must be
    // escaped as well as `&` and `<`; a raw `&` (query strings) or `"` would
    // otherwise produce malformed markup.
    const escapedUrl = String(url)
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/"/g, "&quot;");
    const { resources, dom: domSwappedWithLocalImages } = await swapResources(
      bodyDom
    );
    const contentHtml = xmlserializer.serializeToString(
      domSwappedWithLocalImages
    );
    yield {
      id,
      title: escapedTitle,
      content: `<div>
<h1>${escapedTitle}</h1>
<div><p><a href="${escapedUrl}">Link to original</a></p></div>
<div>${contentHtml}</div>
</div>`,
      resources,
    };
  }
}