Skip to content

Commit

Permalink
ttml, vtt
Browse files Browse the repository at this point in the history
  • Loading branch information
Laurian committed Dec 13, 2019
1 parent abe3b56 commit 4411d07
Show file tree
Hide file tree
Showing 9 changed files with 164 additions and 91 deletions.
10 changes: 7 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@

input/
debug/
output/
input/*
debug/*
output/*

!input/.keep
!debug/.keep
!output/.keep

# Created by https://www.gitignore.io/api/node,code,linux,macos,textmate
# Edit at https://www.gitignore.io/?templates=node,code,linux,macos,textmate
Expand Down
4 changes: 4 additions & 0 deletions .prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
singleQuote: true
trailingComma: es5
printWidth: 120
endOfLine: lf
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
.PHONY: output clean
.DEFAULT_GOAL := output

output: input/subs.info.json input/auto-subs.info.json
node src/convert

input/subs.info.json:
youtube-dl --continue --retries 5 --write-info-json \
Expand Down Expand Up @@ -40,9 +44,6 @@ input/auto-subs.info.json:
--output "input/auto-subs.%(ext)s" \
"https://www.youtube.com/watch?v=hB7aGnfLB-8"

output: input/subs.info.json input/auto-subs.info.json
node src/convert

clean:
rm -f ./input/*
rm -f ./debug/*
Expand Down
Empty file added debug/.keep
Empty file.
Empty file added input/.keep
Empty file.
Empty file added output/.keep
Empty file.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"license": "ISC",
"dependencies": {
"shortid": "^2.2.15",
"vtt.js": "^0.13.0",
"xml2js": "^0.4.22"
}
}
228 changes: 143 additions & 85 deletions src/convert.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
const fs = require("fs");
const xml2js = require("xml2js");
const shortid = require("shortid");
const fs = require('fs');
const xml2js = require('xml2js');
const vtt = require('vtt.js');
const shortid = require('shortid');

const { id: ytid, subtitles } = require("../input/subs.info.json");
const { automatic_captions } = require("../input/auto-subs.info.json");
const { id: ytid, subtitles } = require('../input/subs.info.json');
const { automatic_captions } = require('../input/auto-subs.info.json');

const DEBUG = true;

Expand All @@ -16,10 +17,32 @@ const generateID = () => {
return id;
};

/**
 * Completes the timing information of a list of paragraphs.
 *
 * - A paragraph without a truthy `end` ends where the following paragraph starts.
 * - A paragraph without `words` gets one word per space-separated token of its `text`,
 *   with the paragraph's duration divided evenly between the tokens.
 *
 * The paragraph objects themselves are updated in place (`end`, `words`), but the
 * `paragraphs` array is not reordered.
 *
 * @param {Array<{start: number, end?: number|null, text: string, words?: Array<object>}>} paragraphs
 *   Paragraphs in chronological order.
 * @returns {Array<object>} A new array holding the same paragraph objects, in the same order.
 */
const moreMagic = paragraphs =>
  paragraphs.map((p, position) => {
    // Only the start of the following paragraph is needed, so there is no reason to
    // reverse (and thereby reorder) the caller's array.
    const next = paragraphs[position + 1];
    if (!p.end && next) p.end = next.start;

    if (!p.words || p.words.length === 0) {
      // The last paragraph can still be missing an end time here. Treat it as having no
      // duration so that word timings stay finite numbers instead of NaN (undefined end)
      // or running backwards (null end).
      const duration = p.end ? p.end - p.start : 0;
      const tokens = p.text.split(' ');

      // Character position of the current token inside `p.text`: every previous token
      // contributes its own length plus the single space that followed it.
      let offset = 0;
      p.words = tokens.map((text, index) => {
        const word = {
          id: generateID(),
          start: p.start + Math.floor((index * duration) / tokens.length),
          end: p.start + Math.floor(((index + 1) * duration) / tokens.length),
          offset,
          length: text.length,
          text,
        };
        offset += text.length + 1;
        return word;
      });
    }

    return p;
  });

const main = () => {
const queue = [
["subs", subtitles],
["auto-subs", automatic_captions]
['subs', subtitles],
['auto-subs', automatic_captions],
]
.reduce(
(acc, [type, subs]) => [
Expand All @@ -31,128 +54,163 @@ const main = () => {
lang,
ext,
type,
file: `${type}.${lang}.${ext}`
}))
file: `${type}.${lang}.${ext}`,
})),
],
[]
)
),
],
[]
)
.filter(({ file }) => fs.existsSync(`./input/${file}`));
// .filter(({ lang }) => lang === "ro");
// .filter(({ lang }) => lang === 'ro');
// .slice(0, 1);

console.log(JSON.stringify(queue, null, 2));
DEBUG && console.log(JSON.stringify(queue, null, 2));

queue.forEach(({ lang, type, ext, file }) => {
switch (ext) {
case "srv3":
case 'srv3':
convertSRV3(file, lang, type);
break;

case 'ttml':
convertTTML(file, lang, type);
break;
case 'vtt':
convertVTT(file, lang, type);
break;
default:
console.warn(`Unknown format ${ext}`);
break;
}
});
};

const convertSRV3 = async (file, lang, type) => {
const data = await xml2js.parseStringPromise(
fs.readFileSync(`./input/${file}`, { encoding: "utf-8" }),
/**
 * Converts a WebVTT file from `./input` into a transcript JSON file in `./output`.
 *
 * Every cue becomes one paragraph carrying the cue's start and end time in milliseconds
 * and its trimmed text; `moreMagic` then fills in whatever is still missing.
 * When `DEBUG` is set, the parsed cues and the paragraphs are also dumped to `./debug`.
 *
 * @param {string} file - File name inside `./input`; also used to name the debug and output files.
 * @param {string} lang - Language code stored on the transcript.
 * @param {string} type - Not used by this converter; kept so all converters share one signature.
 * @returns {Promise<void>} Resolves once the transcript has been written.
 */
const convertVTT = async (file, lang, type) => {
  // vtt.js targets browsers; presumably it reads `navigator.userAgent` (confirm against the
  // vtt.js sources). Only install the stub when the runtime has no `navigator` of its own,
  // so an existing global is never clobbered.
  if (typeof global.navigator === 'undefined') global.navigator = { userAgent: '' };

  const cues = [];
  const parser = new vtt.WebVTT.Parser(
    {
      VTTCue: vtt.VTTCue,
      VTTRegion: vtt.VTTRegion,
    },
    vtt.WebVTT.StringDecoder()
  );

  parser.oncue = cue => cues.push(cue);
  // Without a handler, parsing problems are dropped and the output is silently incomplete.
  parser.onparsingerror = error => console.warn(`Failed to parse ${file}: ${error.message}`);

  parser.parse(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }));
  parser.flush();

  DEBUG && fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(cues, null, 2), 'utf8');

  // Cue times are in seconds; round so floating point noise (1.001 * 1e3 = 1000.9999…)
  // never ends up in the millisecond values.
  const toMilliseconds = seconds => Math.round(parseFloat(seconds) * 1e3);

  const paragraphs = moreMagic(
    cues.map(({ startTime, endTime, text }) => {
      const paragraph = { id: generateID(), start: toMilliseconds(startTime) };

      // Keep the cue's own end time instead of letting it be guessed from the next cue.
      const end = toMilliseconds(endTime);
      if (Number.isFinite(end)) paragraph.end = end;

      paragraph.text = text.trim();
      return paragraph;
    })
  );

  DEBUG && fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};

/**
 * Converts a TTML clock-time value (`hh:mm:ss`, seconds optionally fractional) to milliseconds.
 *
 * @param {string} clock - Clock time such as `00:01:01.500`.
 * @returns {number} Whole milliseconds, or NaN when `clock` is not in that format.
 */
const ttmlClockToMs = clock => {
  // Convert every part to a number first: `split` yields strings, and adding a string to a
  // number concatenates ("60" + "01.500") instead of summing.
  const [hh, mm, ss] = clock.split(':').map(Number);
  return Math.round((hh * 3600 + mm * 60 + ss) * 1e3);
};

/**
 * Converts a TTML file from `./input` into a transcript JSON file in `./output`.
 *
 * Every `<p>` of the first `<div>` in the body becomes one paragraph with its `begin`
 * (and, when present, `end`) attribute converted to milliseconds; `moreMagic` then fills
 * in whatever is still missing. When `DEBUG` is set, the parsed XML and the paragraphs
 * are also dumped to `./debug`.
 *
 * @param {string} file - File name inside `./input`; also used to name the debug and output files.
 * @param {string} lang - Language code stored on the transcript.
 * @param {string} type - Not used by this converter; kept so all converters share one signature.
 * @returns {Promise<void>} Resolves once the transcript has been written.
 */
const convertTTML = async (file, lang, type) => {
  const data = await xml2js.parseStringPromise(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }), {
    attrkey: 'attrs',
    charkey: 'text',
    trim: true,
    explicitArray: true,
  });

  DEBUG && fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(data, null, 2), 'utf8');

  const paragraphs = moreMagic(
    data.tt.body[0].div[0].p.map(({ attrs: { begin, end }, text }) => {
      const paragraph = { id: generateID(), start: ttmlClockToMs(begin) };

      // Prefer the end time from the file; paragraphs without one are completed by moreMagic.
      if (end) paragraph.end = ttmlClockToMs(end);

      paragraph.text = text;
      return paragraph;
    })
  );

  DEBUG && fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};

/**
 * Converts an srv3 (`timedtext` XML) file from `./input` into a transcript JSON file in `./output`.
 *
 * Every `<p>` that has `<s>` children or text becomes one paragraph. Its `<s>` children
 * become words whose start is the paragraph start plus the word's own `t` attribute.
 * `moreMagic` then fills in missing end times and words. When `DEBUG` is set, the parsed
 * XML and the paragraphs are also dumped to `./debug`.
 *
 * @param {string} file - File name inside `./input`; also used to name the debug and output files.
 * @param {string} lang - Language code stored on the transcript.
 * @param {string} type - Not used by this converter; kept so all converters share one signature.
 * @returns {Promise<void>} Resolves once the transcript has been written.
 */
const convertSRV3 = async (file, lang, type) => {
  const data = await xml2js.parseStringPromise(fs.readFileSync(`./input/${file}`, { encoding: 'utf-8' }), {
    attrkey: 'attrs',
    charkey: 'text',
    trim: true,
    explicitArray: true,
  });

  DEBUG && fs.writeFileSync(`./debug/${file}.parsed.json`, JSON.stringify(data, null, 2), 'utf8');

  const paragraphs = moreMagic(
    data.timedtext.body[0].p
      .filter(({ s, text }) => !!s || !!text)
      .map(({ attrs: { t }, s = [], text }) => {
        // Always pass the radix so attribute values are read as decimal.
        const start = Number.parseInt(t, 10);

        // Character position of the current word in the space-joined paragraph text.
        let offset = 0;
        const words = s.map(({ attrs: { t = 0, ac = 0 }, text }) => {
          const wordStart = start + Number.parseInt(t, 10);
          const word = {
            id: generateID(),
            start: wordStart,
            // NOTE(review): `ac` is added to the word start as if it were a duration in
            // milliseconds — confirm this against the srv3 format.
            end: wordStart + Number.parseInt(ac, 10),
            offset,
            length: text.length,
            text,
          };
          offset += text.length + 1;
          return word;
        });

        return {
          id: generateID(),
          start,
          // No words means no known end yet; moreMagic derives it from the next paragraph.
          end: words.length > 0 ? words[words.length - 1].end : null,
          text: text ? text : words.map(({ text }) => text).join(' '),
          words,
        };
      })
  );

  DEBUG && fs.writeFileSync(`./debug/${file}.paragraphs.json`, JSON.stringify(paragraphs, null, 2), 'utf8');

  const transcript = {
    id: generateID(),
    lang,
    paragraphs,
  };

  fs.writeFileSync(`./output/${file}.json`, JSON.stringify(transcript, null, 2), 'utf8');
};

main();
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@ util.promisify@~1.0.0:
define-properties "^1.1.2"
object.getownpropertydescriptors "^2.0.3"

vtt.js@^0.13.0:
version "0.13.0"
resolved "https://registry.yarnpkg.com/vtt.js/-/vtt.js-0.13.0.tgz#955c667b34d5325b2012cb9e8ba9bad6e0b11ff8"
integrity sha1-lVxmezTVMlsgEsuei6m61uCxH/g=

xml2js@^0.4.22:
version "0.4.22"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.4.22.tgz#4fa2d846ec803237de86f30aa9b5f70b6600de02"
Expand Down

0 comments on commit 4411d07

Please sign in to comment.