From abe3b5626a15f1dfdc7ca9f187306b3c93368825 Mon Sep 17 00:00:00 2001 From: Laurian Gridinoc Date: Thu, 12 Dec 2019 19:07:25 -0300 Subject: [PATCH] srv3 --- .editorconfig | 20 +++++++ .gitignore | 158 +++++++++++++++++++++++++++++++++++++++++++++++++ .nvmrc | 1 + Makefile | 49 +++++++++++++++ package.json | 15 +++++ src/convert.js | 158 +++++++++++++++++++++++++++++++++++++++++++++++++ yarn.lock | 149 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 550 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 .nvmrc create mode 100644 Makefile create mode 100644 package.json create mode 100644 src/convert.js create mode 100644 yarn.lock diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..8573622 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +root = true + +[*] +indent_style = space +indent_size = 2 + +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +trim_trailing_whitespace = false + +[Dockerfile] +indent_size = 4 + +[Makefile] +indent_style = tab +indent_size = 2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..de28eff --- /dev/null +++ b/.gitignore @@ -0,0 +1,158 @@ + +input/ +debug/ +output/ + +# Created by https://www.gitignore.io/api/node,code,linux,macos,textmate +# Edit at https://www.gitignore.io/?templates=node,code,linux,macos,textmate + +### Code ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r 
+Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### Node ### +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# TypeScript v1 declaration files +typings/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.env.test + +# parcel-bundler cache (https://parceljs.org/) +.cache + +# next.js build output +.next + +# nuxt.js build output +.nuxt + +# react / gatsby +public/ + +# vuepress build output +.vuepress/dist + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +### TextMate ### +*.tmproj +*.tmproject +tmtags + +# End of https://www.gitignore.io/api/node,code,linux,macos,textmate + diff --git a/.nvmrc b/.nvmrc new file mode 100644 index 0000000..e1fcd1e --- /dev/null 
+++ b/.nvmrc
@@ -0,0 +1 @@
+lts/erbium
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0c2336f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,36 @@
+# Fetches subtitle tracks with youtube-dl and converts them with src/convert.js.
+#
+# The video can be overridden on the command line, e.g.
+#   make output VIDEO_URL="https://www.youtube.com/watch?v=..."
+
+.PHONY: output clean
+
+VIDEO_URL ?= https://www.youtube.com/watch?v=hB7aGnfLB-8
+
+# Flags shared by every youtube-dl invocation below.
+YTDL := youtube-dl --continue --retries 5 --write-info-json \
+  --no-check-certificate --ignore-errors \
+  --no-overwrites --skip-download \
+  --all-subs
+
+# Uploaded subtitles, one pass per requested --sub-format.
+input/subs.info.json:
+	$(YTDL) --sub-format srv3/ttml/vtt --output "input/subs.%(ext)s" "$(VIDEO_URL)"
+	$(YTDL) --sub-format ttml --output "input/subs.%(ext)s" "$(VIDEO_URL)"
+	$(YTDL) --sub-format vtt --output "input/subs.%(ext)s" "$(VIDEO_URL)"
+
+# Automatic captions: the same passes with --write-auto-sub added.
+input/auto-subs.info.json:
+	$(YTDL) --sub-format srv3/ttml/vtt --write-auto-sub --output "input/auto-subs.%(ext)s" "$(VIDEO_URL)"
+	$(YTDL) --sub-format ttml --write-auto-sub --output "input/auto-subs.%(ext)s" "$(VIDEO_URL)"
+	$(YTDL) --sub-format vtt --write-auto-sub --output "input/auto-subs.%(ext)s" "$(VIDEO_URL)"
+
+# debug/ and output/ are git-ignored, so create them before converting.
+output: input/subs.info.json input/auto-subs.info.json
+	mkdir -p debug output
+	node src/convert
+
+clean:
+	rm -f ./input/*
+	rm -f ./debug/*
+	rm -f ./output/*
diff --git a/package.json
b/package.json
new file mode 100644
index 0000000..0f7639b
--- /dev/null
+++ b/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "ytdl2transcript",
+  "version": "1.0.0",
+  "description": "",
+  "main": "src/convert.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "shortid": "^2.2.15",
+    "xml2js": "^0.4.22"
+  }
+}
diff --git a/src/convert.js b/src/convert.js
new file mode 100644
index 0000000..82d10b0
--- /dev/null
+++ b/src/convert.js
@@ -0,0 +1,259 @@
+const fs = require("fs");
+const xml2js = require("xml2js");
+const shortid = require("shortid");
+
+const { subtitles } = require("../input/subs.info.json");
+const { automatic_captions } = require("../input/auto-subs.info.json");
+
+const DEBUG = true;
+
+// IDs must start and end with a letter and be purely alphanumeric.
+const ID_PATTERN = /^[a-z][0-9a-z]{2,}[a-z]$/i;
+
+/**
+ * Generates a shortid matching ID_PATTERN, retrying until one does.
+ *
+ * @returns {string} the accepted identifier
+ */
+const generateID = () => {
+  let id;
+  do {
+    id = shortid.generate();
+  } while (!ID_PATTERN.test(id));
+
+  return id;
+};
+
+/**
+ * Parses an attribute value as a base-10 integer.
+ *
+ * @param {string|undefined} value raw attribute value
+ * @param {number} [fallback=0] result when the value is missing or not numeric
+ * @returns {number}
+ */
+const toInt = (value, fallback = 0) => {
+  const parsed = Number.parseInt(value, 10);
+  return Number.isNaN(parsed) ? fallback : parsed;
+};
+
+/**
+ * Pretty-prints `value` as JSON into `dir/name`, creating `dir` if needed.
+ *
+ * @param {string} dir target directory
+ * @param {string} name file name inside `dir`
+ * @param {*} value data to serialise
+ */
+const writeJSON = (dir, name, value) => {
+  fs.mkdirSync(dir, { recursive: true });
+  fs.writeFileSync(`${dir}/${name}`, JSON.stringify(value, null, 2), "utf8");
+};
+
+/**
+ * Lists every subtitle file named by the two info JSONs that exists in ./input.
+ *
+ * @returns {{lang: string, ext: string, type: string, file: string}[]}
+ */
+const buildQueue = () => {
+  const queue = [];
+  const sources = [
+    ["subs", subtitles],
+    ["auto-subs", automatic_captions]
+  ];
+
+  for (const [type, tracks] of sources) {
+    // A missing key in the info JSON simply means "no tracks of this type".
+    for (const [lang, formats] of Object.entries(tracks || {})) {
+      for (const { ext } of formats) {
+        const file = `${type}.${lang}.${ext}`;
+        if (fs.existsSync(`./input/${file}`)) {
+          queue.push({ lang, ext, type, file });
+        }
+      }
+    }
+  }
+
+  return queue;
+};
+
+/**
+ * Turns the <s> children of a paragraph into word objects.
+ *
+ * @param {Array<object|string>} segments parsed <s> elements
+ * @param {number} start paragraph start that each segment's `t` is added to
+ * @returns {object[]} words with id, start, end, offset, length and text
+ */
+const toTimedWords = (segments, start) => {
+  const words = [];
+  let offset = 0;
+
+  for (const segment of segments) {
+    // NOTE(review): xml2js may yield a bare string for an element without
+    // attributes, so both shapes are accepted here - confirm.
+    const { attrs = {}, text = "" } =
+      typeof segment === "string" ? { text: segment } : segment;
+    const wordStart = start + toInt(attrs.t);
+
+    words.push({
+      id: generateID(),
+      start: wordStart,
+      // NOTE(review): `ac` is treated as the word's duration - confirm
+      // against the srv3 format.
+      end: wordStart + toInt(attrs.ac),
+      offset,
+      length: text.length,
+      text
+    });
+    // The next word begins after this one plus the joining space.
+    offset += text.length + 1;
+  }
+
+  return words;
+};
+
+/**
+ * Splits `text` on single spaces and spreads the pieces evenly over
+ * [start, end], for paragraphs that carry no per-word timing.
+ *
+ * @param {string} text paragraph text
+ * @param {number} start paragraph start
+ * @param {number} end paragraph end
+ * @returns {object[]} words with id, start, end, offset, length and text
+ */
+const spreadWords = (text, start, end) => {
+  const tokens = text.split(" ");
+  const span = end - start;
+  const words = [];
+  let offset = 0;
+
+  tokens.forEach((token, index) => {
+    words.push({
+      id: generateID(),
+      start: start + Math.floor((index * span) / tokens.length),
+      end: start + Math.floor(((index + 1) * span) / tokens.length),
+      offset,
+      length: token.length,
+      text: token
+    });
+    offset += token.length + 1;
+  });
+
+  return words;
+};
+
+/**
+ * Converts parsed <p> elements into transcript paragraphs.
+ *
+ * @param {Array<object|string>} cues parsed <p> elements
+ * @returns {object[]} paragraphs with id, start, end, text and words
+ */
+const toParagraphs = cues => {
+  const timed = cues.filter(({ s, text }) => Boolean(s) || Boolean(text));
+
+  const paragraphs = timed.map(({ attrs = {}, s = [], text }) => {
+    const start = toInt(attrs.t);
+    const words = toTimedWords(s, start);
+
+    return {
+      id: generateID(),
+      start,
+      end: words.length > 0 ? words[words.length - 1].end : null,
+      text: text ? text : words.map(word => word.text).join(" "),
+      words
+    };
+  });
+
+  paragraphs.forEach((paragraph, index) => {
+    if (!paragraph.end) {
+      // Without word timings a paragraph lasts until the next one starts.
+      // The last paragraph has no successor, so fall back to its own `d`
+      // attribute (assumed to be a duration - TODO confirm) instead of
+      // leaving `end` null, which would yield nonsensical word timings below.
+      const next = paragraphs[index + 1];
+      paragraph.end = next
+        ? next.start
+        : paragraph.start + toInt((timed[index].attrs || {}).d);
+    }
+
+    if (paragraph.words.length === 0) {
+      paragraph.words = spreadWords(
+        paragraph.text,
+        paragraph.start,
+        paragraph.end
+      );
+    }
+  });
+
+  return paragraphs;
+};
+
+/**
+ * Converts one srv3 subtitle file from ./input into ./output/<file>.json.
+ *
+ * @param {string} file file name inside ./input
+ * @param {string} lang language code stored on the transcript
+ * @param {string} type "subs" or "auto-subs"; currently unused
+ * @returns {Promise<void>}
+ * @throws {Error} when the document has no <timedtext> root
+ */
+const convertSRV3 = async (file, lang, type) => {
+  const data = await xml2js.parseStringPromise(
+    fs.readFileSync(`./input/${file}`, { encoding: "utf-8" }),
+    {
+      attrkey: "attrs",
+      charkey: "text",
+      trim: true,
+      explicitArray: true
+    }
+  );
+
+  if (DEBUG) writeJSON("./debug", `${file}.parsed.json`, data);
+
+  if (!data || !data.timedtext) {
+    throw new Error(`${file}: no <timedtext> root element`);
+  }
+
+  // A missing or empty <body> yields no paragraphs instead of a TypeError.
+  const [body] = data.timedtext.body || [];
+  const paragraphs = toParagraphs((body && body.p) || []);
+
+  if (DEBUG) writeJSON("./debug", `${file}.paragraphs.json`, paragraphs);
+
+  writeJSON("./output", `${file}.json`, {
+    id: generateID(),
+    lang,
+    paragraphs
+  });
+};
+
+/**
+ * Converts every queued subtitle file, one after another.
+ *
+ * A failing file is reported and marks the process as failed, but does not
+ * stop the remaining files from being converted.
+ *
+ * @returns {Promise<void>}
+ */
+const main = async () => {
+  const queue = buildQueue();
+
+  console.log(JSON.stringify(queue, null, 2));
+
+  for (const { lang, type, ext, file } of queue) {
+    if (ext !== "srv3") {
+      console.warn(`Unknown format ${ext}`);
+      continue;
+    }
+
+    try {
+      await convertSRV3(file, lang, type);
+    } catch (error) {
+      console.error(`Failed to convert ${file}:`, error);
+      process.exitCode = 1;
+    }
+  }
+};
+
+main().catch(error => {
+  console.error(error);
+  process.exitCode = 1;
+});
diff --git a/yarn.lock b/yarn.lock
new file mode 100644
index 0000000..7271d35
--- /dev/null
+++ b/yarn.lock
@@ -0,0 +1,149 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1 + + +define-properties@^1.1.2, define-properties@^1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/define-properties/-/define-properties-1.1.3.tgz#cf88da6cbee26fe6db7094f61d870cbd84cee9f1" + integrity sha512-3MqfYKj2lLzdMSf8ZIZE/V+Zuy+BgD6f164e8K2w7dgnpKArBDerGYpM46IYYcjnkdPNMjPk9A6VFB8+3SKlXQ== + dependencies: + object-keys "^1.0.12" + +es-abstract@^1.5.1: + version "1.16.3" + resolved "https://registry.yarnpkg.com/es-abstract/-/es-abstract-1.16.3.tgz#52490d978f96ff9f89ec15b5cf244304a5bca161" + integrity sha512-WtY7Fx5LiOnSYgF5eg/1T+GONaGmpvpPdCpSnYij+U2gDTL0UPfWrhDw7b2IYb+9NQJsYpCA0wOQvZfsd6YwRw== + dependencies: + es-to-primitive "^1.2.1" + function-bind "^1.1.1" + has "^1.0.3" + has-symbols "^1.0.1" + is-callable "^1.1.4" + is-regex "^1.0.4" + object-inspect "^1.7.0" + object-keys "^1.1.1" + string.prototype.trimleft "^2.1.0" + string.prototype.trimright "^2.1.0" + +es-to-primitive@^1.2.1: + version "1.2.1" + resolved "https://registry.yarnpkg.com/es-to-primitive/-/es-to-primitive-1.2.1.tgz#e55cd4c9cdc188bcefb03b366c736323fc5c898a" + integrity sha512-QCOllgZJtaUo9miYBcLChTUaHNjJF3PYs1VidD7AwiEj1kYxKeQTctLAezAOH5ZKRH0g2IgPn6KwB4IT8iRpvA== + dependencies: + is-callable "^1.1.4" + is-date-object "^1.0.1" + is-symbol "^1.0.2" + +function-bind@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.1.tgz#a56899d3ea3c9bab874bb9773b7c5ede92f4895d" + integrity sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A== + +has-symbols@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/has-symbols/-/has-symbols-1.0.1.tgz#9f5214758a44196c406d9bd76cebf81ec2dd31e8" + integrity sha512-PLcsoqu++dmEIZB+6totNFKq/7Do+Z0u4oT0zKOJNl3lYK6vGwwu2hjHs+68OEZbTjiUE9bgOABXbP/GvrS0Kg== + +has@^1.0.1, has@^1.0.3: + version "1.0.3" + resolved "https://registry.yarnpkg.com/has/-/has-1.0.3.tgz#722d7cbfc1f6aa8241f16dd814e011e1f41e8796" + integrity 
sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw== + dependencies: + function-bind "^1.1.1" + +is-callable@^1.1.4: + version "1.1.4" + resolved "https://registry.yarnpkg.com/is-callable/-/is-callable-1.1.4.tgz#1e1adf219e1eeb684d691f9d6a05ff0d30a24d75" + integrity sha512-r5p9sxJjYnArLjObpjA4xu5EKI3CuKHkJXMhT7kwbpUyIFD1n5PMAsoPvWnvtZiNz7LjkYDRZhd7FlI0eMijEA== + +is-date-object@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/is-date-object/-/is-date-object-1.0.1.tgz#9aa20eb6aeebbff77fbd33e74ca01b33581d3a16" + integrity sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY= + +is-regex@^1.0.4: + version "1.0.4" + resolved "https://registry.yarnpkg.com/is-regex/-/is-regex-1.0.4.tgz#5517489b547091b0930e095654ced25ee97e9491" + integrity sha1-VRdIm1RwkbCTDglWVM7SXul+lJE= + dependencies: + has "^1.0.1" + +is-symbol@^1.0.2: + version "1.0.3" + resolved "https://registry.yarnpkg.com/is-symbol/-/is-symbol-1.0.3.tgz#38e1014b9e6329be0de9d24a414fd7441ec61937" + integrity sha512-OwijhaRSgqvhm/0ZdAcXNZt9lYdKFpcRDT5ULUuYXPoT794UNOdU+gpT6Rzo7b4V2HUl/op6GqY894AZwv9faQ== + dependencies: + has-symbols "^1.0.1" + +nanoid@^2.1.0: + version "2.1.7" + resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-2.1.7.tgz#d775e3e7c6470bbaaae3da9a647a80e228e0abf7" + integrity sha512-fmS3qwDldm4bE01HCIRqNk+f255CNjnAoeV3Zzzv0KemObHKqYgirVaZA9DtKcjogicWjYcHkJs4D5A8CjnuVQ== + +object-inspect@^1.7.0: + version "1.7.0" + resolved "https://registry.yarnpkg.com/object-inspect/-/object-inspect-1.7.0.tgz#f4f6bd181ad77f006b5ece60bd0b6f398ff74a67" + integrity sha512-a7pEHdh1xKIAgTySUGgLMx/xwDZskN1Ud6egYYN3EdRW4ZMPNEDUTF+hwy2LUC+Bl+SyLXANnwz/jyh/qutKUw== + +object-keys@^1.0.12, object-keys@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/object-keys/-/object-keys-1.1.1.tgz#1c47f272df277f3b1daf061677d9c82e2322c60e" + integrity sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA== + 
+object.getownpropertydescriptors@^2.0.3: + version "2.0.3" + resolved "https://registry.yarnpkg.com/object.getownpropertydescriptors/-/object.getownpropertydescriptors-2.0.3.tgz#8758c846f5b407adab0f236e0986f14b051caa16" + integrity sha1-h1jIRvW0B62rDyNuCYbxSwUcqhY= + dependencies: + define-properties "^1.1.2" + es-abstract "^1.5.1" + +sax@>=0.6.0: + version "1.2.4" + resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" + integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== + +shortid@^2.2.15: + version "2.2.15" + resolved "https://registry.yarnpkg.com/shortid/-/shortid-2.2.15.tgz#2b902eaa93a69b11120373cd42a1f1fe4437c122" + integrity sha512-5EaCy2mx2Jgc/Fdn9uuDuNIIfWBpzY4XIlhoqtXF6qsf+/+SGZ+FxDdX/ZsMZiWupIWNqAEmiNY4RC+LSmCeOw== + dependencies: + nanoid "^2.1.0" + +string.prototype.trimleft@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/string.prototype.trimleft/-/string.prototype.trimleft-2.1.0.tgz#6cc47f0d7eb8d62b0f3701611715a3954591d634" + integrity sha512-FJ6b7EgdKxxbDxc79cOlok6Afd++TTs5szo+zJTUyow3ycrRfJVE2pq3vcN53XexvKZu/DJMDfeI/qMiZTrjTw== + dependencies: + define-properties "^1.1.3" + function-bind "^1.1.1" + +string.prototype.trimright@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/string.prototype.trimright/-/string.prototype.trimright-2.1.0.tgz#669d164be9df9b6f7559fa8e89945b168a5a6c58" + integrity sha512-fXZTSV55dNBwv16uw+hh5jkghxSnc5oHq+5K/gXgizHwAvMetdAJlHqqoFC1FSDVPYWLkAKl2cxpUT41sV7nSg== + dependencies: + define-properties "^1.1.3" + function-bind "^1.1.1" + +util.promisify@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/util.promisify/-/util.promisify-1.0.0.tgz#440f7165a459c9a16dc145eb8e72f35687097030" + integrity sha512-i+6qA2MPhvoKLuxnJNpXAGhg7HphQOSUq2LKMZD0m15EiskXUkMvKdF4Uui0WYeCUGea+o2cw/ZuwehtfsrNkA== + dependencies: + define-properties "^1.1.2" + object.getownpropertydescriptors 
"^2.0.3" + +xml2js@^0.4.22: + version "0.4.22" + resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.4.22.tgz#4fa2d846ec803237de86f30aa9b5f70b6600de02" + integrity sha512-MWTbxAQqclRSTnehWWe5nMKzI3VmJ8ltiJEco8akcC6j3miOhjjfzKum5sId+CWhfxdOs/1xauYr8/ZDBtQiRw== + dependencies: + sax ">=0.6.0" + util.promisify "~1.0.0" + xmlbuilder "~11.0.0" + +xmlbuilder@~11.0.0: + version "11.0.1" + resolved "https://registry.yarnpkg.com/xmlbuilder/-/xmlbuilder-11.0.1.tgz#be9bae1c8a046e76b31127726347d0ad7002beb3" + integrity sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==