Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .babelrc
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"presets": [
["env", { "targets": { "node": "6" } }],
"flow",
["env", { "targets": { "node": "22" } }],
"stage-0"
]
}
11 changes: 0 additions & 11 deletions .flowconfig

This file was deleted.

7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
node_modules
dist
.vscode

*.log
.DS_Store

# CLI output
*.captions.json

bun.lock
pnpm-lock.yaml
32 changes: 32 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,35 @@
# v2.2.2

* replaced `he` with a custom HTML-entity decoder for smaller package size
* moved test demos into their own folder

# v2.2.1

* removed eslint

# v2.2.0

* added upstream change; allow JS `fetch` to be used if available
* removed flow & prettier, updated other deps
* removed ava test; `dist-demo.js` has a default test

# v2.1.0

* fixed YouTube API URL

# v2.0.0

* BREAKING CHANGE: output includes videoID, language code, & datetime stamp of when the script was run
* BREAKING CHANGE: timed text output has shortened key names for **s**tart, **d**uration, & **t**ext to save a few k.
* added CLI script: `node cli VIDEO-ID` to run, outputs in home dir as `VIDEO-ID.captions.json`.
* dropped lodash
* updated axios

# v1.1.0

* leave most HTML tags except `<font>` which is usually spammy.


# v1.0.1

* strip HTML tags from captions
50 changes: 40 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@

## Installation

* `> npm install -S youtube-captions-scraper` OR
* `> yarn add youtube-captions-scraper`
* `npm install -S youtube-captions-scraper` OR
* `yarn add youtube-captions-scraper`

## Usage

```js
// ES6 / TypeScript
import { getSubtitles } from 'youtube-captions-scraper';
import { getSubtitles } from 'youtube-captions-scraper'

getSubtitles({
videoID: 'XXXXX', // youtube video id
lang: 'fr' // default: `en`
}).then(captions => {
console.log(captions);
console.log(captions)
});

// ES5
Expand All @@ -31,12 +31,42 @@ getSubtitles({
});
```

Captions will be an array of object of this format:
### Output

**v2 change**: Captions will be an object of this format:

```js
{
"start": Number,
"dur": Number,
"text": String
}
"videoID": "9W0Dy1nM-zU",
"lang": "en",
"datePull": 1587948820739,
"timedtext": [
{
"s": 11.5,
"d": 4,
"t": "<b>Bold</b>"
},
etc...
```
Key:
* videoID = YouTube's unique id after `v=` in the URL query string
* lang = language code of captions
* datePull = [millisecond timestamp](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Date/now) of when the captions were pulled & processed
* timedtext array = **s**tart time, **d**uration, & **t**ext


### CLI

```bash
node cli VIDEO-ID
```
Writes `VIDEO-ID.captions.json` into the directory that contains `cli.js`.
The `.gitignore` prevents you from saving the output in your git repo.

### Demo

```bash
npm run demo
or
bun run demo
```
Output should be JSON, with some 'text lines' (`t`) including some minor HTML markup, but no `<font>` tags.
34 changes: 34 additions & 0 deletions cli.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// CLI entry point: fetch the English captions for one YouTube video and save
// them as `<VIDEO-ID>.captions.json` in the directory that contains this script.
//
// Usage:
//   node cli VIDEO-ID
//   node cli -h | --help
const fs = require('fs')
const path = require('path')
const getSubtitles = require('./dist').getSubtitles

// First positional argument: the YouTube video ID (or a help flag).
const arg = process.argv[2]

// Throw a real Error (not a bare string) so the failure carries a stack trace.
if (!arg) throw new Error('Needs input data!')

switch (arg) {
  case '-h':
  case '--help':
    console.log(`First arg: YouTube video ID number
Output will be 'ARG.captions.json'
`)
    break

  default: {
    // The argument becomes part of the output file name, so refuse anything
    // that is not a plain ID token (letters, digits, `_`, `-`). This keeps
    // path separators and `..` from steering the write outside this directory.
    if (!/^[\w-]+$/.test(arg)) {
      throw new Error(`Invalid YouTube video ID: ${arg}`)
    }

    console.log(`Attempting to grab captions for ` + arg)

    getSubtitles({
      videoID: arg,
      lang: 'en' // default: `en`
    })
      .then(captions => {
        console.log(captions)

        const outputPath = path.join(__dirname, `${arg}.captions.json`)
        console.log(`saving to: ` + outputPath)
        fs.writeFileSync(outputPath, JSON.stringify(captions))
      })
      .catch(err => {
        // Report fetch/write failures and signal them through the exit code
        // instead of leaving an unhandled promise rejection.
        console.error(`Failed to grab captions for ${arg}:`, err)
        process.exitCode = 1
      })
  }
}
70 changes: 70 additions & 0 deletions demos/dist-demo.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Build first with `yarn build` (or `npm run build`), then run with `node dist-demo`.
const { getSubtitles } = require('../dist/index.js');

// Sample request: the video ID whose expected captions are listed below.
const demoRequest = {
  videoID: '9W0Dy1nM-zU',
  lang: 'en', // default: `en`
};

getSubtitles(demoRequest).then((captions) => {
  console.log(captions);
});

/*
You should see this:

[ { start: '11.5', dur: '4', text: '<b>Bold</b>' },
{ start: '11.5', dur: '4', text: '<i>Italic</i>' },
{ start: '11.5', dur: '4', text: '<u>Underline</u>' },
{ start: '11.5',
dur: '4',
text: '<b>Bold </b><i>Italic </i><u>Underline</u>' },
{ start: '15.5', dur: '4', text: 'Red (a = 40)' },
{ start: '15.5', dur: '4', text: 'Green (a = 127)' },
{ start: '15.5', dur: '4', text: 'Blue (a = 255)' },
{ start: '15.5', dur: '4', text: 'Red Green Blue' },
{ start: '19.5', dur: '4', text: 'Red (a = 40)' },
{ start: '19.5', dur: '4', text: 'Green (a = 127)' },
{ start: '19.5', dur: '4', text: 'Blue (a = 255)' },
{ start: '19.5', dur: '4', text: 'Opaque' },
{ start: '19.5', dur: '4', text: 'Red Green Blue' },
{ start: '23.5', dur: '4', text: 'Edge type 1' },
{ start: '23.5', dur: '4', text: 'Edge type 2' },
{ start: '23.5', dur: '4', text: 'Edge type 3' },
{ start: '23.5', dur: '4', text: 'Edge type 4' },
{ start: '23.5', dur: '4', text: 'One Two Three Four' },
{ start: '27.5', dur: '6', text: 'Font 0' },
{ start: '27.5', dur: '6', text: 'Font 1' },
{ start: '27.5', dur: '6', text: 'Font 2' },
{ start: '27.5', dur: '6', text: 'Font 3' },
{ start: '27.5', dur: '6', text: 'Font 4' },
{ start: '27.5', dur: '6', text: 'Font 5' },
{ start: '27.5', dur: '6', text: 'Font 6' },
{ start: '27.5', dur: '6', text: 'Font 7' },
{ start: '33.5',
dur: '4',
text: 'Zero One Two Three Four Five Six Seven' },
{ start: '37.5', dur: '4', text: '30%' },
{ start: '37.5', dur: '4', text: '100%' },
{ start: '37.5', dur: '4', text: '300%' },
{ start: '37.5', dur: '4', text: '30% 100% 300%' },
{ start: '41.5', dur: '4', text: 'Top left' },
{ start: '41.5', dur: '4', text: 'Top center' },
{ start: '41.5', dur: '4', text: 'Top right' },
{ start: '41.5', dur: '4', text: 'Middle left' },
{ start: '41.5', dur: '4', text: 'Middle center' },
{ start: '41.5', dur: '4', text: 'Middle right' },
{ start: '41.5', dur: '4', text: 'Bottom left' },
{ start: '41.5', dur: '4', text: 'Bottom center' },
{ start: '41.5', dur: '4', text: 'Bottom right' },
{ start: '45.5', dur: '4', text: 'Left-\naligned line' },
{ start: '45.5', dur: '4', text: 'Centered\nline' },
{ start: '45.5', dur: '4', text: 'Right-\naligned line' },
{ start: '49.5', dur: '4', text: 'Karaoke' },
{ start: '53.5', dur: '2', text: 'Line break\nat start of span' },
{ start: '53.5',
dur: '2',
text: 'Line break\nin middle of span' },
{ start: '53.5', dur: '2', text: 'Line break\nat end of span' } ]

other youtube video ids:
ziGD7vQOwl8 yjhANyrKpv8 TomOQYxFnrU nm_xCuQ5Szw rmeMDahEi1Q Je8FjYtYDf0 w82a1FT5o88 mrX2eH_FL4E FlTG0UXRAkE fn-Qb-YQqo4 t6bbuDUPIgk t6bbuDUPIgk yUx_HI8D-4w
*/
20 changes: 20 additions & 0 deletions demos/unescapeHTMLentities.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
 * Decode a fixed set of HTML entities back into their literal characters.
 *
 * Only the named and decimal forms of `'`, `"`, `<`, `>` and `&` are handled;
 * any other entity is left untouched. Decoding is a single pass, so the output
 * of one replacement is never decoded again (`&amp;lt;` becomes `&lt;`).
 *
 * @param {string} str - Text that may contain escaped HTML entities.
 * @returns {string} The text with the supported entities replaced.
 */
export const unescapeHTMLentities = (str) => {
  const entityToChar = {
    '&apos;': "'",
    '&#39;': "'",
    '&quot;': '"',
    '&#34;': '"',
    '&lt;': '<',
    '&#60;': '<',
    '&gt;': '>',
    '&#62;': '>',
    '&amp;': '&',
    '&#38;': '&',
  }
  const entityPattern = /&(?:apos|#39|quot|#34|lt|#60|gt|#62|amp|#38);/g

  return str.replace(entityPattern, (entity) => entityToChar[entity])
}

// Quick demo: print the escaped sample first, then its decoded form.
const escapedSample = '&#60;div title=&#34;text&quot;&#62;1 &amp; &apos;2&#39;&lt;/div&gt;'
console.log(escapedSample)
console.log(unescapeHTMLentities(escapedSample))
// <div title="text">1 & '2'</div>
54 changes: 17 additions & 37 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,56 +1,36 @@
{
"name": "youtube-captions-scraper",
"version": "1.0.3",
"description": "Scrap youtube auto-generated captions",
"version": "2.2.2",
"description": "Scrape YouTube auto-generated captions",
"main": "dist/index.js",
"author": {
"name": "Algolia, Inc.",
"url": "https://www.algolia.com"
},
"contributors": [
{
"name": "tom byrer",
"email": "[email protected]",
"url": "https://github.com/tomByrer"
}
],
"repository": {
"type": "git",
"url": "https://github.com/algolia/youtube-captions-scraper.git"
},
"homepage": "https://github.com/algolia/youtube-captions-scraper",
"bugs": {
"url": "https://github.com/algolia/youtube-captions-scraper/issues"
"url": "https://github.com/tomByrer/youtube-captions-scraper"
},
"homepage": "https://github.com/tomByrer/youtube-captions-scraper",
"license": "MIT",
"scripts": {
"build": "rm -rf dist && babel src -d dist",
"prepublishOnly": "npm run build",
"lint": "eslint src",
"test": "ava",
"flow": "flow"
"demo": "node ./demos/dist-demo.js"
},
"dependencies": {
"axios": "^1.9.0"

Check warning on line 28 in package.json

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

package.json#L28

Package dependencies with variant versions may lead to dependency hijack and confusion attacks.
},
"devDependencies": {
"ava": "^0.25.0",
"babel-cli": "^6.26.0",
"babel-eslint": "^8.0.2",
"babel-preset-env": "^1.6.1",
"babel-preset-flow": "^6.23.0",
"babel-preset-env": "^1.7.0",

Check warning on line 32 in package.json

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

package.json#L32

Package dependencies with variant versions may lead to dependency hijack and confusion attacks.
"babel-preset-stage-0": "^6.24.1",
"babel-watch": "^2.0.7",
"eslint": "^4.11.0",
"eslint-config-algolia": "^12.0.0",
"eslint-config-prettier": "^2.8.0",
"eslint-plugin-flowtype": "^2.39.1",
"eslint-plugin-import": "^2.8.0",
"eslint-plugin-prettier": "^2.3.1",
"flow-bin": "^0.59.0",
"flow-typed": "^2.2.3",
"prettier": "^1.8.2"
},
"dependencies": {
"axios": "^0.17.1",
"he": "^1.1.1",
"lodash": "^4.17.4",
"striptags": "^3.1.0"
},
"ava": {
"babel": "inherit",
"require": [
"babel-register"
]
"babel-watch": "^7.8.1"
}
}
Loading