Skip to content
Open

zcv b #146

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions extension/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
"tabs",
"webNavigation"
],
"web_accessible_resources": [
"pdf-worker/pdf.worker.min.js"
],
"chrome_url_overrides" : {
"newtab": "/overview/overview.html"
},
Expand Down
5 changes: 5 additions & 0 deletions gulpfile.babel.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ const files = [
destination: './extension/options',
cssOutput: 'style.css',
},
{
entries: ['./src/pdf.worker.min.js'],
output: 'pdf.worker.min.js',
destination: './extension/pdf-worker',
},
]

const browserifySettings = {
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"lodash": "^4.17.4",
"moment": "^2.18.1",
"page-metadata-parser": "git://github.com/Treora/page-metadata-parser.git#npm-testable",
"pdfjs-dist": "^1.8.177",
"pouchdb-browser": "^6.1.0",
"pouchdb-find": "^0.10.4",
"pouchdb-quick-search": "^1.2.0",
Expand Down
24 changes: 10 additions & 14 deletions src/page-analysis/background/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,15 @@ import { revisePageFields } from '..'
import getFavIcon from './get-fav-icon'
import makeScreenshot from './make-screenshot'


// Extract interesting stuff from the current page and store it.
async function performPageAnalysis({pageId, tabId}) {
// Run these functions in the content script in the tab.
const extractPageText = remoteFunction('extractPageText', {tabId})
const extractPageMetadata = remoteFunction('extractPageMetadata', {tabId})
// Run this function in the content script in the tab.
const extractPageData = remoteFunction('extractPageData', {tabId})

// A shorthand for updating a single field in a doc.
const setDocField = (db, docId, key) =>
value => db.upsert(docId, doc => assocPath(key, value)(doc))

// Get page title, author (if any), etcetera.
const storePageMetadata = extractPageMetadata().then(
setDocField(db, pageId, 'extractedMetadata')
)

// Get and store the fav-icon
const storeFavIcon = getFavIcon({tabId}).then(
setDocField(db, pageId, 'favIcon')
Expand All @@ -36,15 +29,18 @@ async function performPageAnalysis({pageId, tabId}) {
setDocField(db, pageId, 'screenshot')
)

// Extract the main text
const storePageText = extractPageText().then(
setDocField(db, pageId, 'extractedText')
// Extract the main text and the metadata with a single call to the content
// script, then store each part in its own field of the page doc.
// The metadata is stored under 'extractedMetadata', which is the field name
// the search index reads (searchableTextFields in src/page-analysis/index.js).
// Both upserts are returned, so storePageData only settles once the data has
// actually been written (or writing has failed).
const storePageData = extractPageData().then(
    ({pageText, pageMetaData}) => Promise.all([
        setDocField(db, pageId, 'extractedText')(pageText),
        setDocField(db, pageId, 'extractedMetadata')(pageMetaData),
    ])
)

// When every task has either completed or failed, update the search index.
await whenAllSettled([
storePageMetadata,
storePageText,
storePageData,
storeFavIcon,
storeScreenshot,
])
Expand Down
34 changes: 34 additions & 0 deletions src/page-analysis/content_script/extract-page-data.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import { getMetadata, metadataRules } from 'page-metadata-parser'
import extractPdfData from './extract-pdf-data'
import getText from './extract-page-data'

// Tell whether a URL looks like it points at a PDF, judging by its extension.
// The query string and the fragment are ignored and the comparison is
// case-insensitive, so 'doc.pdf#page=2', 'doc.pdf?dl=1' and 'DOC.PDF' all
// count. Any URL that ends in '.pdf' as a whole is accepted as well.
function isPdfUrl(url) {
    const endsWithPdf = text => text.toLowerCase().endsWith('.pdf')

    let path
    try {
        path = new URL(url).pathname
    } catch (err) {
        // Not an absolute URL: strip the query string and fragment by hand.
        path = url.split(/[?#]/)[0]
    }

    return endsWithPdf(path) || endsWithPdf(url)
}

// Extract the 'main text' and the metadata from web pages (esp. news article,
// blog post, ...) and from PDFs.
//
// Options (by default the globals window and document are used):
//   loc: location object handed to getText.
//   url: URL of the page; decides whether the page is treated as a PDF, and
//        is handed to getMetadata or extractPdfData.
//   doc: document handed to getText and getMetadata.
//
// Returns a promise (despite the 'Sync' in its name, this is an async
// function). For a web page it resolves to {pageText, pageMetaData}; for a
// PDF it resolves to whatever extractPdfData({url}) produces.
//
// NOTE(review): at the top of this file getText is imported from
// './extract-page-data', which is this module itself; it looks like it should
// come from './extract-page-text' -- confirm.
async function extractPageDataSync({
    loc = window.location,
    url = window.location.href,
    doc = document,
} = {}) {
    // PDFs have no DOM to analyse; hand them to the PDF extractor.
    if (isPdfUrl(url)) {
        return extractPdfData({url})
    }

    // Text content of the web page
    const pageText = getText(doc, loc)
    // Metadata of the web page
    const pageMetaData = getMetadata(doc, url, metadataRules)

    return { pageText, pageMetaData }
}

// Schedule the extraction with a zero-delay timeout and hand back a promise
// for its outcome. All arguments are passed on to extractPageDataSync.
export default function extractPageDataAsync(...args) {
    const scheduleExtraction = resolve => {
        window.setTimeout(() => resolve(extractPageDataSync(...args)), 0)
    }
    return new Promise(scheduleExtraction)
}
10 changes: 0 additions & 10 deletions src/page-analysis/content_script/extract-page-metadata.js

This file was deleted.

16 changes: 2 additions & 14 deletions src/page-analysis/content_script/extract-page-text.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
import Readability from 'readability'

// Extract the 'main text' from a web page (esp. news article, blog post, ...).
function extractPageTextSync({
// By default, use the globals window and document.
loc = window.location,
doc = document,
} = {}) {
export default function getText(doc, loc) {
const uri = {
spec: loc.href,
host: loc.host,
Expand All @@ -24,17 +19,10 @@ function extractPageTextSync({
// Bummer.
console.error('Readability (content extraction) crashed:', err)
}

return {
...article,
// Also return full text, as article may be empty or wrong.
bodyInnerText: doc.body.innerText,
}
}

// Wrap it in a promise.
export default function extractPageTextAsync(...args) {
return new Promise(function (resolve, reject) {
const run = () => resolve(extractPageTextSync(...args))
window.setTimeout(run, 0)
})
}
50 changes: 50 additions & 0 deletions src/page-analysis/content_script/extract-pdf-data.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Returns an Object containing PDF Text and MetaData
async function getDatafromBlob(blob) {
require('pdfjs-dist')

// workerSrc needs to be specified, PDFJS library uses
// Document.currentScript which is disallowed by content scripts
PDFJS.workerSrc = browser.extension.getURL('pdf-worker/pdf.worker.min.js')

// wait for document to load
const pdf = await PDFJS.getDocument(blob.target.result)

// [1..N] array for N pages
const pages = [...Array(pdf.pdfInfo.numPages + 1).keys()].slice(1)

// promises for page contents
const pageTextPromises = pages.map(async i => {
const page = await pdf.getPage(i)
// wait for object containing items array with text pieces
const pageItems = await page.getTextContent()
const pageText = pageItems.items.map(item => item.str).join(' ')
return pageText
})

// wait for all promises to be fulfilled
const pageTexts = await Promise.all(pageTextPromises)
const totalContent = pageTexts.join('\n')

// wait for metadata
const data = await pdf.getMetadata()

return {
pageText: { bodyInnerText: totalContent },
pageMetaData: data.info,
}
}

// Read a PDF and return a promise for its text and metadata.
//
//   url:  location of the PDF; only fetched when no blob is given.
//   blob: optional Blob holding the PDF's content.
//
// Resolves to the result of getDatafromBlob. Rejects when fetching, reading
// or getDatafromBlob fails.
export default async function extractPdfData({url, blob}) {
    // fetch blob if not given
    let pdfBlob = blob
    if (pdfBlob === undefined) {
        const response = await fetch(url)
        pdfBlob = await response.blob()
    }

    return new Promise((resolve, reject) => {
        const fileReader = new FileReader()
        // getDatafromBlob takes the load event, and reads the bytes from
        // its target.result.
        fileReader.onload = loadEvent => resolve(getDatafromBlob(loadEvent))
        // Without this handler a failed read would leave the promise pending
        // forever.
        fileReader.onerror = () => reject(fileReader.error)
        fileReader.readAsArrayBuffer(pdfBlob)
    })
}
6 changes: 2 additions & 4 deletions src/page-analysis/content_script/index.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import { makeRemotelyCallable } from 'src/util/webextensionRPC'

import extractPageText from './extract-page-text'
import extractPageMetadata from './extract-page-metadata'
import extractPageData from './extract-page-data'

makeRemotelyCallable({
extractPageText,
extractPageMetadata,
extractPageData,
})
4 changes: 4 additions & 0 deletions src/page-analysis/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
export const searchableTextFields = [
'title',
'extractedMetadata.title',
'extractedMetadata.Title',
'extractedMetadata.Author',
'extractedMetadata.Subject',
'extractedMetadata.Keywords',
'extractedText.excerpt',
'extractedText.textContent',
'extractedText.bodyInnerText',
Expand Down
2 changes: 2 additions & 0 deletions src/pdf.worker.min.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Shim file to import pdf.worker.min.js dependency
import 'pdfjs-dist/build/pdf.worker.min'