diff --git a/extension/manifest.json b/extension/manifest.json index 7fc6c806e..9775f8529 100644 --- a/extension/manifest.json +++ b/extension/manifest.json @@ -31,6 +31,9 @@ "tabs", "webNavigation" ], + "web_accessible_resources": [ + "pdf-worker/pdf.worker.min.js" + ], "chrome_url_overrides" : { "newtab": "/overview/overview.html" }, diff --git a/gulpfile.babel.js b/gulpfile.babel.js index 14a71637a..c775466a1 100644 --- a/gulpfile.babel.js +++ b/gulpfile.babel.js @@ -35,6 +35,11 @@ const files = [ destination: './extension/options', cssOutput: 'style.css', }, + { + entries: ['./src/pdf.worker.min.js'], + output: 'pdf.worker.min.js', + destination: './extension/pdf-worker', + }, ] const browserifySettings = { diff --git a/package.json b/package.json index 560a793a5..7e26832cd 100644 --- a/package.json +++ b/package.json @@ -24,6 +24,7 @@ "lodash": "^4.17.4", "moment": "^2.18.1", "page-metadata-parser": "git://github.com/Treora/page-metadata-parser.git#npm-testable", + "pdfjs-dist": "^1.8.177", "pouchdb-browser": "^6.1.0", "pouchdb-find": "^0.10.4", "pouchdb-quick-search": "^1.2.0", diff --git a/src/page-analysis/background/index.js b/src/page-analysis/background/index.js index cf119447f..bc1bca114 100644 --- a/src/page-analysis/background/index.js +++ b/src/page-analysis/background/index.js @@ -10,22 +10,15 @@ import { revisePageFields } from '..' import getFavIcon from './get-fav-icon' import makeScreenshot from './make-screenshot' - // Extract interesting stuff from the current page and store it. async function performPageAnalysis({pageId, tabId}) { - // Run these functions in the content script in the tab. - const extractPageText = remoteFunction('extractPageText', {tabId}) - const extractPageMetadata = remoteFunction('extractPageMetadata', {tabId}) + // Run this function in the content script in the tab. + const extractPageData = remoteFunction('extractPageData', {tabId}) // A shorthand for updating a single field in a doc. 
const setDocField = (db, docId, key) => value => db.upsert(docId, doc => assocPath(key, value)(doc)) - // Get page title, author (if any), etcetera. - const storePageMetadata = extractPageMetadata().then( - setDocField(db, pageId, 'extractedMetadata') - ) - // Get and store the fav-icon const storeFavIcon = getFavIcon({tabId}).then( setDocField(db, pageId, 'favIcon') @@ -36,15 +29,18 @@ async function performPageAnalysis({pageId, tabId}) { setDocField(db, pageId, 'screenshot') ) - // Extract the main text - const storePageText = extractPageText().then( - setDocField(db, pageId, 'extractedText') + // Extract the Main text and Metadata + const storePageData = extractPageData().then( + (val) => Promise.all([ + // Return the upserts so whenAllSettled waits for both writes. + setDocField(db, pageId, 'extractedText')(val.pageText), + setDocField(db, pageId, 'extractedMetadata')(val.pageMetaData), + ]) ) // When every task has either completed or failed, update the search index. await whenAllSettled([ - storePageMetadata, - storePageText, + storePageData, storeFavIcon, storeScreenshot, ]) diff --git a/src/page-analysis/content_script/extract-page-data.js b/src/page-analysis/content_script/extract-page-data.js new file mode 100644 index 000000000..df11083a3 --- /dev/null +++ b/src/page-analysis/content_script/extract-page-data.js @@ -0,0 +1,34 @@ +import { getMetadata, metadataRules } from 'page-metadata-parser' +import extractPdfData from './extract-pdf-data' +import getText from './extract-page-text' + +// Extract the 'main text' from web pages (esp. news article, blog post, ...) and PDFs. +async function extractPageDataSync({ + // By default, use the globals window and document. 
+ loc = window.location, + url = window.location.href, + doc = document, +} = {}) { + // Check URL for PDF + if (url.endsWith('.pdf')) { + return extractPdfData({url}) + } + + // Text content in web page + const pageText = getText(doc, loc) + // MetaData of web page + const pageMetadata = getMetadata(doc, url, metadataRules) + + return { + pageText: pageText, + pageMetaData: pageMetadata, + } +} + +// Wrap it in a promise. +export default function extractPageDataAsync(...args) { + return new Promise(function (resolve, reject) { + const run = () => resolve(extractPageDataSync(...args)) + window.setTimeout(run, 0) + }) +} diff --git a/src/page-analysis/content_script/extract-page-metadata.js b/src/page-analysis/content_script/extract-page-metadata.js deleted file mode 100644 index 505a7ebeb..000000000 --- a/src/page-analysis/content_script/extract-page-metadata.js +++ /dev/null @@ -1,10 +0,0 @@ -import { getMetadata, metadataRules } from 'page-metadata-parser' - -// Extract info from all sorts of meta tags (og, twitter, etc.) -export default function extractPageMetadata({ - doc = document, - url = window.location.href, -} = {}) { - const pageMetadata = getMetadata(doc, url, metadataRules) - return pageMetadata -} diff --git a/src/page-analysis/content_script/extract-page-text.js b/src/page-analysis/content_script/extract-page-text.js index e35b0f609..6953ae3b8 100644 --- a/src/page-analysis/content_script/extract-page-text.js +++ b/src/page-analysis/content_script/extract-page-text.js @@ -1,11 +1,6 @@ import Readability from 'readability' -// Extract the 'main text' from a web page (esp. news article, blog post, ...). -function extractPageTextSync({ - // By default, use the globals window and document. - loc = window.location, - doc = document, -} = {}) { +export default function getText(doc, loc) { const uri = { spec: loc.href, host: loc.host, @@ -24,17 +19,10 @@ function extractPageTextSync({ // Bummer. 
console.error('Readability (content extraction) crashed:', err) } + return { ...article, // Also return full text, as article may be empty or wrong. bodyInnerText: doc.body.innerText, } } - -// Wrap it in a promise. -export default function extractPageTextAsync(...args) { - return new Promise(function (resolve, reject) { - const run = () => resolve(extractPageTextSync(...args)) - window.setTimeout(run, 0) - }) -} diff --git a/src/page-analysis/content_script/extract-pdf-data.js b/src/page-analysis/content_script/extract-pdf-data.js new file mode 100644 index 000000000..ca5b19f8b --- /dev/null +++ b/src/page-analysis/content_script/extract-pdf-data.js @@ -0,0 +1,50 @@ +// Returns an Object containing PDF Text and MetaData, given a FileReader load event +async function getDatafromBlob(event) { + require('pdfjs-dist') + + // workerSrc needs to be specified, PDFJS library uses + // Document.currentScript which is disallowed by content scripts + PDFJS.workerSrc = browser.extension.getURL('pdf-worker/pdf.worker.min.js') + + // wait for document to load + const pdf = await PDFJS.getDocument(event.target.result) + + // [1..N] array for N pages + const pages = [...Array(pdf.pdfInfo.numPages + 1).keys()].slice(1) + + // promises for page contents + const pageTextPromises = pages.map(async i => { + const page = await pdf.getPage(i) + // wait for object containing items array with text pieces + const pageItems = await page.getTextContent() + const pageText = pageItems.items.map(item => item.str).join(' ') + return pageText + }) + + // wait for all promises to be fulfilled + const pageTexts = await Promise.all(pageTextPromises) + const totalContent = pageTexts.join('\n') + + // wait for metadata + const data = await pdf.getMetadata() + + return { + pageText: { bodyInnerText: totalContent }, + pageMetaData: data.info, + } +} + +// Return promise for PDF data +export default async function extractPdfData({url, blob}) { + // fetch blob if not given + if (blob === undefined) { + const response = await fetch(url) + 
blob = await response.blob() + } + + return new Promise(function (resolve, reject) { + const fileReader = new FileReader() + fileReader.onload = (event) => resolve(getDatafromBlob(event)) + fileReader.readAsArrayBuffer(blob) + }) +} diff --git a/src/page-analysis/content_script/index.js b/src/page-analysis/content_script/index.js index 5f6f94d06..03bd746d6 100644 --- a/src/page-analysis/content_script/index.js +++ b/src/page-analysis/content_script/index.js @@ -1,9 +1,7 @@ import { makeRemotelyCallable } from 'src/util/webextensionRPC' -import extractPageText from './extract-page-text' -import extractPageMetadata from './extract-page-metadata' +import extractPageData from './extract-page-data' makeRemotelyCallable({ - extractPageText, - extractPageMetadata, + extractPageData, }) diff --git a/src/page-analysis/index.js b/src/page-analysis/index.js index a606468d3..66a1f8c4d 100644 --- a/src/page-analysis/index.js +++ b/src/page-analysis/index.js @@ -4,6 +4,10 @@ export const searchableTextFields = [ 'title', 'extractedMetadata.title', + 'extractedMetadata.Title', + 'extractedMetadata.Author', + 'extractedMetadata.Subject', + 'extractedMetadata.Keywords', 'extractedText.excerpt', 'extractedText.textContent', 'extractedText.bodyInnerText', diff --git a/src/pdf.worker.min.js b/src/pdf.worker.min.js new file mode 100644 index 000000000..6581287ff --- /dev/null +++ b/src/pdf.worker.min.js @@ -0,0 +1,2 @@ +// Shim file to import pdf.worker.min.js dependency +import 'pdfjs-dist/build/pdf.worker.min'