From 1f1ec939179d24407d0ea04824f95cad58017f15 Mon Sep 17 00:00:00 2001 From: Nate Todd Date: Thu, 18 Dec 2014 17:20:40 -0500 Subject: [PATCH 1/2] Add parallel processing to OCR text extraction Use GNU Parallel if installed to parallelize tesseract OCR on full document text extraction. If Parallel is not installed, use previous behavior. --- lib/docsplit.rb | 8 ++++---- lib/docsplit/text_extractor.rb | 18 +++++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 1c49e91..dad9d73 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -13,11 +13,11 @@ module Docsplit ESCAPED_ROOT = ESCAPE[ROOT] METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length] - - GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"] - DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false} + GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"] + DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false, :parallel => false} + # Check for all dependencies, and note their absence. 
dirs = ENV['PATH'].split(File::PATH_SEPARATOR) DEPENDENCIES.each_key do |dep| @@ -75,7 +75,7 @@ def self.extract_#{key}(pdfs, opts={}) end EOS end - + def self.extract_info(pdfs, opts={}) pdfs = ensure_pdfs(pdfs) InfoExtractor.new.extract_all(pdfs, opts) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 985abdd..e74d86a 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -72,18 +72,22 @@ def extract_from_ocr(pdf, pages) FileUtils.remove_entry_secure tiff end else - tiff = "#{tempdir}/#{@pdf_name}.tif" - escaped_tiff = ESCAPE[tiff] - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" - #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" - clean_text(base_path + '.txt') if @clean_ocr + if DEPENDENCIES[:parallel] + run "MAGICK_TMPDIR=#{tempdir} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{tempdir}/page_%d.tiff 2>&1" + run "parallel tesseract -l #{@language} #{psm} {} {.} ::: #{tempdir}/page_*.tiff 2>&1" + run "cat #{tempdir}/page_*.txt >'#{base_path}.txt' 2>&1" + else + tiff = "#{tempdir}/#{@pdf_name}.tif" + escaped_tiff = ESCAPE[tiff] + run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" + run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" + end + clean_text("#{base_path}.txt") if @clean_ocr end ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) end - private def clean_text(file) From 7427d08e916bbfedc0bcb3c4f502080933b9af91 Mon Sep 17 00:00:00 2001 From: Nate Todd Date: Thu, 18 Dec 2014 17:43:24 -0500 Subject: [PATCH 2/2] Add Parallel installation to documentation --- index.html | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/index.html 
b/index.html index be419ac..97f415e 100755 --- a/index.html +++ b/index.html @@ -159,6 +159,11 @@

Installation & Dependencies

aptitude install libreoffice
On the Mac, download and install the latest release. +
  • + (Optional) Install GNU Parallel:
    + [aptitude | port | brew] install parallel
    + GNU Parallel speeds up OCR text extraction by running tesseract on a document's pages concurrently. +