From 1f1ec939179d24407d0ea04824f95cad58017f15 Mon Sep 17 00:00:00 2001
From: Nate Todd <nate@pixelauthorityllc.com>
Date: Thu, 18 Dec 2014 17:20:40 -0500
Subject: [PATCH 1/2] Add parallel processing to OCR text extraction

Use GNU Parallel, when it is installed, to run tesseract OCR on a document's pages in parallel during full-document text extraction. If Parallel is not installed, fall back to the previous single-process behavior.
---
 lib/docsplit.rb                |  8 ++++----
 lib/docsplit/text_extractor.rb | 18 +++++++++++-------
 2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 1c49e91..dad9d73 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -13,11 +13,11 @@ module Docsplit
   ESCAPED_ROOT  = ESCAPE[ROOT]
 
   METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
-  
-  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 
-  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
+  GM_FORMATS    = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 
+  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false, :parallel => false}
+  
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
   DEPENDENCIES.each_key do |dep|
@@ -75,7 +75,7 @@ def self.extract_#{key}(pdfs, opts={})
       end
     EOS
   end
-  
+
   def self.extract_info(pdfs, opts={})
     pdfs = ensure_pdfs(pdfs)
     InfoExtractor.new.extract_all(pdfs, opts)
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 985abdd..e74d86a 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -72,18 +72,22 @@ def extract_from_ocr(pdf, pages)
           FileUtils.remove_entry_secure tiff
         end
       else
-        tiff = "#{tempdir}/#{@pdf_name}.tif"
-        escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
-        clean_text(base_path + '.txt') if @clean_ocr
+        if DEPENDENCIES[:parallel]
+          run "MAGICK_TMPDIR=#{tempdir} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{tempdir}/page_%d.tiff 2>&1"
+          run "parallel tesseract -l #{@language} #{psm} {} {.} ::: #{tempdir}/page_*.tiff 2>&1"
+          run "cat #{tempdir}/page_*.txt >'#{base_path}.txt' 2>&1"
+        else
+          tiff = "#{tempdir}/#{@pdf_name}.tif"
+          escaped_tiff = ESCAPE[tiff]
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+          run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
+        end
+        clean_text("#{base_path}.txt") if @clean_ocr
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
 
-
     private
 
     def clean_text(file)

From 7427d08e916bbfedc0bcb3c4f502080933b9af91 Mon Sep 17 00:00:00 2001
From: Nate Todd <nate@pixelauthorityllc.com>
Date: Thu, 18 Dec 2014 17:43:24 -0500
Subject: [PATCH 2/2] Add Parallel installation to documentation

---
 index.html | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/index.html b/index.html
index be419ac..97f415e 100755
--- a/index.html
+++ b/index.html
@@ -159,6 +159,11 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
         <tt>aptitude install libreoffice</tt><br />
         On the Mac, download and install <a href="http://www.libreoffice.org/download">the latest release</a>.
       </li>
+      <li>
+        (Optional) Install <a href="http://www.gnu.org/software/parallel/">GNU Parallel</a>:<br />
+        <tt>[aptitude | port | brew] install parallel</tt><br />
+        When installed, Parallel speeds up OCR text extraction by processing a document's pages in parallel.
+      </li>
     </ol>
 
     <p><i>