Replace File.exists? (deprecated, removed in Ruby 3.2) with File.exist?, add
qpdf as the preferred page splitter, and bound the OCR "gm convert" calls with
a timeout.

Review notes:
- Whitespace inside the hunks was reconstructed with two-space Ruby
  indentation; re-check context lines against the target tree before applying.
- The last text_extractor.rb hunk header counts only the lines present here.
- The OCR output check uses the unescaped `tiff` path: `escaped_tiff` is the
  ESCAPE[...] form meant for the shell command line, not for File.exist?.
- MAGICK_TMPDIR is kept next to MAGICK_TEMPORARY_PATH because the commands
  still invoke `gm`; NOTE(review): confirm which variable your gm build reads.
- NOTE(review): qpdf --split-pages appears to zero-pad the %d page number;
  confirm that consumers of "<name>_<n>.pdf" tolerate that.

diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 1c49e91..eadc96b 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -16,7 +16,16 @@ module Docsplit
 
   GM_FORMATS = ["image/gif", "image/jpeg", "image/png", "image/x-ms-bmp", "image/svg+xml", "image/tiff", "image/x-portable-bitmap", "application/postscript", "image/x-portable-pixmap"]
 
-  DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :pdftailor => false, :tesseract => false, :osd => false}
+  DEPENDENCIES = {
+    :java => false,
+    :gm => false,
+    :qpdf => false,
+    :pdftotext => false,
+    :pdftk => false,
+    :pdftailor => false,
+    :tesseract => false,
+    :osd => false
+  }
 
   # Check for all dependencies, and note their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 8c29bbc..8c1dc4d 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -33,22 +33,22 @@ def convert(pdf, size, format, previous=nil)
       directory = directory_for(size)
       pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
-      FileUtils.mkdir_p(directory) unless File.exists?(directory)
+      FileUtils.mkdir_p(directory) unless File.exist?(directory)
       common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
+        result = `MAGICK_TMPDIR=#{tempdir} MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
           out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
+          cmd = "MAGICK_TMPDIR=#{tempdir} MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end
 
 
diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
index 145c980..6d5d959 100644
--- a/lib/docsplit/page_extractor.rb
+++ b/lib/docsplit/page_extractor.rb
@@ -10,15 +10,17 @@ def extract(pdfs, opts)
       [pdfs].flatten.each do |pdf|
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = ESCAPE[File.join(@output, "#{pdf_name}")] + "_%d.pdf"
-        FileUtils.mkdir_p @output unless File.exists?(@output)
+        FileUtils.mkdir_p @output unless File.exist?(@output)
 
-        cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability
+        cmd = if DEPENDENCIES[:qpdf] # prefer qpdf, then pdftailor; keep pdftk for backwards compatibility
+          "qpdf --split-pages #{ESCAPE[pdf]} #{page_path} 2>&1"
+        elsif DEPENDENCIES[:pdftailor] # second choice when qpdf is not installed
           "pdftailor unstitch --output #{page_path} #{ESCAPE[pdf]} 2>&1"
         else
           "pdftk #{ESCAPE[pdf]} burst output #{page_path} 2>&1"
         end
         result = `#{cmd}`.chomp
-        FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
+        FileUtils.rm('doc_data.txt') if File.exist?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0
         result
       end
diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb
index 21861e2..86ce98b 100644
--- a/lib/docsplit/pdf_extractor.rb
+++ b/lib/docsplit/pdf_extractor.rb
@@ -78,7 +78,7 @@ def office_executable
       # raise an error if that path isn't valid, otherwise, add
       # it to the front of our search paths.
       if ENV['OFFICE_PATH']
-        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
+        raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exist? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
 
@@ -95,11 +95,11 @@ def office_executable
       # Search for the first suitable office executable
       # and short circuit an executable is found.
       paths.each do |path|
-        if File.exists? path
+        if File.exist? path
           @@executable ||= path unless File.directory? path
           path_pieces.each do |pieces|
             check_path = File.join(path, pieces)
-            @@executable ||= check_path if File.exists? check_path
+            @@executable ||= check_path if File.exist? check_path
           end
         end
         break if @@executable
@@ -116,7 +116,7 @@ def office_path
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
-      FileUtils.mkdir_p out unless File.exists?(out)
+      FileUtils.mkdir_p out unless File.exist?(out)
       [docs].flatten.each do |doc|
         ext = File.extname(doc)
         basename = File.basename(doc, ext)
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 985abdd..d03e3e7 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -21,6 +21,8 @@ class TextExtractor
 
     MIN_TEXT_PER_PAGE = 100 # in bytes
 
+    TIMEOUT = '5m'
+
     def initialize
       @pages_to_ocr = []
     end
@@ -28,7 +30,7 @@ def initialize
     # Extract text from a list of PDFs.
     def extract(pdfs, opts)
       extract_options opts
-      FileUtils.mkdir_p @output unless File.exists?(@output)
+      FileUtils.mkdir_p @output unless File.exist?(@output)
       [pdfs].flatten.each do |pdf|
         @pdf_name = File.basename(pdf, File.extname(pdf))
         pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
@@ -46,7 +48,7 @@ def extract(pdfs, opts)
     # Does a PDF have any text embedded?
     def contains_text?(pdf)
       fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
-      !fonts.match(NO_TEXT_DETECTED)
+      !fonts.scrub.match(NO_TEXT_DETECTED)
     end
 
     # Extract a page range worth of text from a PDF, directly.
@@ -66,7 +68,8 @@ def extract_from_ocr(pdf, pages)
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          run "MAGICK_TMPDIR=#{tempdir} MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          raise Docsplit::ExtractionFailed, "gm convert produced no output at #{tiff}" unless File.exist?(tiff)
           run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
@@ -74,11 +77,12 @@ def extract_from_ocr(pdf, pages)
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run "MAGICK_TMPDIR=#{tempdir} MAGICK_TEMPORARY_PATH=#{tempdir} OMP_NUM_THREADS=2 timeout #{TIMEOUT} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        raise Docsplit::ExtractionFailed, "gm convert produced no output at #{tiff}" unless File.exist?(tiff)
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
         run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
-      FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
+      FileUtils.remove_entry_secure tempdir if File.exist?(tempdir)
     end