diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 8d48500..aa6e7ce 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -116,4 +116,4 @@ def parse_options end -end \ No newline at end of file +end diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..f6ef086 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -13,13 +13,15 @@ class ImageExtractor def extract(pdfs, options) @pdfs = [pdfs].flatten extract_options(options) + images = [] @pdfs.each do |pdf| previous = nil @sizes.each_with_index do |size, i| - @formats.each {|format| convert(pdf, size, format, previous) } + images += @formats.map {|format| convert(pdf, size, format, previous) } previous = size if @rolling end end + return images.reject{|i| i.nil? or i.empty?}.flatten end # Convert a single PDF into page images at the specified size and format. @@ -32,20 +34,28 @@ def convert(pdf, size, format, previous=nil) basename = File.basename(pdf, File.extname(pdf)) directory = directory_for(size) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s - escaped_pdf = ESCAPE[pdf] + escaped_pdf = ESCAPE[pdf] FileUtils.mkdir_p(directory) unless File.exists?(directory) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" + image_paths = [] if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp - raise ExtractionFailed, result if $? != 0 + if $? != 0 + raise ExtractionFailed, result + end else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp result = `#{cmd}`.chomp - raise ExtractionFailed, result if $? != 0 + if $? != 0 + raise ExtractionFailed, result + else + image_paths << out_file + end end + return image_paths end ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..bcb2ba1 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -29,18 +29,23 @@ def initialize def extract(pdfs, opts) extract_options opts FileUtils.mkdir_p @output unless File.exists?(@output) + pdfs = Array(pdfs) + paths = [] [pdfs].flatten.each do |pdf| @pdf_name = File.basename(pdf, File.extname(pdf)) pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages - if @force_ocr || (!@forbid_ocr && !contains_text?(pdf)) - extract_from_ocr(pdf, pages) - else - extract_from_pdf(pdf, pages) - if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty? - extract_from_ocr(pdf, @pages_to_ocr) - end - end + return_value = if @force_ocr || (!@forbid_ocr && !contains_text?(pdf)) + extract_from_ocr(pdf, pages) + else + if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty? + extract_from_ocr(pdf, @pages_to_ocr) + else + extract_from_pdf(pdf, pages) + end + end + paths << return_value end + return paths.flatten.compact end # Does a PDF have any text embedded? @@ -52,7 +57,7 @@ def contains_text?(pdf) # Extract a page range worth of text from a PDF, directly. def extract_from_pdf(pdf, pages) return extract_full(pdf) unless pages - pages.each {|page| extract_page(pdf, page) } + pages.map {|page| extract_page(pdf, page) } end # Extract a page range worth of text from a PDF via OCR. @@ -60,6 +65,7 @@ def extract_from_ocr(pdf, pages) tempdir = Dir.mktmpdir base_path = File.join(@output, @pdf_name) escaped_pdf = ESCAPE[pdf] + paths = [] if pages pages.each do |page| tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" @@ -67,7 +73,9 @@ def extract_from_ocr(pdf, pages) file = "#{base_path}_#{page}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" - clean_text(file + '.txt') if @clean_ocr + file_name = file + '.txt' + paths << file_name + clean_text(file_name) if @clean_ocr FileUtils.remove_entry_secure tiff end else @@ -75,8 +83,11 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1" - clean_text(base_path + '.txt') if @clean_ocr + file_name = base_path + '.txt' + paths << file_name + clean_text(file_name) if @clean_ocr end + return paths ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) end @@ -104,6 +115,7 @@ def run(command) def extract_full(pdf) text_path = File.join(@output, "#{@pdf_name}.txt") run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + return text_path end # Extract the contents of a single page of text, directly, adding it to @@ -111,9 +123,11 @@ def extract_full(pdf) def extract_page(pdf, page) text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" + unless @forbid_ocr @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE end + return text_path end def extract_options(options) @@ -127,4 +141,4 @@ def extract_options(options) end -end \ No newline at end of file +end diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb index 08c0b52..65647c9 100755 --- a/test/unit/test_extract_images.rb +++ b/test/unit/test_extract_images.rb @@ -13,6 +13,19 @@ def test_image_formatting assert Dir["#{OUTPUT}/*.jpg"].length == 2 end + def test_return_value + return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT) + assert return_value.length == 1 + assert return_value.is_a?(Enumerable) + assert return_value.all?{|el| el =~ /\.gif/} + return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "50x", :pages => 2, :output => OUTPUT) + assert return_value.length == 2 + assert return_value.is_a?(Enumerable) + assert return_value.any?{|el| el =~ /\.gif/} + assert return_value.any?{|el| el =~ /\.jpg/} + end + + def test_page_ranges Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT) assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"] diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 69ccb5a..ffb089e 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -4,9 +4,12 @@ class ExtractTextTest < Test::Unit::TestCase def test_paged_extraction - Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT) + return_value = Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 2 assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America") + assert return_value.is_a?(Enumerable) + assert return_value.all?{|val| val =~ /\.txt/} + assert return_value.length == 2 end def test_page_only_extraction @@ -24,19 +27,25 @@ def test_capitalized_pdf_extraction end def test_unicode_extraction - Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT) + return_value = Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 3 + assert return_value.is_a?(Enumerable) + assert return_value.all?{|val| val =~ /\.txt/} + assert return_value.length == 3 end def test_ocr_extraction - Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT) + return_value =Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT) 4.times do |i| file = "corrosion_#{i + 1}.txt" assert_directory_contains(OUTPUT, file) assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size" end + assert return_value.is_a?(Enumerable) + assert return_value.all?(/\.txt/) end + def test_ocr_extraction_in_mock_language exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")} assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"