Skip to content

Fix issue 62 #63

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/docsplit/command_line.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,4 @@ def parse_options

end

end
end
18 changes: 14 additions & 4 deletions lib/docsplit/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ class ImageExtractor
# Convert one or more PDFs into page images.
#
# pdfs    - a single PDF path or an (optionally nested) array of paths.
# options - passed straight to `extract_options`, which is expected to set
#           `@sizes`, `@formats` and `@rolling` (defined outside this view).
#
# Every (pdf, size, format) combination is handed to `convert` exactly once.
# When `@rolling` is set, each size after the first receives the previous
# size so `convert` can work from the already rendered images.
#
# Returns a flat array of whatever `convert` returned, with nil and empty
# results dropped.
def extract(pdfs, options)
  @pdfs = [pdfs].flatten
  extract_options(options)
  images = []
  @pdfs.each do |pdf|
    previous = nil
    @sizes.each do |size|
      # Convert once per format and keep the result; running a separate
      # `each` pass first would render every image twice.
      images += @formats.map { |format| convert(pdf, size, format, previous) }
      previous = size if @rolling
    end
  end
  images.reject { |paths| paths.nil? || paths.empty? }.flatten
end

# Convert a single PDF into page images at the specified size and format.
Expand All @@ -32,20 +34,28 @@ def convert(pdf, size, format, previous=nil)
basename = File.basename(pdf, File.extname(pdf))
directory = directory_for(size)
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
escaped_pdf = ESCAPE[pdf]
escaped_pdf = ESCAPE[pdf]
FileUtils.mkdir_p(directory) unless File.exists?(directory)
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
image_paths = []
if previous
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
raise ExtractionFailed, result if $? != 0
if $? != 0
raise ExtractionFailed, result
end
else
page_list(pages).each do |page|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
if $? != 0
raise ExtractionFailed, result
else
image_paths << out_file
end
end
return image_paths
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
Expand Down
38 changes: 26 additions & 12 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,23 @@ def initialize
# Extract text from one or more PDFs into the `@output` directory.
#
# pdfs - a single PDF path or an array of paths (nil is treated as none).
# opts - passed to `extract_options`, which is expected to set `@output`,
#        `@pages`, `@force_ocr`, `@forbid_ocr` and `@pages_to_ocr`
#        (defined outside this view).
#
# Each PDF is processed once: either entirely through OCR, or directly with
# a follow-up OCR pass over the pages that `extract_from_pdf` queued in
# `@pages_to_ocr`.
#
# Returns a flat array of the text file paths reported by the helpers,
# without nils or duplicates.
def extract(pdfs, opts)
  extract_options opts
  FileUtils.mkdir_p @output unless File.exist?(@output)
  paths = []
  Array(pdfs).flatten.each do |pdf|
    @pdf_name = File.basename(pdf, File.extname(pdf))
    pages = (@pages == 'all') ? (1..Docsplit.extract_length(pdf)) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      paths << extract_from_ocr(pdf, pages)
    else
      # Direct extraction must run first: it is what fills @pages_to_ocr.
      paths << extract_from_pdf(pdf, pages)
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        paths << extract_from_ocr(pdf, @pages_to_ocr)
      end
    end
  end
  # The OCR pass reports the same per-page file names as the direct pass,
  # so collapse duplicates.
  paths.flatten.compact.uniq
end

# Does a PDF have any text embedded?
Expand All @@ -52,31 +57,37 @@ def contains_text?(pdf)
# Extract a page range worth of text from a PDF, directly.
#
# pdf   - path of the PDF to read.
# pages - an enumerable of page numbers, or nil/false for the whole document.
#
# Returns the value of `extract_full` when no pages are given, otherwise an
# array with one `extract_page` result per page. Each page is extracted
# exactly once.
def extract_from_pdf(pdf, pages)
  return extract_full(pdf) unless pages
  pages.map { |page| extract_page(pdf, page) }
end

# Extract a page range worth of text from a PDF via OCR.
#
# pdf   - path of the PDF to read.
# pages - an enumerable of 1-based page numbers, or nil/false to OCR the
#         whole document in one pass.
#
# Each page (or the whole document) is rendered to a TIFF inside a fresh
# temporary directory with `gm convert`, then passed to `tesseract`, which
# is given an output base name under `@output`. When `@clean_ocr` is set the
# resulting text file is passed through `clean_text` once.
#
# Returns an array of the expected `.txt` paths. Errors from `run`
# propagate; the temporary directory is removed either way.
def extract_from_ocr(pdf, pages)
  tempdir = Dir.mktmpdir
  base_path = File.join(@output, @pdf_name)
  escaped_pdf = ESCAPE[pdf]
  paths = []
  if pages
    pages.each do |page|
      tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
      escaped_tiff = ESCAPE[tiff]
      file = "#{base_path}_#{page}"
      # gm page indices are zero-based, hence `page - 1`.
      run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
      run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
      file_name = file + '.txt'
      paths << file_name
      clean_text(file_name) if @clean_ocr
      FileUtils.remove_entry_secure tiff
    end
  else
    tiff = "#{tempdir}/#{@pdf_name}.tif"
    escaped_tiff = ESCAPE[tiff]
    run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
    # Escape the output base the same way the per-page branch does.
    run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} 2>&1"
    file_name = base_path + '.txt'
    paths << file_name
    clean_text(file_name) if @clean_ocr
  end
  paths
ensure
  # tempdir is nil if Dir.mktmpdir itself raised.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end
Expand Down Expand Up @@ -104,16 +115,19 @@ def run(command)
# Dump the text of an entire PDF into a single file with `pdftotext`.
#
# The destination is "<@pdf_name>.txt" inside `@output`; the command is
# dispatched through `run`.
#
# Returns the destination path.
def extract_full(pdf)
  File.join(@output, "#{@pdf_name}.txt").tap do |destination|
    run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"
  end
end

# Write the text of one page to "<@pdf_name>_<page>.txt" inside `@output`
# using `pdftotext`. Unless OCR is forbidden, a page whose extracted text is
# shorter than MIN_TEXT_PER_PAGE is queued on `@pages_to_ocr`.
#
# Returns the path of the page's text file.
def extract_page(pdf, page)
  destination = File.join(@output, "#{@pdf_name}_#{page}.txt")
  run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[destination]} 2>&1"

  # The file is only read back when OCR is still an option.
  sparse = !@forbid_ocr && File.read(destination).length < MIN_TEXT_PER_PAGE
  @pages_to_ocr.push(page) if sparse
  destination
end

def extract_options(options)
Expand All @@ -127,4 +141,4 @@ def extract_options(options)

end

end
end
13 changes: 13 additions & 0 deletions test/unit/test_extract_images.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@ def test_image_formatting
assert Dir["#{OUTPUT}/*.jpg"].length == 2
end

# extract_images should hand back one path per requested format for the
# selected page: a single .gif first, then a .jpg/.gif pair.
def test_return_value
  fixture = 'test/fixtures/obama_arts.pdf'
  shared = { :size => "50x", :pages => 2, :output => OUTPUT }

  gif_only = Docsplit.extract_images(fixture, { :format => :gif }.merge(shared))
  assert gif_only.length == 1
  assert gif_only.is_a?(Enumerable)
  assert gif_only.all? { |path| path =~ /\.gif/ }

  both = Docsplit.extract_images(fixture, { :format => [:jpg, :gif] }.merge(shared))
  assert both.length == 2
  assert both.is_a?(Enumerable)
  [/\.gif/, /\.jpg/].each do |extension|
    assert both.any? { |path| path =~ extension }
  end
end


def test_page_ranges
Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"]
Expand Down
15 changes: 12 additions & 3 deletions test/unit/test_extract_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
class ExtractTextTest < Test::Unit::TestCase

# Page-by-page extraction writes one text file per page and returns their
# paths. The extraction is run once and its return value checked.
def test_paged_extraction
  return_value = Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
  assert Dir["#{OUTPUT}/*.txt"].length == 2
  assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America")
  assert return_value.is_a?(Enumerable)
  assert return_value.all? { |val| val =~ /\.txt/ }
  assert return_value.length == 2
end

def test_page_only_extraction
Expand All @@ -24,19 +27,25 @@ def test_capitalized_pdf_extraction
end

# Extracting the unicode fixture yields three text files and returns their
# paths. The extraction is run once and its return value checked.
def test_unicode_extraction
  return_value = Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
  assert Dir["#{OUTPUT}/*.txt"].length == 3
  assert return_value.is_a?(Enumerable)
  assert return_value.all? { |val| val =~ /\.txt/ }
  assert return_value.length == 3
end

# Extracting the corrosion fixture produces four non-trivial text files and
# returns only .txt paths. The extraction is run once.
def test_ocr_extraction
  return_value = Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
  4.times do |i|
    file = "corrosion_#{i + 1}.txt"
    assert_directory_contains(OUTPUT, file)
    assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
  end
  assert return_value.is_a?(Enumerable)
  # Block form matches the sibling tests and does not rely on all?(pattern).
  assert return_value.all? { |val| val =~ /\.txt/ }
end


def test_ocr_extraction_in_mock_language
exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"
Expand Down