Skip to content

Commit 9789dd5

Browse files
hderms (Dermot Haughey)
authored and
Dermot Haughey
committed
Add test cases and make modification
Cleaning up code. Flatten the return value. Make it not add the path to the return value if an exception-worthy event occurred; instead, merely raise that exception. Make text_extractor also return paths to processed files. Make function extract_images always return an array of image paths. Refine specs. Fix tests. Add nil check. Refactor tests to better isolate functionality. Remove debugger. Remove logger. Add printf debugging. Sanity checking. Printfs. Remove puts. Remove annoying line. Cleanup. Fix unnecessary usage of a ternary operation to 'wrap' an Array and replace it with Array(), as it is more idiomatic. Revert to original.
1 parent 3d630b3 commit 9789dd5

File tree

5 files changed

+66
-20
lines changed

5 files changed

+66
-20
lines changed

lib/docsplit/command_line.rb

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,4 +116,4 @@ def parse_options
116116

117117
end
118118

119-
end
119+
end

lib/docsplit/image_extractor.rb

+14-4
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@ class ImageExtractor
1313
def extract(pdfs, options)
1414
@pdfs = [pdfs].flatten
1515
extract_options(options)
16+
images = []
1617
@pdfs.each do |pdf|
1718
previous = nil
1819
@sizes.each_with_index do |size, i|
19-
@formats.each {|format| convert(pdf, size, format, previous) }
20+
images += @formats.map {|format| convert(pdf, size, format, previous) }
2021
previous = size if @rolling
2122
end
2223
end
24+
return images.reject{|i| i.nil? or i.empty?}.flatten
2325
end
2426

2527
# Convert a single PDF into page images at the specified size and format.
@@ -32,20 +34,28 @@ def convert(pdf, size, format, previous=nil)
3234
basename = File.basename(pdf, File.extname(pdf))
3335
directory = directory_for(size)
3436
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
35-
escaped_pdf = ESCAPE[pdf]
37+
escaped_pdf = ESCAPE[pdf]
3638
FileUtils.mkdir_p(directory) unless File.exists?(directory)
3739
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
40+
image_paths = []
3841
if previous
3942
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
4043
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
41-
raise ExtractionFailed, result if $? != 0
44+
if $? != 0
45+
raise ExtractionFailed, result
46+
end
4247
else
4348
page_list(pages).each do |page|
4449
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
4550
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
4651
result = `#{cmd}`.chomp
47-
raise ExtractionFailed, result if $? != 0
52+
if $? != 0
53+
raise ExtractionFailed, result
54+
else
55+
image_paths << out_file
56+
end
4857
end
58+
return image_paths
4959
end
5060
ensure
5161
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)

lib/docsplit/text_extractor.rb

+26-12
Original file line numberDiff line numberDiff line change
@@ -29,18 +29,23 @@ def initialize
2929
def extract(pdfs, opts)
3030
extract_options opts
3131
FileUtils.mkdir_p @output unless File.exists?(@output)
32+
pdfs = Array(pdfs)
33+
paths = []
3234
[pdfs].flatten.each do |pdf|
3335
@pdf_name = File.basename(pdf, File.extname(pdf))
3436
pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
35-
if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
36-
extract_from_ocr(pdf, pages)
37-
else
38-
extract_from_pdf(pdf, pages)
39-
if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
40-
extract_from_ocr(pdf, @pages_to_ocr)
41-
end
42-
end
37+
return_value = if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
38+
extract_from_ocr(pdf, pages)
39+
else
40+
if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
41+
extract_from_ocr(pdf, @pages_to_ocr)
42+
else
43+
extract_from_pdf(pdf, pages)
44+
end
45+
end
46+
paths << return_value
4347
end
48+
return paths.flatten.compact
4449
end
4550

4651
# Does a PDF have any text embedded?
@@ -52,31 +57,37 @@ def contains_text?(pdf)
5257
# Extract a page range worth of text from a PDF, directly.
5358
def extract_from_pdf(pdf, pages)
5459
return extract_full(pdf) unless pages
55-
pages.each {|page| extract_page(pdf, page) }
60+
pages.map {|page| extract_page(pdf, page) }
5661
end
5762

5863
# Extract a page range worth of text from a PDF via OCR.
5964
def extract_from_ocr(pdf, pages)
6065
tempdir = Dir.mktmpdir
6166
base_path = File.join(@output, @pdf_name)
6267
escaped_pdf = ESCAPE[pdf]
68+
paths = []
6369
if pages
6470
pages.each do |page|
6571
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
6672
escaped_tiff = ESCAPE[tiff]
6773
file = "#{base_path}_#{page}"
6874
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
6975
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
70-
clean_text(file + '.txt') if @clean_ocr
76+
file_name = file + '.txt'
77+
paths << file_name
78+
clean_text(file_name) if @clean_ocr
7179
FileUtils.remove_entry_secure tiff
7280
end
7381
else
7482
tiff = "#{tempdir}/#{@pdf_name}.tif"
7583
escaped_tiff = ESCAPE[tiff]
7684
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
7785
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
78-
clean_text(base_path + '.txt') if @clean_ocr
86+
file_name = base_path + '.txt'
87+
paths << file_name
88+
clean_text(file_name) if @clean_ocr
7989
end
90+
return paths
8091
ensure
8192
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
8293
end
@@ -104,16 +115,19 @@ def run(command)
104115
def extract_full(pdf)
105116
text_path = File.join(@output, "#{@pdf_name}.txt")
106117
run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
118+
return text_path
107119
end
108120

109121
# Extract the contents of a single page of text, directly, adding it to
110122
# the `@pages_to_ocr` list if the text length is inadequate.
111123
def extract_page(pdf, page)
112124
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
113125
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
126+
114127
unless @forbid_ocr
115128
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
116129
end
130+
return text_path
117131
end
118132

119133
def extract_options(options)
@@ -127,4 +141,4 @@ def extract_options(options)
127141

128142
end
129143

130-
end
144+
end

test/unit/test_extract_images.rb

+13
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,19 @@ def test_image_formatting
1313
assert Dir["#{OUTPUT}/*.jpg"].length == 2
1414
end
1515

16+
def test_return_value
17+
return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
18+
assert return_value.length == 1
19+
assert return_value.is_a?(Enumerable)
20+
assert return_value.all?{|el| el =~ /\.gif/}
21+
return_value = Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => [:jpg, :gif], :size => "50x", :pages => 2, :output => OUTPUT)
22+
assert return_value.length == 2
23+
assert return_value.is_a?(Enumerable)
24+
assert return_value.any?{|el| el =~ /\.gif/}
25+
assert return_value.any?{|el| el =~ /\.jpg/}
26+
end
27+
28+
1629
def test_page_ranges
1730
Docsplit.extract_images('test/fixtures/obama_arts.pdf', :format => :gif, :size => "50x", :pages => 2, :output => OUTPUT)
1831
assert Dir["#{OUTPUT}/*.gif"] == ["#{OUTPUT}/obama_arts_2.gif"]

test/unit/test_extract_text.rb

+12-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
class ExtractTextTest < Test::Unit::TestCase
55

66
def test_paged_extraction
7-
Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
7+
return_value = Docsplit.extract_text('test/fixtures/obama_arts.pdf', :pages => 'all', :output => OUTPUT)
88
assert Dir["#{OUTPUT}/*.txt"].length == 2
99
assert File.read("#{OUTPUT}/obama_arts_1.txt").match("Paid for by Obama for America")
10+
assert return_value.is_a?(Enumerable)
11+
assert return_value.all?{|val| val =~ /\.txt/}
12+
assert return_value.length == 2
1013
end
1114

1215
def test_page_only_extraction
@@ -24,19 +27,25 @@ def test_capitalized_pdf_extraction
2427
end
2528

2629
def test_unicode_extraction
27-
Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
30+
return_value = Docsplit.extract_text('test/fixtures/unicode.pdf', :pages => 'all', :output => OUTPUT)
2831
assert Dir["#{OUTPUT}/*.txt"].length == 3
32+
assert return_value.is_a?(Enumerable)
33+
assert return_value.all?{|val| val =~ /\.txt/}
34+
assert return_value.length == 3
2935
end
3036

3137
def test_ocr_extraction
32-
Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
38+
return_value =Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT)
3339
4.times do |i|
3440
file = "corrosion_#{i + 1}.txt"
3541
assert_directory_contains(OUTPUT, file)
3642
assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
3743
end
44+
assert return_value.is_a?(Enumerable)
45+
assert return_value.all?(/\.txt/)
3846
end
3947

48+
4049
def test_ocr_extraction_in_mock_language
4150
exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
4251
assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"

0 commit comments

Comments
 (0)