@@ -29,18 +29,23 @@ def initialize
29
29
# Extract text from one or more PDFs and report the text files produced.
#
# pdfs - a single PDF path, or an (optionally nested) array of paths.
# opts - options passed straight to `extract_options`.
#
# Creates the `@output` directory when it does not exist yet. When `@pages`
# is 'all' the page list becomes 1..Docsplit.extract_length(pdf); otherwise
# `@pages` is used as given.
#
# Returns a flat array of the paths reported by the extraction helpers,
# without nils or duplicates.
def extract(pdfs, opts)
  extract_options opts
  FileUtils.mkdir_p @output unless File.exist?(@output)
  paths = []
  Array(pdfs).flatten.each do |pdf|
    @pdf_name = File.basename(pdf, File.extname(pdf))
    pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      paths << extract_from_ocr(pdf, pages)
    else
      # Direct extraction has to run first: it is what fills @pages_to_ocr
      # (through extract_page), so checking that list beforehand would never
      # see the pages of this PDF that need the OCR fallback.
      # NOTE(review): @pages_to_ocr is not cleared here between PDFs --
      # confirm it is reset elsewhere if several PDFs are passed in one call.
      paths << extract_from_pdf(pdf, pages)
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        paths << extract_from_ocr(pdf, @pages_to_ocr)
      end
    end
  end
  # The OCR fallback rewrites the same "<name>_<page>.txt" files that direct
  # extraction already reported, hence the uniq.
  paths.flatten.compact.uniq
end
45
50
46
51
# Does a PDF have any text embedded?
@@ -52,31 +57,37 @@ def contains_text?(pdf)
52
57
# Extract a page range worth of text from a PDF, directly.
#
# pdf   - path of the PDF to read.
# pages - enumerable of page numbers, or nil/false to extract the whole
#         document in one go.
#
# Returns whatever `extract_full` returns when no pages are given, otherwise
# an array holding the result of `extract_page` for every page.
def extract_from_pdf(pdf, pages)
  return extract_full(pdf) unless pages

  pages.map { |page| extract_page(pdf, page) }
end
57
62
58
63
# Extract a page range worth of text from a PDF via OCR.
#
# pdf   - path of the PDF to read.
# pages - enumerable of page numbers, or nil/false to OCR the whole document
#         as a single image.
#
# Each page (or the whole document) is converted to a TIFF inside a fresh
# temporary directory with `gm convert`, then handed to `tesseract`, which is
# given an output base name under `@output`. `clean_text` is applied to each
# result when `@clean_ocr` is set. The temporary directory is removed even if
# a step raises.
#
# Returns an array of the expected ".txt" paths, one per page, or a single
# path for the whole document.
def extract_from_ocr(pdf, pages)
  tempdir = Dir.mktmpdir
  base_path = File.join(@output, @pdf_name)
  escaped_pdf = ESCAPE[pdf]
  paths = []
  if pages
    pages.each do |page|
      tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
      escaped_tiff = ESCAPE[tiff]
      file = "#{base_path}_#{page}"
      # Page numbers are 1-based here; the frame selector given to gm is page - 1.
      run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
      run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
      file_name = "#{file}.txt"
      paths << file_name
      clean_text(file_name) if @clean_ocr
      # Drop each page image right away instead of waiting for the final cleanup.
      FileUtils.remove_entry_secure tiff
    end
  else
    tiff = "#{tempdir}/#{@pdf_name}.tif"
    escaped_tiff = ESCAPE[tiff]
    run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
    # The output base is escaped the same way as in the per-page branch, so
    # names with spaces or shell metacharacters do not break the command.
    run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} 2>&1"
    file_name = "#{base_path}.txt"
    paths << file_name
    clean_text(file_name) if @clean_ocr
  end
  paths
ensure
  # tempdir is nil when Dir.mktmpdir itself failed; do not mask that error.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end
@@ -104,16 +115,19 @@ def run(command)
104
115
# Extract the text of an entire PDF in one `pdftotext` run.
#
# pdf - path of the PDF to read.
#
# The text is written to "<@pdf_name>.txt" inside `@output`.
#
# Returns the path of that text file.
def extract_full(pdf)
  text_path = File.join(@output, "#{@pdf_name}.txt")
  run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
  text_path
end
108
120
109
121
# Extract the contents of a single page of text, directly, adding it to
# the `@pages_to_ocr` list if the text length is inadequate.
#
# pdf  - path of the PDF to read.
# page - page number, passed to pdftotext as both first (-f) and last (-l) page.
#
# The text is written to "<@pdf_name>_<page>.txt" inside `@output`. Unless
# `@forbid_ocr` is set, the page is queued in `@pages_to_ocr` when the
# extracted text is shorter than MIN_TEXT_PER_PAGE.
#
# Returns the path of the text file.
def extract_page(pdf, page)
  text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
  run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"

  # The file is only read back when OCR is allowed at all.
  if !@forbid_ocr && File.read(text_path).length < MIN_TEXT_PER_PAGE
    @pages_to_ocr.push(page)
  end
  text_path
end
118
132
119
133
def extract_options ( options )
@@ -127,4 +141,4 @@ def extract_options(options)
127
141
128
142
end
129
143
130
- end
144
+ end
0 commit comments