diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 93973f6..e78b832 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -76,7 +76,7 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" + run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1" clean_text(base_path + '.txt') if @clean_ocr end ensure diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index fa46180..eaa9f56 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -49,11 +49,16 @@ def test_password_protected end end - def test_name_escaping_while_extracting_text + def test_name_escaping_while_extracting_text_into_pages Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 2 end - + + def test_name_escaping_while_extracting_text_using_ocr + Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :ocr => true, :output => OUTPUT) + assert Dir["#{OUTPUT}/*.txt"].length == 1 + end + def test_orientation_detected_ocr_extraction if Docsplit::DEPENDENCIES[:osd] pages = 1..4