Skip to content

Commit

Permalink
Update things: add PSM, scaling, and binarization options
Browse files Browse the repository at this point in the history
  • Loading branch information
bepvte committed Mar 17, 2024
1 parent eca7a3a commit 7949f7d
Show file tree
Hide file tree
Showing 8 changed files with 217 additions and 101 deletions.
158 changes: 77 additions & 81 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ dirs = "5.0.1"
itertools = "0.12.0"
kdam = { version = "0.5.1", features = ["rayon"] }
rayon = "1.8.1"
rusqlite = { version = "0.30.0", features = ["bundled"] }
rusqlite = { version = "0.31.0", features = ["bundled"] }
walkdir = "2.4.0"
glob = "0.3.1"
leptonica-plumbing = { version = "1.3.0", path = "libs/leptonica-plumbing" }
Expand Down
40 changes: 38 additions & 2 deletions libs/leptonica-plumbing/src/pix.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use leptonica_sys::{
l_int32, l_uint32, pixClone, pixDestroy, pixGetData, pixGetDepth, pixGetHeight, pixGetWidth,
pixRead, pixReadMem, pixReadWithHint,
l_float32, l_int32, l_uint32, pixClone, pixDestroy, pixGetData, pixGetDepth, pixGetHeight,
pixGetWidth, pixRead, pixReadMem, pixReadWithHint, pixScaleGeneral, pixTransferAllData,
};

use crate::memory::{LeptonicaClone, LeptonicaDestroy, RefCountedExclusive};
Expand Down Expand Up @@ -32,6 +32,14 @@ impl From<Infallible> for PixReadMemError {
#[error("Pix::read returned null")]
pub struct PixReadError();

/// Errors reported by the in-place `Pix` manipulation helpers
/// (`pix_transfer_data` and `scale_general`).
#[derive(Debug, Error, PartialEq)]
pub enum PixManipError {
/// `pixTransferAllData` returned a non-zero status while moving image data into the `Pix`.
#[error("some internal data moving failed")]
PixInternalError,
/// `pixScaleGeneral` returned a null pointer instead of a scaled image.
#[error("Pix scaling returned null")]
PixScaleError,
}

impl AsRef<*mut leptonica_sys::Pix> for Pix {
fn as_ref(&self) -> &*mut leptonica_sys::Pix {
&self.0
Expand Down Expand Up @@ -110,6 +118,34 @@ impl Pix {
}
}

/// Hands the image behind `*ptr` over to this `Pix` by calling Leptonica's
/// `pixTransferAllData` with `self` as the destination.
///
/// The two trailing arguments are passed as `0`; per the Leptonica prototype these are the
/// `copytext` and `copyformat` flags, so neither text nor input format is carried over.
///
/// # Errors
/// Returns [`PixManipError::PixInternalError`] when Leptonica reports a non-zero status.
fn pix_transfer_data(
    &mut self,
    ptr: &mut *mut leptonica_sys::Pix,
) -> Result<(), PixManipError> {
    // SAFETY (assumed — confirm against callers): `self.0` is the pointer held by this
    // wrapper, and `ptr` is an exclusive reference, so Leptonica may rewrite `*ptr`.
    match unsafe { pixTransferAllData(self.0, ptr, 0, 0) } {
        0 => Ok(()),
        _ => Err(PixManipError::PixInternalError),
    }
}

/// Wrapper for [`pixScaleGeneral`](https://tpgit.github.io/Leptonica/leptprotos_8h.html#a2f8ea34f3d02024f5d42ca05d2961177)
pub fn scale_general(
&mut self,
scalex: l_float32,
scaley: l_float32,
) -> Result<(), PixManipError> {
let sharpwidth = if scalex.max(scaley) < 0.7 { 1 } else { 2 };
let mut ptr = unsafe { pixScaleGeneral(self.0, scalex, scaley, 0.0, sharpwidth) };
if ptr.is_null() {
Err(PixManipError::PixScaleError)
} else {
self.pix_transfer_data(&mut ptr)?;
Ok(())
}
}

/// Wrapper for [`pixGetHeight`](https://tpgit.github.io/Leptonica/pix1_8c.html#ae40704b3acbd343639e9aed696da531f)
pub fn get_height(&self) -> l_int32 {
unsafe { pixGetHeight(self.0) }
Expand Down
18 changes: 9 additions & 9 deletions libs/tesseract-sys/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,13 @@ fn find_tesseract_system_lib() -> Vec<String> {
use cmake::Config;
use std::process::Command;

const TESSERACT_VER: &str = "5.3.4";
let tesseract_ver = "2b07505e0e86026ae7c10767b334c337ccf06576";
let tesseract_url =
format!("https://github.com/tesseract-ocr/tesseract/archive/{tesseract_ver}.tar.gz");

let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
let tess_tgz = out_dir
.join(format!("tesseract-{}.tar.gz", TESSERACT_VER))
.join(format!("tesseract-{}.tar.gz", tesseract_ver))
.into_os_string()
.into_string()
.unwrap();
Expand All @@ -133,12 +135,10 @@ fn find_tesseract_system_lib() -> Vec<String> {
.current_dir(&out_dir)
.args([
"-z",
&format!("tesseract-{TESSERACT_VER}.tar.gz"),
&tess_tgz,
"-RsSfL",
"--tlsv1.2",
&format!(
"https://github.com/tesseract-ocr/tesseract/archive/refs/tags/{TESSERACT_VER}.tar.gz"
),
&tesseract_url,
"-o",
&tess_tgz,
])
Expand All @@ -158,7 +158,7 @@ fn find_tesseract_system_lib() -> Vec<String> {
}

let src_dir = out_dir
.join(format!("tesseract-{TESSERACT_VER}"))
.join(format!("tesseract-{tesseract_ver}"))
.to_owned();

let mut cm = Config::new(&src_dir);
Expand All @@ -171,11 +171,11 @@ fn find_tesseract_system_lib() -> Vec<String> {
"OFF"
},
)
.define("DISABLE_LEGACY_ENGINE", "ON")
.define("DISABLED_LEGACY_ENGINE", "ON")
.define("BUILD_TRAINING_TOOLS", "OFF")
.define("BUILD_SHARED_LIBS", "OFF")
.define("OPENMP_BUILD", "OFF")
.define("GRAPHICS_DISABLE", "ON")
.define("GRAPHICS_DISABLED", "ON")
.define("DISABLE_ARCHIVE", "ON")
.define("DISABLE_CURL", "ON")
// this flag disables tesseract recompressing every image as a png
Expand Down
1 change: 1 addition & 0 deletions src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ impl DB {
"#, match kind {
SearchType::Simple | SearchType::Match => "MATCH",
SearchType::Glob => "GLOB",
#[cfg(feature="regex")]
SearchType::Regex => "REGEXP"
}),
)
Expand Down
15 changes: 14 additions & 1 deletion src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use rayon::prelude::*;
use walkdir::WalkDir;

use crate::db::OcrResult;
use crate::ocr;
use crate::{db::DB, ocr::Ocr};

pub struct IndexOptions {
Expand All @@ -22,6 +23,9 @@ pub struct IndexOptions {
pub chunksize: usize,
pub cleanup: bool,
pub max_dimensions: Option<(usize, usize)>,
pub scale: Option<f32>,
pub binarization: Option<ocr::Binarization>,
pub psm: Option<i64>,
}

pub fn index_dir(db: &mut DB, path: &Path, options: IndexOptions) -> Result<()> {
Expand Down Expand Up @@ -132,7 +136,16 @@ pub fn index_dir(db: &mut DB, path: &Path, options: IndexOptions) -> Result<()>
let results: Vec<OcrResult> = chunk
.par_iter()
.map_init(
|| Ocr::new(&options.lang, options.debug).unwrap(),
|| {
Ocr::new(
&options.lang,
options.debug,
options.scale,
options.binarization,
options.psm,
)
.unwrap()
},
move |ocr, ele| {
if options.debug {
eprintln!("now working on {}", &ele.0);
Expand Down
28 changes: 26 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use glob::Pattern;
use itertools::Itertools;

use crate::db::{SearchType, DB};
use crate::ocr::Ocr;
use crate::ocr::{Binarization, Ocr};

// reading those images eats so much memory
#[cfg(not(target_env = "msvc"))]
Expand All @@ -29,7 +29,13 @@ fn main() -> Result<()> {
let matches = cli().get_matches();

if matches.get_flag("dump-scan") {
let mut o = Ocr::new(matches.get_one::<String>("lang").unwrap(), true)?;
let mut o = Ocr::new(
matches.get_one::<String>("lang").unwrap(),
true,
matches.get_one::<f32>("scale").copied(),
matches.get_one::<Binarization>("binarization").copied(),
matches.get_one::<i64>("psm").copied(),
)?;
let path = PathBuf::from(
matches
.get_one::<String>("QUERIES")
Expand Down Expand Up @@ -97,6 +103,9 @@ fn main() -> Result<()> {
chunksize: *matches.get_one::<usize>("chunk-size").unwrap(),
cleanup: matches.get_flag("cleanup"),
max_dimensions: max_size,
scale: matches.get_one::<f32>("scale").copied(),
binarization: matches.get_one::<Binarization>("binarization").copied(),
psm: matches.get_one::<i64>("psm").copied(),
},
)?;
}
Expand Down Expand Up @@ -168,6 +177,7 @@ Matched directories will not be descended into. Excluded items will be removed
r#"Type of query to search. Default is to search for any instance of a literal value (`simple`)
`simple`: Passes sqlite fts5 the queries combined into one search phrase, i.e. `ocrlocate one two` matches "needleone twoneedle"
`match`: Passes sqlite fts5 the argument as an unescaped match query: https://www.sqlite.org/fts5.html#full_text_query_syntax.
This is most useful if you are looking for a result that matches several terms anywhere, not just one long term
Note that all queries are prefix queries with the tokenizer we use.
Examples: `ocrlocate -s match one AND '"AND"'`
`ocrlocate -s match needle NOT dontfind`
Expand All @@ -187,6 +197,20 @@ Matched directories will not be descended into. Excluded items will be removed
_ => unreachable!()
}
})),
arg!(--binarization <METHOD> "Which leptonica thresholding method to use")
.value_parser(PossibleValuesParser::new(["Otsu", "LeptonicaOtsu", "Sauvola"]).map(|x| -> Binarization {
match x.as_str() {
"Otsu" => Binarization::Otsu,
"LeptonicaOtsu" => Binarization::LeptonicaOtsu,
"Sauvola" => Binarization::Sauvola,
_ => unreachable!()
}
})),
arg!(--psm <PSM> "Page segmentation mode").long_help(r#"Page segmentation mode
Documentation of values here: https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html#page-segmentation-method"#
).value_parser(value_parser!(i64).range(0..=13)).default_value("11"),
// TODO: scale by max size, scale to res, etc
arg!(--scale <FRAC> "Fraction to scale all images down by before applying ocr").value_parser(value_parser!(f32)),
arg!(--pwd <PWD> "Set pwd").hide(true),
arg!(--"scan-limit" <LIMIT> "Set max amount of scanned files")
.hide(true)
Expand Down
56 changes: 51 additions & 5 deletions src/ocr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,24 @@ use leptonica_plumbing::{self, leptonica_sys};
/// OCR front end: holds the configured API handle together with an optional
/// pre-scaling factor applied to images before recognition.
#[derive(Debug)]
pub struct Ocr {
// API handle configured in `new`; `scan` hands it the image and reads the UTF-8 text back.
leptess: TessApi,
// When set, `scan` scales each loaded image by this factor on both axes before OCR.
scale: Option<f32>,
}

/// Thresholding (binarization) method that can be selected for OCR.
///
/// The numeric discriminant matters: `Ocr::new` casts the variant to `u8` and writes it,
/// as a decimal string, into the `thresholding_method` variable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Binarization {
Otsu = 0,
LeptonicaOtsu = 1,
Sauvola = 2,
}

impl Ocr {
pub fn new(lang: &str, debug: bool) -> Result<Self> {
pub fn new(
lang: &str,
debug: bool,
scale: Option<f32>,
binarization: Option<Binarization>,
psm: Option<i64>,
) -> Result<Self> {
if lang.len() != 3 || lang.contains(['.', '/', '\\']) || !lang.is_ascii() {
return Err(anyhow!("Invalid language code: {:?}", lang));
}
Expand All @@ -28,21 +42,53 @@ impl Ocr {
.unwrap();
set_log_level(leptonica_sys::L_SEVERITY_ERROR);
}
if let Some(binarization) = binarization {
leptess
.raw
.set_variable(
&CString::new("thresholding_method").unwrap(),
&CString::new((binarization as u8).to_string()).unwrap(),
)
.unwrap();
}
if let Some(psm) = psm {
leptess.raw.set_page_seg_mode(psm.try_into().unwrap());
}

Ok(Ocr { leptess })
leptess
.raw
.set_variable(
leptess::Variable::TesseditPagesegMode.as_cstr(),
&CString::new("11").unwrap(),
)
.unwrap();

leptess
.raw
.set_variable(
leptess::Variable::TesseditCharBlacklist.as_cstr(),
&CString::new("|®»«®©").unwrap(),
)
.unwrap();

Ok(Ocr { leptess, scale })
}
pub fn scan(&mut self, img: &Path) -> Result<String> {
let filename = CString::new(img.as_str()).expect("null in filename");
let cpix = leptonica_plumbing::Pix::read_with_hint(
let mut cpix = leptonica_plumbing::Pix::read_with_hint(
&filename,
leptonica_sys::L_JPEG_CONTINUE_WITH_BAD_DATA,
)?;

if let Some(scale) = self.scale {
cpix.scale_general(scale, scale)?;
}

self.leptess.set_image(&leptess::leptonica::Pix {
raw: cpix.to_ref_counted(),
});

Ok(self.leptess.get_utf8_text()?)
Ok(self.leptess.get_utf8_text()?.replace("\n\n", "\n"))
}
}

Expand Down Expand Up @@ -82,7 +128,7 @@ mod tests {
#[test]
#[ignore]
fn scan() -> Result<()> {
let mut ocr = Ocr::new("eng", true).unwrap();
let mut ocr = Ocr::new("eng", true, None, None, Some(11)).unwrap();
let image = test_image();
let result = ocr.scan(Path::from_path(&image).unwrap()).unwrap();
assert!(result.contains("needle"));
Expand Down

0 comments on commit 7949f7d

Please sign in to comment.