tesseract single threaded

pull/11/head
phiresky 5 years ago
parent 1e9c2e45d6
commit 1f6e793a7f

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

@ -41,38 +41,6 @@ impl GetMetadata for PdfPagesAdapter {
}
}
/*// todo: do this in an actually streaming fashion and less slow
// IEND chunk (type + CRC) of one PNG followed by the PNG magic of the next
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
let split_seq_inx = 8;
fn split_by_seq<'a>(
split_seq: &'a [u8],
split_inx: usize,
read: &mut Read,
) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> {
let regex = split_seq
.iter()
.map(|c| format!("\\x{:0>2x}", c))
.collect::<Vec<_>>()
.join("");
let restr = format!("(?-u){}", regex);
eprintln!("re: {}", restr);
let re = regex::bytes::Regex::new(&restr)?;
let mut all = Vec::new();
read.read_to_end(&mut all)?;
let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
let mut last = 0;
for (i, split) in re.find_iter(&all).enumerate() {
let pos = split.start() + split_inx;
out.push(Cursor::new(Vec::from(&all[last..pos])));
last = pos;
}
out.push(Cursor::new(Vec::from(&all[last..])));
Ok(out)
}*/
impl FileAdapter for PdfPagesAdapter {
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
let AdaptInfo {
@ -98,17 +66,13 @@ impl FileAdapter for PdfPagesAdapter {
let mut cmd = Command::new(exe_name);
cmd.arg("convert")
.arg("-density")
.arg("300")
.arg("200")
.arg(inp_fname)
.arg("+adjoin")
.arg(out_fname);
let mut cmd = cmd.spawn().map_err(|e| {
map_exe_error(
e,
exe_name,
"Could not find gm. Make sure you have graphicsmagick installed.",
)
map_exe_error(e, exe_name, "Make sure you have graphicsmagick installed.")
})?;
let args = config.args;
// TODO: how to handle this copying better?
@ -133,7 +97,7 @@ impl FileAdapter for PdfPagesAdapter {
is_real_file: false,
inp: &mut ele,
oup,
line_prefix,
line_prefix: &format!("{}Page {}:", line_prefix, i + 1),
archive_recursion_depth: archive_recursion_depth + 1,
config: PreprocConfig { cache: None, args },
})?;
@ -141,3 +105,35 @@ impl FileAdapter for PdfPagesAdapter {
Ok(())
}
}
/*// todo: do this in an actually streaming fashion and less slow
// IEND chunk (type + CRC) of one PNG followed by the PNG magic of the next
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
let split_seq_inx = 8;
fn split_by_seq<'a>(
split_seq: &'a [u8],
split_inx: usize,
read: &mut Read,
) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> {
let regex = split_seq
.iter()
.map(|c| format!("\\x{:0>2x}", c))
.collect::<Vec<_>>()
.join("");
let restr = format!("(?-u){}", regex);
eprintln!("re: {}", restr);
let re = regex::bytes::Regex::new(&restr)?;
let mut all = Vec::new();
read.read_to_end(&mut all)?;
let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
let mut last = 0;
for (i, split) in re.find_iter(&all).enumerate() {
let pos = split.start() + split_inx;
out.push(Cursor::new(Vec::from(&all[last..pos])));
last = pos;
}
out.push(Cursor::new(Vec::from(&all[last..])));
Ok(out)
}*/

@ -36,7 +36,8 @@ impl SpawningFileAdapter for TesseractAdapter {
"tesseract"
}
/// Configures `cmd` for this adapter and hands it back to the caller.
///
/// `_filepath_hint` is accepted but not used here.
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
// NOTE(review): this `arg("-").arg("-")` pair is repeated on the `OMP_THREAD_LIMIT`
// line below; as written the two arguments are appended twice — confirm that only
// one of the two statements is meant to be present.
cmd.arg("-").arg("-");
// rg already does threading
// Sets OMP_THREAD_LIMIT=1 in the child's environment (presumably to keep tesseract's
// OpenMP code single-threaded — TODO confirm), then appends "-" "-" (presumably
// stdin as input and stdout as output — verify against the tesseract CLI docs).
cmd.env("OMP_THREAD_LIMIT", "1").arg("-").arg("-");
cmd
}
}

Loading…
Cancel
Save