add tesseract adapter
parent
d1b55e80b7
commit
1e9c2e45d6
@ -0,0 +1,143 @@
|
||||
use super::*;
|
||||
use crate::adapters::spawning::map_exe_error;
|
||||
use crate::adapters::spawning::pipe_output;
|
||||
use crate::preproc::rga_preproc;
|
||||
use lazy_static::lazy_static;
|
||||
use spawning::SpawningFileAdapter;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::io::Cursor;
|
||||
use std::io::Take;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::process::Stdio;
|
||||
|
||||
static EXTENSIONS: &[&str] = &["pdf"];
|
||||
|
||||
lazy_static! {
|
||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||
name: "pdfpages".to_owned(),
|
||||
version: 1,
|
||||
description: "Converts a pdf to it's individual pages as png files".to_owned(),
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
/// Adapter registered under the name `pdfpages`; it holds no state of its own.
#[derive(Default)]
pub struct PdfPagesAdapter {}

impl PdfPagesAdapter {
    /// Creates a new, stateless adapter instance.
    pub fn new() -> Self {
        Self {}
    }
}
|
||||
|
||||
impl GetMetadata for PdfPagesAdapter {
|
||||
fn metadata(&self) -> &AdapterMeta {
|
||||
&METADATA
|
||||
}
|
||||
}
|
||||
|
||||
/*// todo: do this in an actually streaming fashion and less slow
|
||||
// IEND chunk (type + CRC) followed by the PNG magic of the next image
|
||||
// 4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a
|
||||
let split_seq = hex_literal::hex!("4945 4e44 ae42 6082 8950 4e47 0d0a 1a0a");
|
||||
let split_seq_inx = 8;
|
||||
fn split_by_seq<'a>(
|
||||
split_seq: &'a [u8],
|
||||
split_inx: usize,
|
||||
read: &mut Read,
|
||||
) -> Fallible<impl IntoIterator<Item = impl Read> + 'a> {
|
||||
let regex = split_seq
|
||||
.iter()
|
||||
.map(|c| format!("\\x{:0>2x}", c))
|
||||
.collect::<Vec<_>>()
|
||||
.join("");
|
||||
let restr = format!("(?-u){}", regex);
|
||||
eprintln!("re: {}", restr);
|
||||
let re = regex::bytes::Regex::new(&restr)?;
|
||||
|
||||
let mut all = Vec::new();
|
||||
read.read_to_end(&mut all)?;
|
||||
let mut out: Vec<Cursor<Vec<u8>>> = Vec::new();
|
||||
let mut last = 0;
|
||||
for (i, split) in re.find_iter(&all).enumerate() {
|
||||
let pos = split.start() + split_inx;
|
||||
out.push(Cursor::new(Vec::from(&all[last..pos])));
|
||||
last = pos;
|
||||
}
|
||||
out.push(Cursor::new(Vec::from(&all[last..])));
|
||||
Ok(out)
|
||||
}*/
|
||||
|
||||
impl FileAdapter for PdfPagesAdapter {
|
||||
fn adapt(&self, ai: AdaptInfo) -> Fallible<()> {
|
||||
let AdaptInfo {
|
||||
filepath_hint,
|
||||
is_real_file,
|
||||
mut inp,
|
||||
oup,
|
||||
line_prefix,
|
||||
archive_recursion_depth,
|
||||
config,
|
||||
..
|
||||
} = ai;
|
||||
if !is_real_file {
|
||||
// todo: read to memory and then use that blob if size < max
|
||||
writeln!(oup, "{}[rga: skipping pdfpages in archive]", line_prefix,)?;
|
||||
return Ok(());
|
||||
}
|
||||
let inp_fname = filepath_hint;
|
||||
let exe_name = "gm";
|
||||
let out_dir = tempfile::Builder::new().prefix("pdfpages-").tempdir()?;
|
||||
let out_fname = out_dir.path().join("out%04d.png");
|
||||
eprintln!("writing to temp dir: {}", out_fname.display());
|
||||
let mut cmd = Command::new(exe_name);
|
||||
cmd.arg("convert")
|
||||
.arg("-density")
|
||||
.arg("300")
|
||||
.arg(inp_fname)
|
||||
.arg("+adjoin")
|
||||
.arg(out_fname);
|
||||
|
||||
let mut cmd = cmd.spawn().map_err(|e| {
|
||||
map_exe_error(
|
||||
e,
|
||||
exe_name,
|
||||
"Could not find gm. Make sure you have graphicsmagick installed.",
|
||||
)
|
||||
})?;
|
||||
let args = config.args;
|
||||
// TODO: how to handle this copying better?
|
||||
|
||||
let status = cmd.wait()?;
|
||||
if status.success() {
|
||||
} else {
|
||||
return Err(format_err!("subprocess failed: {:?}", status));
|
||||
}
|
||||
for (i, filename) in glob::glob(
|
||||
out_dir
|
||||
.path()
|
||||
.join("out*.png")
|
||||
.to_str()
|
||||
.expect("temp path has invalid encoding"),
|
||||
)?
|
||||
.enumerate()
|
||||
{
|
||||
let mut ele = BufReader::new(File::open(filename?)?);
|
||||
rga_preproc(AdaptInfo {
|
||||
filepath_hint: &PathBuf::from(format!("Page {}.png", i + 1)),
|
||||
is_real_file: false,
|
||||
inp: &mut ele,
|
||||
oup,
|
||||
line_prefix,
|
||||
archive_recursion_depth: archive_recursion_depth + 1,
|
||||
config: PreprocConfig { cache: None, args },
|
||||
})?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
use super::*;
|
||||
use lazy_static::lazy_static;
|
||||
use spawning::SpawningFileAdapter;
|
||||
use std::process::Command;
|
||||
|
||||
static EXTENSIONS: &[&str] = &["jpg", "png"];
|
||||
|
||||
lazy_static! {
|
||||
static ref METADATA: AdapterMeta = AdapterMeta {
|
||||
name: "tesseract".to_owned(),
|
||||
version: 1,
|
||||
description: "Uses tesseract to run OCR on images to make them searchable. May need -j1 to prevent overloading the system. Make sure you have tesseract installed.".to_owned(),
|
||||
fast_matchers: EXTENSIONS
|
||||
.iter()
|
||||
.map(|s| FastMatcher::FileExtension(s.to_string()))
|
||||
.collect(),
|
||||
slow_matchers: None
|
||||
};
|
||||
}
|
||||
/// Adapter registered under the name `tesseract`; it holds no state of its own.
#[derive(Default)]
pub struct TesseractAdapter {}

impl TesseractAdapter {
    /// Creates a new, stateless adapter instance.
    pub fn new() -> Self {
        Self {}
    }
}
|
||||
|
||||
impl GetMetadata for TesseractAdapter {
|
||||
fn metadata(&self) -> &AdapterMeta {
|
||||
&METADATA
|
||||
}
|
||||
}
|
||||
impl SpawningFileAdapter for TesseractAdapter {
|
||||
fn get_exe(&self) -> &str {
|
||||
"tesseract"
|
||||
}
|
||||
fn command(&self, _filepath_hint: &Path, mut cmd: Command) -> Command {
|
||||
cmd.arg("-").arg("-");
|
||||
cmd
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue