diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d61802..a9c12a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # 0.9.1 (2019-06-16) - Add enabled adapters to cache key if caching for archive +- Prevent empty trailing page output in pdf reader # 0.9.0 (2019-06-16) diff --git a/src/adapters.rs b/src/adapters.rs index 30af5ca..70868be 100644 --- a/src/adapters.rs +++ b/src/adapters.rs @@ -26,7 +26,9 @@ pub struct AdapterMeta { /// version identifier. used to key cache entries, change if your output format changes pub version: i32, pub description: String, - /// list of matchers (interpreted as ORed) + /// indicates whether this adapter can descend (=call rga_preproc again). if true, the cache key needs to include the list of active adapters + pub recurses: bool, + /// list of matchers (interpreted as a OR b OR ...) pub fast_matchers: Vec, /// list of matchers when we have mime type detection active (interpreted as ORed) /// warning: this *overrides* the fast matchers @@ -71,7 +73,6 @@ pub struct AdaptInfo<'a> { pub oup: &'a mut (dyn Write + Send), /// prefix every output line with this string to better indicate the file's location if it is in some archive pub line_prefix: &'a str, - // pub adapt_subobject: &'a dyn Fn(AdaptInfo) -> Fallible<()>, pub config: PreprocConfig<'a>, } diff --git a/src/adapters/decompress.rs b/src/adapters/decompress.rs index d92c500..ccddce3 100644 --- a/src/adapters/decompress.rs +++ b/src/adapters/decompress.rs @@ -19,6 +19,7 @@ lazy_static! { description: "Reads compressed file as a stream and runs a different extractor on the contents." .to_owned(), + recurses: true, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) diff --git a/src/adapters/ffmpeg.rs b/src/adapters/ffmpeg.rs index 3649e59..a06f10c 100644 --- a/src/adapters/ffmpeg.rs +++ b/src/adapters/ffmpeg.rs @@ -16,6 +16,7 @@ lazy_static! { name: "ffmpeg".to_owned(), version: 1, description: "Uses ffmpeg to extract video metadata/chapters and subtitles".to_owned(), + recurses: false, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -47,7 +48,7 @@ struct FFprobeStream { codec_type: String, // video,audio,subtitle } impl FileAdapter for FFmpegAdapter { - fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { is_real_file, filepath_hint, diff --git a/src/adapters/pandoc.rs b/src/adapters/pandoc.rs index 26ef4a0..0eaf7ed 100644 --- a/src/adapters/pandoc.rs +++ b/src/adapters/pandoc.rs @@ -48,6 +48,7 @@ lazy_static! { description: "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text" .to_owned(), + recurses: false, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) diff --git a/src/adapters/pdfpages.rs b/src/adapters/pdfpages.rs index 346ea6f..9670221 100644 --- a/src/adapters/pdfpages.rs +++ b/src/adapters/pdfpages.rs @@ -17,6 +17,7 @@ lazy_static! { name: "pdfpages".to_owned(), version: 1, description: "Converts a pdf to it's individual pages as png files. Only useful in combination with tesseract".to_owned(), + recurses: true, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -42,7 +43,7 @@ impl GetMetadata for PdfPagesAdapter { /// A pdf is basically converted to a zip that has Page X.png files. /// This way, something like tesseract can process the pages individually impl FileAdapter for PdfPagesAdapter { - fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, is_real_file, diff --git a/src/adapters/poppler.rs b/src/adapters/poppler.rs index 1dc3424..c73a7ee 100644 --- a/src/adapters/poppler.rs +++ b/src/adapters/poppler.rs @@ -12,6 +12,7 @@ lazy_static! { version: 1, description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files" .to_owned(), + recurses: false, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -50,6 +51,9 @@ impl SpawningFileAdapter for PopplerAdapter { // page break line = line.replace('\x0c', ""); page += 1; + if line.is_empty() { + continue; + } } oup.write_all(format!("{}Page {}: {}\n", line_prefix, page, line).as_bytes())?; } diff --git a/src/adapters/spawning.rs b/src/adapters/spawning.rs index 8c446f3..340d26a 100644 --- a/src/adapters/spawning.rs +++ b/src/adapters/spawning.rs @@ -93,7 +93,7 @@ impl FileAdapter for T where T: SpawningFileAdapter, { - fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, diff --git a/src/adapters/sqlite.rs b/src/adapters/sqlite.rs index ba9d66f..520eb6e 100644 --- a/src/adapters/sqlite.rs +++ b/src/adapters/sqlite.rs @@ -14,6 +14,7 @@ lazy_static! { description: "Uses sqlite bindings to convert sqlite databases into a simple plain text format" .to_owned(), + recurses: false, // set to true if we decide to make sqlite blobs searchable (gz blob in db is kinda common I think) fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -56,7 +57,7 @@ fn format_blob(b: ValueRef) -> String { } impl FileAdapter for SqliteAdapter { - fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { is_real_file, filepath_hint, diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index d490d00..7045832 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -13,6 +13,7 @@ lazy_static! { name: "tar".to_owned(), version: 1, description: "Reads a tar file as a stream and recurses down into its contents".to_owned(), + recurses: true, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -35,7 +36,7 @@ impl GetMetadata for TarAdapter { } impl FileAdapter for TarAdapter { - fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, diff --git a/src/adapters/tesseract.rs b/src/adapters/tesseract.rs index 42ebe10..652634c 100644 --- a/src/adapters/tesseract.rs +++ b/src/adapters/tesseract.rs @@ -10,6 +10,7 @@ lazy_static! { name: "tesseract".to_owned(), version: 1, description: "Uses tesseract to run OCR on images to make them searchable. May need -j1 to prevent overloading the system. Make sure you have tesseract installed.".to_owned(), + recurses: false, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) diff --git a/src/adapters/zip.rs b/src/adapters/zip.rs index e9bcba5..4869662 100644 --- a/src/adapters/zip.rs +++ b/src/adapters/zip.rs @@ -14,6 +14,7 @@ lazy_static! { name: "zip".to_owned(), version: 1, description: "Reads a zip file as a stream and recurses down into its contents".to_owned(), + recurses: true, fast_matchers: EXTENSIONS .iter() .map(|s| FastMatcher::FileExtension(s.to_string())) @@ -45,7 +46,7 @@ fn is_dir(f: &ZipFile) -> bool { } impl FileAdapter for ZipAdapter { - fn adapt(&self, ai: AdaptInfo, detection_reason: &SlowMatcher) -> Fallible<()> { + fn adapt(&self, ai: AdaptInfo, _detection_reason: &SlowMatcher) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, diff --git a/src/preproc.rs b/src/preproc.rs index 70719f6..55cde94 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -72,13 +72,22 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<(), Error> { let clean_path = filepath_hint.to_owned().clean(); let meta = std::fs::metadata(&filepath_hint)?; - let key = ( - clean_path, - meta.modified().expect("weird OS that can't into mtime"), - ); - eprintln!("cache key: {:?}", key); - - bincode::serialize(&key).expect("could not serialize path") // key in the cache database + if adapter.metadata().recurses { + let key = ( + clean_path, + meta.modified().expect("weird OS that can't into mtime"), + &args.adapters[..], + ); + eprintln!("cache key: {:?}", key); + bincode::serialize(&key).expect("could not serialize path") // key in the cache database + } else { + let key = ( + clean_path, + meta.modified().expect("weird OS that can't into mtime"), + ); + eprintln!("cache key: {:?}", key); + bincode::serialize(&key).expect("could not serialize path") // key in the cache database + } }; cache.write().unwrap().get_or_run( &db_name,