diff --git a/Cargo.lock b/Cargo.lock index b135fbe..b60f5c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -131,6 +131,16 @@ name = "cfg-if" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "chrono" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "clap" version = "2.33.0" @@ -483,6 +493,15 @@ name = "nom" version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "num-integer" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "num-traits" version = "0.2.8" @@ -752,11 +771,12 @@ dependencies = [ [[package]] name = "rga" -version = "0.1.0" +version = "0.2.0" dependencies = [ "bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", "cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1127,6 +1147,7 @@ dependencies = [ "checksum cachedir 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c06509d1f4ffa658939bd23f076cd929ef218241363796551528e7eec69128c8" "checksum cc 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "39f75544d7bbaf57560d2168f28fd649ff9c76153874db88bdbdfd839b1a7e7d" "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" +"checksum chrono 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "45912881121cb26fad7c38c17ba7daa18764771836b34fab7d3fbd93ed633878" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" @@ -1169,6 +1190,7 @@ dependencies = [ "checksum miniz_oxide_c_api 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b7fe927a42e3807ef71defb191dc87d4e24479b221e67015fe38ae2b7b447bab" "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" "checksum nom 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf51a729ecf40266a2368ad335a5fdde43471f545a967109cd62146ecf8b66ff" +"checksum num-integer 0.1.41 (registry+https://github.com/rust-lang/crates.io-index)" = "b85e541ef8255f6cf42bbfe4ef361305c6c135d10919ecc26126c4e5ae94bc09" "checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32" "checksum num_cpus 1.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1a23f0ed30a54abaa0c7e83b1d2d87ada7c3c23078d1d87815af3e3b6385fbba" "checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" diff --git a/Cargo.toml b/Cargo.toml index 832df00..a9542c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ cargo-features = ["default-run"] name = "rga" description = "ripgrep, except for pdf, ebooks, Office documents, etc" license = "AGPL-3.0-or-later" -version = "0.1.0" +version = "0.2.0" repository = "https://github.com/phiresky/rga" authors = ["phiresky "] edition = "2018" @@ -37,3 +37,4 @@ xz2 = "0.1.6" flate2 = "1.0.7" bzip2 = "0.3.3" tar = "0.4.26" +chrono = "0.4.6" diff --git a/README.md b/README.md index 9ce4aed..6aebb35 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,10 @@ similar: - pdfgrep - https://gist.github.com/ColonolBuendia/314826e37ec35c616d70506c38dc65aa +# todo + +- jpg adapter (based on object classification / detection (yolo?)) for fun + # considerations - matching on mime (magic bytes) instead of filename diff --git a/exampledir/test.tar b/exampledir/test.tar new file mode 100644 index 0000000..017368c Binary files /dev/null and b/exampledir/test.tar differ diff --git a/exampledir/test.zip b/exampledir/test.zip index ad83434..af8508c 100644 Binary files a/exampledir/test.zip and b/exampledir/test.zip differ diff --git a/src/adapters/tar.rs b/src/adapters/tar.rs index 2ae4ce6..dc8d429 100644 --- a/src/adapters/tar.rs +++ b/src/adapters/tar.rs @@ -4,6 +4,7 @@ use ::tar::EntryType::Regular; use failure::*; use lazy_static::lazy_static; use std::fs::File; +use std::io::BufReader; use std::path::PathBuf; static EXTENSIONS: &[&str] = &["tar", "tar.gz", "tar.bz2", "tar.xz", "tar.zst"]; @@ -31,30 +32,59 @@ impl GetMetadata for TarAdapter { &METADATA } } -/*struct WrapRead<'a> { - inner: &mut 'a Read; + +// make a &mut Read into a owned Read because the streaming decompressors want to take ownership of their base Reads +struct WrapRead<'a> { + inner: &'a mut dyn Read, +} +impl<'a> Read for WrapRead<'a> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.inner.read(buf) + } } -impl Read for WrapRead { - r -}*/ -/*fn decompress_any(filename: &Path, inp: &mut Read) -> Fallible> { +// feeling a little stupid here. why is this needed at all +enum SpecRead { + Gz(flate2::read::MultiGzDecoder), + Bz2(bzip2::read::BzDecoder), + Xz(xz2::read::XzDecoder), + Zst(zstd::stream::read::Decoder>), + Passthrough(R), +} +impl Read for SpecRead { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + use SpecRead::*; + match self { + Gz(z) => z.read(buf), + Bz2(z) => z.read(buf), + Xz(z) => z.read(buf), + Zst(z) => z.read(buf), + Passthrough(z) => z.read(buf), + } + } +} +// why do I need to wrap the output here in a specific type? is it possible with just a Box for every type? +fn decompress_any<'a, R>(filename: &Path, inp: &'a mut R) -> Fallible>> +where + R: Read, +{ + let inp = WrapRead { inner: inp }; let extension = filename.extension().map(|e| e.to_string_lossy().to_owned()); match extension { Some(e) => Ok(match e.to_owned().as_ref() { - "gz" => Box::new(flate2::read::MultiGzDecoder::new(inp)), - "bz2" => Box::new(bzip2::read::BzDecoder::new(inp)), - "xz" => Box::new(xz2::read::XzDecoder::new_multi_decoder(inp)), - "zst" => Box::new(zstd::stream::read::Decoder::new(inp)?), - e => Err(format_err!("don't know how to decompress {}", e))?, + "gz" => SpecRead::Gz(flate2::read::MultiGzDecoder::new(inp)), + "bz2" => SpecRead::Bz2(bzip2::read::BzDecoder::new(inp)), + "xz" => SpecRead::Xz(xz2::read::XzDecoder::new_multi_decoder(inp)), + "zst" => SpecRead::Zst(zstd::stream::read::Decoder::new(inp)?), + "tar" => SpecRead::Passthrough(inp), + ext => Err(format_err!("don't know how to decompress {}", ext))?, }), None => Err(format_err!("no extension")), } -}*/ +} impl FileAdapter for TarAdapter { - fn adapt<'a>(&self, ai: AdaptInfo) -> Fallible<()> { - use std::io::prelude::*; + fn adapt(&self, ai: AdaptInfo) -> Fallible<()> { let AdaptInfo { filepath_hint, mut inp, @@ -62,7 +92,8 @@ impl FileAdapter for TarAdapter { line_prefix, .. } = ai; - let decompress = inp; //decompress_any(filepath_hint, &inp)?; + + let decompress = decompress_any(filepath_hint, &mut inp)?; let mut archive = ::tar::Archive::new(decompress); for entry in archive.entries()? { let mut file = entry.unwrap(); @@ -75,15 +106,13 @@ impl FileAdapter for TarAdapter { ); if Regular == file.header().entry_type() { let line_prefix = &format!("{}{}: ", line_prefix, path.display()); - rga_preproc( - AdaptInfo { - filepath_hint: &path, - inp: &mut file, - oup: oup, - line_prefix, - }, - None, - )?; + let ai2: AdaptInfo = AdaptInfo { + filepath_hint: &path, + inp: &mut file, + oup: oup, + line_prefix, + }; + rga_preproc(ai2, None)?; } } Ok(()) diff --git a/src/bin/rga-preproc.rs b/src/bin/rga-preproc.rs index 080f135..c943849 100644 --- a/src/bin/rga-preproc.rs +++ b/src/bin/rga-preproc.rs @@ -24,9 +24,8 @@ fn main() -> Result<(), Error> { }; let cache_db = match env::var("RGA_NO_CACHE") { - Ok(ref s) if s.len() > 0 => Some(open_cache_db()?), - Ok(_) => None, - Err(_) => None, + Ok(ref s) if s.len() > 0 => None, + Ok(_) | Err(_) => Some(open_cache_db()?), }; rga_preproc(ai, cache_db) diff --git a/src/preproc.rs b/src/preproc.rs index 35f5c5d..dd89a1f 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -2,7 +2,9 @@ use crate::adapters::*; use crate::CachingWriter; use failure::{format_err, Error}; use path_clean::PathClean; +use std::fs::File; use std::io::Read; +use std::io::Write; use std::path::Path; use std::path::PathBuf; use std::rc::Rc; @@ -21,16 +23,21 @@ pub fn open_cache_db() -> Result>, Er let mut builder = rkv::Rkv::environment_builder(); builder .set_flags(rkv::EnvironmentFlags::NO_SYNC | rkv::EnvironmentFlags::WRITE_MAP) // not durable + // i'm not sure why this is needed. otherwise LMDB transactions (open readers) will keep piling up until it fails with + // LmdbError(ReadersFull) + // hope it doesn't break integrity + .set_flags(rkv::EnvironmentFlags::NO_TLS) .set_map_size(2 * 1024 * 1024 * 1024) - .set_max_dbs(100); + .set_max_dbs(100) + .set_max_readers(128); rkv::Rkv::from_env(p, builder) }) .expect("could not get/create db"); Ok(db_arc) } -pub fn rga_preproc( - ai: AdaptInfo, +pub fn rga_preproc<'a>( + ai: AdaptInfo<'a>, mb_db_arc: Option>>, ) -> Result<(), Error> { let adapters = adapter_matcher()?; @@ -79,6 +86,7 @@ pub fn rga_preproc( let db = db_env .open_single(db_name.as_str(), rkv::store::Options::create()) .map_err(|p| format_err!("could not open db store: {:?}", p))?; + let reader = db_env.read().expect("could not get reader"); let cached = db .get(&reader, &cache_key) @@ -91,6 +99,7 @@ pub fn rga_preproc( } Some(_) => Err(format_err!("Integrity: value not blob")), None => { + drop(reader); let mut compbuf = CachingWriter::new(oup, MAX_DB_BLOB_LEN, ZSTD_LEVEL)?; // start dupe eprintln!("adapting...");