|
|
|
@ -1,4 +1,6 @@
|
|
|
|
|
use crate::adapters::*;
|
|
|
|
|
use crate::config::RgaConfig;
|
|
|
|
|
use crate::recurse::concat_read_streams;
|
|
|
|
|
use crate::{matching::*, recurse::RecursingConcattyReader};
|
|
|
|
|
use crate::{
|
|
|
|
|
preproc_cache::{LmdbCache, PreprocCache},
|
|
|
|
@ -7,47 +9,32 @@ use crate::{
|
|
|
|
|
use anyhow::*;
|
|
|
|
|
use log::*;
|
|
|
|
|
use path_clean::PathClean;
|
|
|
|
|
use postproc::PostprocPrefix;
|
|
|
|
|
// use postproc::PostprocPrefix;
|
|
|
|
|
use std::convert::TryInto;
|
|
|
|
|
|
|
|
|
|
use std::io::{BufRead, BufReader};
|
|
|
|
|
use std::path::Path;
|
|
|
|
|
use tokio::io::AsyncBufReadExt;
|
|
|
|
|
use tokio::io::BufReader;
|
|
|
|
|
use tokio::io::{AsyncBufRead, AsyncRead};
|
|
|
|
|
|
|
|
|
|
use std::{rc::Rc, time::Instant};
|
|
|
|
|
/**
|
|
|
|
|
* preprocess a file as defined in `ai`.
|
|
|
|
|
*
|
|
|
|
|
* If a cache is passed, read/write to it.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
pub fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
|
|
|
|
|
let AdaptInfo {
|
|
|
|
|
filepath_hint,
|
|
|
|
|
is_real_file,
|
|
|
|
|
inp,
|
|
|
|
|
line_prefix,
|
|
|
|
|
config,
|
|
|
|
|
archive_recursion_depth,
|
|
|
|
|
postprocess,
|
|
|
|
|
} = ai;
|
|
|
|
|
debug!("path (hint) to preprocess: {:?}", filepath_hint);
|
|
|
|
|
let filtered_adapters =
|
|
|
|
|
get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?;
|
|
|
|
|
let adapters = adapter_matcher(&filtered_adapters, config.accurate)?;
|
|
|
|
|
|
|
|
|
|
type ActiveAdapters = Vec<(Rc<dyn FileAdapter>)>;
|
|
|
|
|
|
|
|
|
|
async fn choose_adapter(
|
|
|
|
|
config: &RgaConfig,
|
|
|
|
|
filepath_hint: &Path,
|
|
|
|
|
archive_recursion_depth: i32,
|
|
|
|
|
mut inp: &mut (impl AsyncBufRead + Unpin),
|
|
|
|
|
) -> Result<Option<(Rc<dyn FileAdapter>, FileMatcher, ActiveAdapters)>> {
|
|
|
|
|
let active_adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?;
|
|
|
|
|
let adapters = adapter_matcher(&active_adapters, config.accurate)?;
|
|
|
|
|
let filename = filepath_hint
|
|
|
|
|
.file_name()
|
|
|
|
|
.ok_or_else(|| format_err!("Empty filename"))?;
|
|
|
|
|
debug!("Archive recursion depth: {}", archive_recursion_depth);
|
|
|
|
|
if archive_recursion_depth >= config.max_archive_recursion.0 {
|
|
|
|
|
let s = format!("{}[rga: max archive recursion reached]", line_prefix).into_bytes();
|
|
|
|
|
return Ok(Box::new(std::io::Cursor::new(s)));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// todo: figure out when using a bufreader is a good idea and when it is not
|
|
|
|
|
// seems to be good for File::open() reads, but not sure about within archives (tar, zip)
|
|
|
|
|
let mut inp = BufReader::with_capacity(1 << 16, inp);
|
|
|
|
|
|
|
|
|
|
let mimetype = if config.accurate {
|
|
|
|
|
let buf = inp.fill_buf()?; // fill but do not consume!
|
|
|
|
|
let buf = inp.fill_buf().await?; // fill but do not consume!
|
|
|
|
|
let mimetype = tree_magic::from_u8(buf);
|
|
|
|
|
debug!("mimetype: {:?}", mimetype);
|
|
|
|
|
Some(mimetype)
|
|
|
|
@ -58,52 +45,105 @@ pub fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
|
|
|
|
|
mimetype,
|
|
|
|
|
lossy_filename: filename.to_string_lossy().to_string(),
|
|
|
|
|
});
|
|
|
|
|
let (adapter, detection_reason) = match adapter {
|
|
|
|
|
Some((a, d)) => (a, d),
|
|
|
|
|
Ok(adapter.map(|e| (e.0, e.1, active_adapters)))
|
|
|
|
|
}
|
|
|
|
|
/**
|
|
|
|
|
* preprocess a file as defined in `ai`.
|
|
|
|
|
*
|
|
|
|
|
* If a cache is passed, read/write to it.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
pub async fn rga_preproc(ai: AdaptInfo<'_>) -> Result<ReadBox<'_>> {
|
|
|
|
|
debug!("path (hint) to preprocess: {:?}", ai.filepath_hint);
|
|
|
|
|
/*todo: move if archive_recursion_depth >= config.max_archive_recursion.0 {
|
|
|
|
|
let s = format!("{}[rga: max archive recursion reached]", line_prefix).into_bytes();
|
|
|
|
|
return Ok(Box::new(std::io::Cursor::new(s)));
|
|
|
|
|
}*/
|
|
|
|
|
|
|
|
|
|
// todo: figure out when using a bufreader is a good idea and when it is not
|
|
|
|
|
// seems to be good for File::open() reads, but not sure about within archives (tar, zip)
|
|
|
|
|
let mut inp = BufReader::with_capacity(1 << 16, ai.inp);
|
|
|
|
|
let adapter = choose_adapter(
|
|
|
|
|
&ai.config,
|
|
|
|
|
&ai.filepath_hint,
|
|
|
|
|
ai.archive_recursion_depth,
|
|
|
|
|
&mut inp,
|
|
|
|
|
)
|
|
|
|
|
.await?;
|
|
|
|
|
let (adapter, detection_reason, active_adapters) = match adapter {
|
|
|
|
|
Some((a, d, e)) => (a, d, e),
|
|
|
|
|
None => {
|
|
|
|
|
// allow passthrough if the file is in an archive or accurate matching is enabled
|
|
|
|
|
// otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us
|
|
|
|
|
let allow_cat = !is_real_file || config.accurate;
|
|
|
|
|
let allow_cat = !ai.is_real_file || ai.config.accurate;
|
|
|
|
|
if allow_cat {
|
|
|
|
|
if postprocess {
|
|
|
|
|
(
|
|
|
|
|
if ai.postprocess {
|
|
|
|
|
panic!("not implemented");
|
|
|
|
|
/* (
|
|
|
|
|
Rc::new(PostprocPrefix {}) as Rc<dyn FileAdapter>,
|
|
|
|
|
FileMatcher::Fast(FastFileMatcher::FileExtension("default".to_string())), // todo: separate enum value for this
|
|
|
|
|
)
|
|
|
|
|
)*/
|
|
|
|
|
} else {
|
|
|
|
|
return Ok(Box::new(inp));
|
|
|
|
|
return Ok(Box::pin(inp));
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
return Err(format_err!(
|
|
|
|
|
"No adapter found for file {:?}, passthrough disabled.",
|
|
|
|
|
filename
|
|
|
|
|
ai.filepath_hint
|
|
|
|
|
.file_name()
|
|
|
|
|
.ok_or_else(|| format_err!("Empty filename"))?
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
let path_hint_copy = filepath_hint.clone();
|
|
|
|
|
run_adapter(
|
|
|
|
|
let path_hint_copy = ai.filepath_hint.clone();
|
|
|
|
|
run_adapter_recursively(
|
|
|
|
|
AdaptInfo {
|
|
|
|
|
filepath_hint,
|
|
|
|
|
is_real_file,
|
|
|
|
|
inp: Box::new(inp),
|
|
|
|
|
line_prefix,
|
|
|
|
|
config,
|
|
|
|
|
archive_recursion_depth,
|
|
|
|
|
postprocess,
|
|
|
|
|
inp: Box::pin(inp),
|
|
|
|
|
..ai
|
|
|
|
|
},
|
|
|
|
|
adapter,
|
|
|
|
|
detection_reason,
|
|
|
|
|
&filtered_adapters,
|
|
|
|
|
active_adapters,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn run_adapter<'a>(
|
|
|
|
|
fn compute_cache_key(
|
|
|
|
|
filepath_hint: &Path,
|
|
|
|
|
adapter: &dyn FileAdapter,
|
|
|
|
|
active_adapters: ActiveAdapters,
|
|
|
|
|
) -> Result<Vec<u8>> {
|
|
|
|
|
let clean_path = filepath_hint.to_owned().clean();
|
|
|
|
|
let meta = std::fs::metadata(&filepath_hint)
|
|
|
|
|
.with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
|
|
|
|
|
let modified = meta.modified().expect("weird OS that can't into mtime");
|
|
|
|
|
|
|
|
|
|
if adapter.metadata().recurses {
|
|
|
|
|
let active_adapters_cache_key = active_adapters
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|a| (a.metadata().name.clone(), a.metadata().version))
|
|
|
|
|
.collect::<Vec<_>>();
|
|
|
|
|
let key = (active_adapters_cache_key, clean_path, modified);
|
|
|
|
|
debug!("Cache key (with recursion): {:?}", key);
|
|
|
|
|
bincode::serialize(&key).context("could not serialize path")
|
|
|
|
|
} else {
|
|
|
|
|
let key = (
|
|
|
|
|
adapter.metadata().name.clone(),
|
|
|
|
|
adapter.metadata().version,
|
|
|
|
|
clean_path,
|
|
|
|
|
modified,
|
|
|
|
|
);
|
|
|
|
|
debug!("Cache key (no recursion): {:?}", key);
|
|
|
|
|
bincode::serialize(&key).context("could not serialize path")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
async fn run_adapter_recursively<'a>(
|
|
|
|
|
ai: AdaptInfo<'a>,
|
|
|
|
|
adapter: Rc<dyn FileAdapter>,
|
|
|
|
|
detection_reason: FileMatcher,
|
|
|
|
|
filtered_adapters: &Vec<Rc<dyn FileAdapter>>,
|
|
|
|
|
active_adapters: ActiveAdapters,
|
|
|
|
|
) -> Result<ReadBox<'a>> {
|
|
|
|
|
let AdaptInfo {
|
|
|
|
|
filepath_hint,
|
|
|
|
@ -134,116 +174,56 @@ fn run_adapter<'a>(
|
|
|
|
|
None
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if let Some(mut cache) = cache {
|
|
|
|
|
let cache_key: Vec<u8> = {
|
|
|
|
|
let clean_path = filepath_hint.to_owned().clean();
|
|
|
|
|
let meta = std::fs::metadata(&filepath_hint).with_context(|| {
|
|
|
|
|
format!("reading metadata for {}", filepath_hint.to_string_lossy())
|
|
|
|
|
})?;
|
|
|
|
|
let modified = meta.modified().expect("weird OS that can't into mtime");
|
|
|
|
|
|
|
|
|
|
if adapter.metadata().recurses {
|
|
|
|
|
let key = (
|
|
|
|
|
filtered_adapters
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|a| (a.metadata().name.clone(), a.metadata().version))
|
|
|
|
|
.collect::<Vec<_>>(),
|
|
|
|
|
clean_path,
|
|
|
|
|
modified,
|
|
|
|
|
);
|
|
|
|
|
debug!("Cache key (with recursion): {:?}", key);
|
|
|
|
|
bincode::serialize(&key).expect("could not serialize path")
|
|
|
|
|
} else {
|
|
|
|
|
let key = (
|
|
|
|
|
adapter.metadata().name.clone(),
|
|
|
|
|
adapter.metadata().version,
|
|
|
|
|
clean_path,
|
|
|
|
|
modified,
|
|
|
|
|
);
|
|
|
|
|
debug!("Cache key (no recursion): {:?}", key);
|
|
|
|
|
bincode::serialize(&key).expect("could not serialize path")
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
// let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
|
|
|
|
|
let cached = cache.get(&db_name, &cache_key)?;
|
|
|
|
|
match cached {
|
|
|
|
|
Some(cached) => Ok(Box::new(
|
|
|
|
|
zstd::stream::read::Decoder::new(std::io::Cursor::new(cached))
|
|
|
|
|
.context("could not create zstd decoder")?,
|
|
|
|
|
)),
|
|
|
|
|
None => {
|
|
|
|
|
debug!("cache MISS, running adapter");
|
|
|
|
|
debug!("adapting with caching...");
|
|
|
|
|
let inp = adapter
|
|
|
|
|
.adapt(
|
|
|
|
|
AdaptInfo {
|
|
|
|
|
line_prefix,
|
|
|
|
|
filepath_hint: filepath_hint.clone(),
|
|
|
|
|
is_real_file,
|
|
|
|
|
inp: Box::new(inp),
|
|
|
|
|
archive_recursion_depth,
|
|
|
|
|
config,
|
|
|
|
|
postprocess,
|
|
|
|
|
},
|
|
|
|
|
&detection_reason,
|
|
|
|
|
let mut cache = cache.context("No cache?")?;
|
|
|
|
|
let cache_key: Vec<u8> = compute_cache_key(&filepath_hint, adapter.as_ref(), active_adapters)?;
|
|
|
|
|
// let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
|
|
|
|
|
let cached = cache.get(&db_name, &cache_key)?;
|
|
|
|
|
match cached {
|
|
|
|
|
Some(cached) => Ok(Box::pin(
|
|
|
|
|
async_compression::tokio::bufread::ZstdDecoder::new(std::io::Cursor::new(cached)),
|
|
|
|
|
)),
|
|
|
|
|
None => {
|
|
|
|
|
debug!("cache MISS, running adapter");
|
|
|
|
|
debug!("adapting with caching...");
|
|
|
|
|
let inp = adapter
|
|
|
|
|
.adapt(
|
|
|
|
|
AdaptInfo {
|
|
|
|
|
line_prefix,
|
|
|
|
|
filepath_hint: filepath_hint.clone(),
|
|
|
|
|
is_real_file,
|
|
|
|
|
inp,
|
|
|
|
|
archive_recursion_depth,
|
|
|
|
|
config,
|
|
|
|
|
postprocess,
|
|
|
|
|
},
|
|
|
|
|
&detection_reason,
|
|
|
|
|
)
|
|
|
|
|
.with_context(|| {
|
|
|
|
|
format!(
|
|
|
|
|
"adapting {} via {} failed",
|
|
|
|
|
filepath_hint.to_string_lossy(),
|
|
|
|
|
meta.name
|
|
|
|
|
)
|
|
|
|
|
.with_context(|| {
|
|
|
|
|
format!(
|
|
|
|
|
"adapting {} via {} failed",
|
|
|
|
|
filepath_hint.to_string_lossy(),
|
|
|
|
|
meta.name
|
|
|
|
|
)
|
|
|
|
|
})?;
|
|
|
|
|
let inp = RecursingConcattyReader::concat(inp)?;
|
|
|
|
|
let inp = CachingReader::new(
|
|
|
|
|
inp,
|
|
|
|
|
cache_max_blob_len.0.try_into().unwrap(),
|
|
|
|
|
cache_compression_level.0.try_into().unwrap(),
|
|
|
|
|
Box::new(move |(uncompressed_size, compressed)| {
|
|
|
|
|
debug!(
|
|
|
|
|
"uncompressed output: {}",
|
|
|
|
|
print_bytes(uncompressed_size as f64)
|
|
|
|
|
);
|
|
|
|
|
if let Some(cached) = compressed {
|
|
|
|
|
debug!("compressed output: {}", print_bytes(cached.len() as f64));
|
|
|
|
|
cache.set(&db_name, &cache_key, &cached)?
|
|
|
|
|
}
|
|
|
|
|
Ok(())
|
|
|
|
|
}),
|
|
|
|
|
)?;
|
|
|
|
|
})?;
|
|
|
|
|
let inp = concat_read_streams(inp);
|
|
|
|
|
let inp = CachingReader::new(
|
|
|
|
|
inp,
|
|
|
|
|
cache_max_blob_len.0.try_into().unwrap(),
|
|
|
|
|
cache_compression_level.0.try_into().unwrap(),
|
|
|
|
|
Box::new(move |(uncompressed_size, compressed)| {
|
|
|
|
|
debug!(
|
|
|
|
|
"uncompressed output: {}",
|
|
|
|
|
print_bytes(uncompressed_size as f64)
|
|
|
|
|
);
|
|
|
|
|
if let Some(cached) = compressed {
|
|
|
|
|
debug!("compressed output: {}", print_bytes(cached.len() as f64));
|
|
|
|
|
cache.set(&db_name, &cache_key, &cached)?
|
|
|
|
|
}
|
|
|
|
|
Ok(())
|
|
|
|
|
}),
|
|
|
|
|
)?;
|
|
|
|
|
|
|
|
|
|
Ok(Box::new(inp))
|
|
|
|
|
}
|
|
|
|
|
Ok(Box::pin(inp))
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// no cache arc - probably within archive
|
|
|
|
|
debug!("adapting without caching...");
|
|
|
|
|
let start = Instant::now();
|
|
|
|
|
let oread = adapter
|
|
|
|
|
.adapt(
|
|
|
|
|
AdaptInfo {
|
|
|
|
|
line_prefix,
|
|
|
|
|
filepath_hint: filepath_hint.clone(),
|
|
|
|
|
is_real_file,
|
|
|
|
|
inp,
|
|
|
|
|
archive_recursion_depth,
|
|
|
|
|
config,
|
|
|
|
|
postprocess,
|
|
|
|
|
},
|
|
|
|
|
&detection_reason,
|
|
|
|
|
)
|
|
|
|
|
.with_context(|| {
|
|
|
|
|
format!(
|
|
|
|
|
"adapting {} via {} without caching failed",
|
|
|
|
|
filepath_hint.to_string_lossy(),
|
|
|
|
|
meta.name
|
|
|
|
|
)
|
|
|
|
|
})?;
|
|
|
|
|
debug!(
|
|
|
|
|
"running adapter {} took {}",
|
|
|
|
|
adapter.metadata().name,
|
|
|
|
|
print_dur(start)
|
|
|
|
|
);
|
|
|
|
|
Ok(RecursingConcattyReader::concat(oread)?)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|