|
|
|
@ -1,9 +1,10 @@
|
|
|
|
|
use super::*;
|
|
|
|
|
use crate::preproc::rga_preproc;
|
|
|
|
|
|
|
|
|
|
use anyhow::Result;
|
|
|
|
|
use lazy_static::lazy_static;
|
|
|
|
|
use tokio::io::BufReader;
|
|
|
|
|
|
|
|
|
|
use std::path::PathBuf;
|
|
|
|
|
use std::path::{Path, PathBuf};
|
|
|
|
|
|
|
|
|
|
static EXTENSIONS: &[&str] = &["tgz", "tbz", "tbz2", "gz", "bz2", "xz", "zst"];
|
|
|
|
|
static MIME_TYPES: &[&str] = &[
|
|
|
|
@ -49,26 +50,27 @@ impl GetMetadata for DecompressAdapter {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn decompress_any(reason: &FileMatcher, inp: ReadBox) -> Result<ReadBox> {
|
|
|
|
|
use async_compression::tokio::bufread;
|
|
|
|
|
use FastFileMatcher::*;
|
|
|
|
|
use FileMatcher::*;
|
|
|
|
|
let gz = |inp: ReadBox| Box::new(flate2::read::MultiGzDecoder::new(inp));
|
|
|
|
|
let bz2 = |inp: ReadBox| Box::new(bzip2::read::BzDecoder::new(inp));
|
|
|
|
|
let xz = |inp: ReadBox| Box::new(xz2::read::XzDecoder::new_multi_decoder(inp));
|
|
|
|
|
let zst = |inp: ReadBox| zstd::stream::read::Decoder::new(inp); // returns result
|
|
|
|
|
let gz = |inp: ReadBox| Box::pin(bufread::GzipDecoder::new(BufReader::new(inp)));
|
|
|
|
|
let bz2 = |inp: ReadBox| Box::pin(bufread::BzDecoder::new(BufReader::new(inp)));
|
|
|
|
|
let xz = |inp: ReadBox| Box::pin(bufread::XzDecoder::new(BufReader::new(inp)));
|
|
|
|
|
let zst = |inp: ReadBox| Box::pin(bufread::ZstdDecoder::new(BufReader::new(inp)));
|
|
|
|
|
|
|
|
|
|
Ok(match reason {
|
|
|
|
|
Fast(FileExtension(ext)) => match ext.as_ref() {
|
|
|
|
|
"tgz" | "gz" => gz(inp),
|
|
|
|
|
"tbz" | "tbz2" | "bz2" => bz2(inp),
|
|
|
|
|
"xz" => xz(inp),
|
|
|
|
|
"zst" => Box::new(zst(inp)?),
|
|
|
|
|
"zst" => zst(inp),
|
|
|
|
|
ext => Err(format_err!("don't know how to decompress {}", ext))?,
|
|
|
|
|
},
|
|
|
|
|
MimeType(mime) => match mime.as_ref() {
|
|
|
|
|
"application/gzip" => gz(inp),
|
|
|
|
|
"application/x-bzip" => bz2(inp),
|
|
|
|
|
"application/x-xz" => xz(inp),
|
|
|
|
|
"application/zstd" => Box::new(zst(inp)?),
|
|
|
|
|
"application/zstd" => zst(inp),
|
|
|
|
|
mime => Err(format_err!("don't know how to decompress mime {}", mime))?,
|
|
|
|
|
},
|
|
|
|
|
})
|
|
|
|
@ -76,13 +78,13 @@ fn decompress_any(reason: &FileMatcher, inp: ReadBox) -> Result<ReadBox> {
|
|
|
|
|
fn get_inner_filename(filename: &Path) -> PathBuf {
|
|
|
|
|
let extension = filename
|
|
|
|
|
.extension()
|
|
|
|
|
.map(|e| e.to_string_lossy().to_owned())
|
|
|
|
|
.map(|e| e.to_string_lossy())
|
|
|
|
|
.unwrap_or(Cow::Borrowed(""));
|
|
|
|
|
let stem = filename
|
|
|
|
|
.file_stem()
|
|
|
|
|
.expect("no filename given?")
|
|
|
|
|
.to_string_lossy();
|
|
|
|
|
let new_extension = match extension.to_owned().as_ref() {
|
|
|
|
|
let new_extension = match extension.as_ref() {
|
|
|
|
|
"tgz" | "tbz" | "tbz2" => ".tar",
|
|
|
|
|
_other => "",
|
|
|
|
|
};
|
|
|
|
@ -90,33 +92,27 @@ fn get_inner_filename(filename: &Path) -> PathBuf {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl FileAdapter for DecompressAdapter {
|
|
|
|
|
fn adapt(&self, ai: AdaptInfo, detection_reason: &FileMatcher) -> Result<ReadBox> {
|
|
|
|
|
let AdaptInfo {
|
|
|
|
|
filepath_hint,
|
|
|
|
|
inp,
|
|
|
|
|
line_prefix,
|
|
|
|
|
archive_recursion_depth,
|
|
|
|
|
config,
|
|
|
|
|
..
|
|
|
|
|
} = ai;
|
|
|
|
|
|
|
|
|
|
let ai2: AdaptInfo = AdaptInfo {
|
|
|
|
|
filepath_hint: get_inner_filename(&filepath_hint),
|
|
|
|
|
fn adapt(&self, ai: AdaptInfo, detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
|
|
|
|
|
Ok(Box::pin(tokio_stream::once(AdaptInfo {
|
|
|
|
|
filepath_hint: get_inner_filename(&ai.filepath_hint),
|
|
|
|
|
is_real_file: false,
|
|
|
|
|
archive_recursion_depth: archive_recursion_depth + 1,
|
|
|
|
|
inp: decompress_any(detection_reason, inp)?,
|
|
|
|
|
line_prefix,
|
|
|
|
|
config: config.clone(),
|
|
|
|
|
};
|
|
|
|
|
rga_preproc(ai2)
|
|
|
|
|
archive_recursion_depth: ai.archive_recursion_depth + 1,
|
|
|
|
|
inp: decompress_any(detection_reason, ai.inp)?,
|
|
|
|
|
line_prefix: ai.line_prefix,
|
|
|
|
|
config: ai.config.clone(),
|
|
|
|
|
postprocess: ai.postprocess,
|
|
|
|
|
})))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
|
mod tests {
|
|
|
|
|
use super::*;
|
|
|
|
|
use crate::preproc::loop_adapt;
|
|
|
|
|
use crate::test_utils::*;
|
|
|
|
|
use std::fs::File;
|
|
|
|
|
use pretty_assertions::assert_eq;
|
|
|
|
|
use tokio::fs::File;
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_inner_filename() {
|
|
|
|
|
for (a, b) in &[
|
|
|
|
@ -132,38 +128,38 @@ mod tests {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn gz() -> Result<()> {
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn gz() -> Result<()> {
|
|
|
|
|
let adapter = DecompressAdapter;
|
|
|
|
|
|
|
|
|
|
let filepath = test_data_dir().join("hello.gz");
|
|
|
|
|
|
|
|
|
|
let (a, d) = simple_adapt_info(&filepath, Box::new(File::open(&filepath)?));
|
|
|
|
|
let mut r = adapter.adapt(a, &d)?;
|
|
|
|
|
let mut o = Vec::new();
|
|
|
|
|
r.read_to_end(&mut o)?;
|
|
|
|
|
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
|
|
|
|
|
let r = adapter.adapt(a, &d)?;
|
|
|
|
|
let o = adapted_to_vec(r).await?;
|
|
|
|
|
assert_eq!(String::from_utf8(o)?, "hello\n");
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn pdf_gz() -> Result<()> {
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn pdf_gz() -> Result<()> {
|
|
|
|
|
let adapter = DecompressAdapter;
|
|
|
|
|
|
|
|
|
|
let filepath = test_data_dir().join("short.pdf.gz");
|
|
|
|
|
|
|
|
|
|
let (a, d) = simple_adapt_info(&filepath, Box::new(File::open(&filepath)?));
|
|
|
|
|
let mut r = adapter.adapt(a, &d)?;
|
|
|
|
|
let mut o = Vec::new();
|
|
|
|
|
r.read_to_end(&mut o)?;
|
|
|
|
|
let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
|
|
|
|
|
let r = loop_adapt(&adapter, d, a)?;
|
|
|
|
|
let o = adapted_to_vec(r).await?;
|
|
|
|
|
assert_eq!(
|
|
|
|
|
String::from_utf8(o)?,
|
|
|
|
|
"hello world
|
|
|
|
|
this is just a test.
|
|
|
|
|
|
|
|
|
|
1
|
|
|
|
|
|
|
|
|
|
\u{c}"
|
|
|
|
|
"PREFIX:Page 1:hello world
|
|
|
|
|
PREFIX:Page 1:this is just a test.
|
|
|
|
|
PREFIX:Page 1:
|
|
|
|
|
PREFIX:Page 1:1
|
|
|
|
|
PREFIX:Page 1:
|
|
|
|
|
PREFIX:Page 1:
|
|
|
|
|
PREFIX:Page 2:
|
|
|
|
|
"
|
|
|
|
|
);
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|