ripgrep-all/src/adapters/postproc.rs

345 lines
12 KiB
Rust
Raw Normal View History

2020-06-11 12:38:50 +00:00
//trait RunFnAdapter: GetMetadata {}
//impl<T> FileAdapter for T where T: RunFnAdapter {}
2021-08-26 12:54:42 +00:00
use anyhow::Context;
2020-06-11 12:38:50 +00:00
use anyhow::Result;
2022-12-05 03:30:56 +00:00
use async_stream::stream;
2022-11-12 23:31:25 +00:00
use bytes::Bytes;
2021-08-26 12:54:42 +00:00
use encoding_rs_io::DecodeReaderBytesBuilder;
2022-12-05 03:30:56 +00:00
use std::cmp::min;
2022-12-25 17:27:50 +00:00
use std::ffi::OsStr;
2022-12-05 03:30:56 +00:00
use std::io::Cursor;
2022-12-25 17:27:50 +00:00
use std::path::PathBuf;
2022-12-05 03:30:56 +00:00
use std::pin::Pin;
2022-11-12 23:31:25 +00:00
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_util::io::ReaderStream;
use tokio_util::io::StreamReader;
2020-09-30 14:22:54 +00:00
2022-12-05 03:30:56 +00:00
use crate::adapted_iter::AdaptedFilesIterBox;
2022-12-25 16:37:31 +00:00
use crate::matching::FastFileMatcher;
2020-09-30 14:49:51 +00:00
use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata};
2020-06-11 12:38:50 +00:00
2022-11-12 23:31:25 +00:00
fn add_newline(ar: impl AsyncRead + Send) -> impl AsyncRead + Send {
ar.chain(Cursor::new(&[b'\n']))
2020-06-11 12:38:50 +00:00
}
2020-09-30 14:22:54 +00:00
pub struct PostprocPrefix {}
impl GetMetadata for PostprocPrefix {
fn metadata(&self) -> &super::AdapterMeta {
lazy_static::lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "postprocprefix".to_owned(),
version: 1,
2021-08-26 12:54:42 +00:00
description: "Adds the line prefix to each line (e.g. the filename within a zip)".to_owned(),
2022-12-25 16:37:31 +00:00
recurses: false,
2020-09-30 14:22:54 +00:00
fast_matchers: vec![],
slow_matchers: None,
keep_fast_matchers_if_accurate: false,
disabled_by_default: false
};
}
&METADATA
}
}
impl FileAdapter for PostprocPrefix {
fn adapt<'a>(
&self,
2022-11-12 23:31:25 +00:00
a: super::AdaptInfo,
2020-09-30 14:22:54 +00:00
_detection_reason: &crate::matching::FileMatcher,
2022-11-12 23:31:25 +00:00
) -> Result<AdaptedFilesIterBox> {
let read = add_newline(postproc_prefix(
2021-08-26 12:54:42 +00:00
&a.line_prefix,
postproc_encoding(&a.line_prefix, a.inp)?,
));
2020-09-30 14:22:54 +00:00
// keep adapt info (filename etc) except replace inp
let ai = AdaptInfo {
2022-11-12 23:31:25 +00:00
inp: Box::pin(read),
2020-09-30 14:22:54 +00:00
postprocess: false,
..a
};
2022-11-12 23:31:25 +00:00
Ok(Box::pin(tokio_stream::once(ai)))
2020-09-30 14:22:54 +00:00
}
}
2021-08-26 12:54:42 +00:00
/*struct ReadErr {
err: Fn() -> std::io::Error,
}
impl Read for ReadErr {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
Err(self.err())
}
}*/
2022-11-28 09:46:58 +00:00
/**
* Detects and converts encodings other than utf-8 to utf-8.
* If the input stream does not contain valid text, returns the string `[rga: binary data]` instead
*/
2022-11-12 23:31:25 +00:00
pub fn postproc_encoding(
2022-04-18 20:44:01 +00:00
line_prefix: &str,
2022-11-12 23:31:25 +00:00
inp: impl AsyncRead + Send + 'static,
) -> Result<Pin<Box<dyn AsyncRead + Send>>> {
Ok(Box::pin(inp))
// panic!("todo: implement");
/*// TODO: parse these options from ripgrep's configuration
2021-08-26 12:54:42 +00:00
let encoding = None; // detect bom but usually assume utf8
let bom_sniffing = true;
let mut decode_builder = DecodeReaderBytesBuilder::new();
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
let inp = decode_builder
.encoding(encoding)
.utf8_passthru(true)
.strip_bom(bom_sniffing)
.bom_override(true)
.bom_sniffing(bom_sniffing)
.build(inp);
// check for binary content in first 8kB
// read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
let mut fourk = Vec::with_capacity(1 << 13);
let mut beginning = inp.take(1 << 13);
beginning.read_to_end(&mut fourk)?;
if fourk.contains(&0u8) {
log::debug!("detected binary");
let v = "[rga: binary data]";
2022-04-18 20:44:01 +00:00
return Ok(Box::new(std::io::Cursor::new(v)));
2021-08-26 12:54:42 +00:00
/*let err = std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("{}[rga: binary data]", line_prefix),
);
return Err(err).context("");
return ReadErr {
err,
};*/
}
Ok(Box::new(
std::io::Cursor::new(fourk).chain(beginning.into_inner()),
2022-11-12 23:31:25 +00:00
))*/
2021-08-26 12:54:42 +00:00
}
2022-12-22 21:15:33 +00:00
/// Adds the given prefix to each line in an `AsyncRead`.
2022-11-12 23:31:25 +00:00
pub fn postproc_prefix(line_prefix: &str, inp: impl AsyncRead + Send) -> impl AsyncRead + Send {
let line_prefix_n = format!("\n{}", line_prefix); // clone since we need it later
let line_prefix_o = Bytes::copy_from_slice(line_prefix.as_bytes());
let regex = regex::bytes::Regex::new("\n").unwrap();
2022-12-22 21:15:33 +00:00
let inp_stream = ReaderStream::new(inp);
2022-11-12 23:31:25 +00:00
let oup_stream = stream! {
yield Ok(line_prefix_o);
for await chunk in inp_stream {
match chunk {
Err(e) => yield Err(e),
Ok(chunk) => {
if chunk.contains(&b'\n') {
yield Ok(Bytes::copy_from_slice(&regex.replace_all(&chunk, line_prefix_n.as_bytes())));
} else {
yield Ok(chunk);
}
}
}
}
};
Box::pin(StreamReader::new(oup_stream))
2020-09-30 14:22:54 +00:00
}
2022-12-25 16:37:31 +00:00
pub struct PostprocPageBreaks {}
2022-12-25 17:27:50 +00:00
impl PostprocPageBreaks {
pub fn new() -> Self {
Self {}
}
}
2022-12-25 16:37:31 +00:00
impl GetMetadata for PostprocPageBreaks {
fn metadata(&self) -> &super::AdapterMeta {
lazy_static::lazy_static! {
static ref METADATA: AdapterMeta = AdapterMeta {
name: "postprocpagebreaks".to_owned(),
version: 1,
description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character".to_owned(),
recurses: false,
fast_matchers: vec![FastFileMatcher::FileExtension("asciipagebreaks".to_string())],
2022-12-25 16:37:31 +00:00
slow_matchers: None,
keep_fast_matchers_if_accurate: false,
disabled_by_default: false
};
}
&METADATA
}
}
impl FileAdapter for PostprocPageBreaks {
fn adapt<'a>(
&self,
a: super::AdaptInfo,
_detection_reason: &crate::matching::FileMatcher,
) -> Result<AdaptedFilesIterBox> {
2022-12-25 17:44:52 +00:00
let read = postproc_pagebreaks("", postproc_encoding(&a.line_prefix, a.inp)?);
2022-12-25 16:37:31 +00:00
// keep adapt info (filename etc) except replace inp
let ai = AdaptInfo {
inp: Box::pin(read),
2022-12-25 17:44:52 +00:00
postprocess: true,
2022-12-25 17:27:50 +00:00
archive_recursion_depth: a.archive_recursion_depth + 1,
filepath_hint: a
.filepath_hint
.parent()
.map(PathBuf::from)
.unwrap_or(PathBuf::new())
.join(a.filepath_hint.file_stem().unwrap_or(OsStr::new(""))),
2022-12-25 16:37:31 +00:00
..a
};
Ok(Box::pin(tokio_stream::once(ai)))
}
}
2022-12-22 21:15:33 +00:00
/// Adds the prefix "Page N:" to each line,
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
/// ASCII form feeds are the page delimiters output by `pdftotext`.
pub fn postproc_pagebreaks(
line_prefix: &str,
input: impl AsyncRead + Send,
) -> impl AsyncRead + Send {
let line_prefix_o: String = line_prefix.into();
let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap();
let regex_newline = regex::bytes::Regex::new("\n").unwrap();
let mut page_count: i32 = 1;
let mut page_prefix: String = format!("Page {}:{}", page_count, line_prefix_o);
let input_stream = ReaderStream::new(input);
let output_stream = stream! {
for await chunk in input_stream {
2022-12-22 21:15:33 +00:00
match chunk {
Err(e) => yield Err(e),
Ok(chunk) => {
let sub_chunks = regex_linefeed.split(&chunk);
for sub_chunk in sub_chunks {
// println!("{}", String::from_utf8_lossy(page_prefix.as_bytes()));
yield Ok(Bytes::copy_from_slice(page_prefix.as_bytes()));
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(&sub_chunk, page_prefix.as_bytes())));
page_count += 1;
page_prefix = format!("\nPage {}:{}", page_count, line_prefix_o);
2022-12-22 21:15:33 +00:00
}
}
2020-06-11 12:38:50 +00:00
}
2022-12-22 21:15:33 +00:00
}
};
Box::pin(StreamReader::new(output_stream))
2020-06-11 12:38:50 +00:00
}
#[cfg(test)]
mod tests {
2021-08-26 12:54:42 +00:00
use super::*;
2020-06-11 12:38:50 +00:00
use anyhow::Result;
use tokio::pin;
use tokio_test::io::Builder;
use tokio_test::io::Mock;
#[tokio::test]
async fn test_with_pagebreaks() {
let mut output: Vec<u8> = Vec::new();
let mock: Mock = Builder::new()
.read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest")
.build();
let res = postproc_pagebreaks("", mock).read_to_end(&mut output).await;
println!("{}", String::from_utf8_lossy(&output));
assert!(matches!(res, Ok(_)));
assert_eq!(
output,
b"Page 1:Hello\nPage 1:World\nPage 2:Foo Bar\nPage 2:\nPage 3:Test"
);
}
2020-06-11 12:38:50 +00:00
#[tokio::test]
async fn test_postproc_prefix() {
let mut output: Vec<u8> = Vec::new();
let mock: Mock = Builder::new().read(b"Hello\nWorld").build();
let res = postproc_prefix("prefix: ", mock)
.read_to_end(&mut output)
.await;
println!("{}", String::from_utf8_lossy(&output));
assert!(matches!(res, Ok(_)));
assert_eq!(output, b"prefix: Hello\nprefix: World");
}
2022-12-05 03:30:56 +00:00
async fn test_from_strs(
pagebreaks: bool,
line_prefix: &str,
a: &'static str,
b: &str,
) -> Result<()> {
2022-11-12 23:31:25 +00:00
test_from_bytes(pagebreaks, line_prefix, a.as_bytes(), b).await
2021-08-26 12:54:42 +00:00
}
2022-12-05 03:30:56 +00:00
async fn test_from_bytes(
pagebreaks: bool,
line_prefix: &str,
a: &'static [u8],
b: &str,
) -> Result<()> {
2020-06-11 12:38:50 +00:00
let mut oup = Vec::new();
2021-08-26 12:54:42 +00:00
let inp = postproc_encoding("", a)?;
if pagebreaks {
2022-12-05 03:30:56 +00:00
postproc_pagebreaks(line_prefix, inp)
.read_to_end(&mut oup)
.await?;
2021-08-26 12:54:42 +00:00
} else {
2022-11-12 23:31:25 +00:00
let x = postproc_prefix(line_prefix, inp);
pin!(x);
x.read_to_end(&mut oup).await?;
2021-08-26 12:54:42 +00:00
}
2020-06-11 12:38:50 +00:00
let c = String::from_utf8_lossy(&oup);
if b != c {
2021-08-26 12:54:42 +00:00
anyhow::bail!(
"`{}`\nshould be\n`{}`\nbut is\n`{}`",
String::from_utf8_lossy(&a),
b,
c
);
2020-06-11 12:38:50 +00:00
}
Ok(())
}
2022-11-12 23:31:25 +00:00
#[tokio::test]
async fn post1() -> Result<()> {
2020-06-11 12:38:50 +00:00
let inp = "What is this\nThis is a test\nFoo";
2020-09-30 14:22:54 +00:00
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo";
2020-06-11 12:38:50 +00:00
2022-11-12 23:31:25 +00:00
test_from_strs(true, "", inp, oup).await?;
2020-06-11 12:38:50 +00:00
println!("\n\n\n\n");
let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
2020-09-30 14:22:54 +00:00
let oup = "Page 1:What is this\nPage 1:This is a test\nPage 1:Foo\nPage 2:\nPage 2:Helloooo\nPage 2:How are you?\nPage 3:\nPage 3:Great!";
2020-06-11 12:38:50 +00:00
2022-11-12 23:31:25 +00:00
test_from_strs(true, "", inp, oup).await?;
2021-08-26 12:54:42 +00:00
let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!";
let oup = "foo.pdf:What is this\nfoo.pdf:This is a test\nfoo.pdf:Foo\x0c\nfoo.pdf:Helloooo\nfoo.pdf:How are you?\x0c\nfoo.pdf:Great!";
2022-11-12 23:31:25 +00:00
test_from_strs(false, "foo.pdf:", inp, oup).await?;
2021-08-26 12:54:42 +00:00
2022-04-18 20:44:01 +00:00
test_from_strs(
false,
"foo:",
"this is a test \n\n \0 foo",
"foo:[rga: binary data]",
2022-12-05 03:30:56 +00:00
)
.await?;
2022-11-12 23:31:25 +00:00
test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?;
2020-06-11 12:38:50 +00:00
Ok(())
}
2021-08-26 12:54:42 +00:00
/*#[test]
fn chardet() -> Result<()> {
let mut d = chardetng::EncodingDetector::new();
let mut v = Vec::new();
std::fs::File::open("/home/phire/passwords-2018.kdbx.old").unwrap().read_to_end(&mut v).unwrap();
d.feed(&v, false);
println!("foo {:?}", d.guess(None, true));
Ok(())
}*/
2020-06-11 12:38:50 +00:00
}