2020-06-11 12:38:50 +00:00
//trait RunFnAdapter: GetMetadata {}
//impl<T> FileAdapter for T where T: RunFnAdapter {}
2021-08-26 12:54:42 +00:00
use anyhow ::Context ;
2020-06-11 12:38:50 +00:00
use anyhow ::Result ;
2022-12-05 03:30:56 +00:00
use async_stream ::stream ;
2022-11-12 23:31:25 +00:00
use bytes ::Bytes ;
2021-08-26 12:54:42 +00:00
use encoding_rs_io ::DecodeReaderBytesBuilder ;
2022-12-05 03:30:56 +00:00
use std ::cmp ::min ;
2022-12-25 17:27:50 +00:00
use std ::ffi ::OsStr ;
2022-12-05 03:30:56 +00:00
use std ::io ::Cursor ;
2022-12-25 17:27:50 +00:00
use std ::path ::PathBuf ;
2022-12-05 03:30:56 +00:00
use std ::pin ::Pin ;
2022-11-12 23:31:25 +00:00
use tokio ::io ::{ AsyncRead , AsyncReadExt } ;
use tokio_util ::io ::ReaderStream ;
use tokio_util ::io ::StreamReader ;
2020-09-30 14:22:54 +00:00
2022-12-05 03:30:56 +00:00
use crate ::adapted_iter ::AdaptedFilesIterBox ;
2022-12-25 16:37:31 +00:00
use crate ::matching ::FastFileMatcher ;
2020-09-30 14:49:51 +00:00
use super ::{ AdaptInfo , AdapterMeta , FileAdapter , GetMetadata } ;
2020-06-11 12:38:50 +00:00
2022-11-12 23:31:25 +00:00
fn add_newline ( ar : impl AsyncRead + Send ) -> impl AsyncRead + Send {
ar . chain ( Cursor ::new ( & [ b '\n' ] ) )
2020-06-11 12:38:50 +00:00
}
2020-09-30 14:22:54 +00:00
pub struct PostprocPrefix { }
impl GetMetadata for PostprocPrefix {
fn metadata ( & self ) -> & super ::AdapterMeta {
lazy_static ::lazy_static! {
static ref METADATA : AdapterMeta = AdapterMeta {
name : " postprocprefix " . to_owned ( ) ,
version : 1 ,
2021-08-26 12:54:42 +00:00
description : " Adds the line prefix to each line (e.g. the filename within a zip) " . to_owned ( ) ,
2022-12-25 16:37:31 +00:00
recurses : false ,
2020-09-30 14:22:54 +00:00
fast_matchers : vec ! [ ] ,
slow_matchers : None ,
keep_fast_matchers_if_accurate : false ,
disabled_by_default : false
} ;
}
& METADATA
}
}
impl FileAdapter for PostprocPrefix {
fn adapt < ' a > (
& self ,
2022-11-12 23:31:25 +00:00
a : super ::AdaptInfo ,
2020-09-30 14:22:54 +00:00
_detection_reason : & crate ::matching ::FileMatcher ,
2022-11-12 23:31:25 +00:00
) -> Result < AdaptedFilesIterBox > {
let read = add_newline ( postproc_prefix (
2021-08-26 12:54:42 +00:00
& a . line_prefix ,
postproc_encoding ( & a . line_prefix , a . inp ) ? ,
) ) ;
2020-09-30 14:22:54 +00:00
// keep adapt info (filename etc) except replace inp
let ai = AdaptInfo {
2022-11-12 23:31:25 +00:00
inp : Box ::pin ( read ) ,
2020-09-30 14:22:54 +00:00
postprocess : false ,
.. a
} ;
2022-11-12 23:31:25 +00:00
Ok ( Box ::pin ( tokio_stream ::once ( ai ) ) )
2020-09-30 14:22:54 +00:00
}
}
2021-08-26 12:54:42 +00:00
/* struct ReadErr {
err : Fn ( ) -> std ::io ::Error ,
}
impl Read for ReadErr {
fn read ( & mut self , buf : & mut [ u8 ] ) -> std ::io ::Result < usize > {
Err ( self . err ( ) )
}
} * /
2022-11-28 09:46:58 +00:00
/**
* Detects and converts encodings other than utf - 8 to utf - 8.
* If the input stream does not contain valid text , returns the string ` [ rga : binary data ] ` instead
* /
2022-11-12 23:31:25 +00:00
pub fn postproc_encoding (
2022-04-18 20:44:01 +00:00
line_prefix : & str ,
2022-11-12 23:31:25 +00:00
inp : impl AsyncRead + Send + 'static ,
) -> Result < Pin < Box < dyn AsyncRead + Send > > > {
Ok ( Box ::pin ( inp ) )
// panic!("todo: implement");
/* / / TODO: parse these options from ripgrep's configuration
2021-08-26 12:54:42 +00:00
let encoding = None ; // detect bom but usually assume utf8
let bom_sniffing = true ;
let mut decode_builder = DecodeReaderBytesBuilder ::new ( ) ;
// https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706
// this detects utf-16 BOMs and transcodes to utf-8 if they are present
// it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?)
let inp = decode_builder
. encoding ( encoding )
. utf8_passthru ( true )
. strip_bom ( bom_sniffing )
. bom_override ( true )
. bom_sniffing ( bom_sniffing )
. build ( inp ) ;
// check for binary content in first 8kB
// read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file
let mut fourk = Vec ::with_capacity ( 1 < < 13 ) ;
let mut beginning = inp . take ( 1 < < 13 ) ;
beginning . read_to_end ( & mut fourk ) ? ;
if fourk . contains ( & 0 u8 ) {
log ::debug! ( " detected binary " ) ;
let v = " [rga: binary data] " ;
2022-04-18 20:44:01 +00:00
return Ok ( Box ::new ( std ::io ::Cursor ::new ( v ) ) ) ;
2021-08-26 12:54:42 +00:00
/* let err = std::io::Error::new(
std ::io ::ErrorKind ::InvalidData ,
format! ( " {} [rga: binary data] " , line_prefix ) ,
) ;
return Err ( err ) . context ( " " ) ;
return ReadErr {
err ,
} ; * /
}
Ok ( Box ::new (
std ::io ::Cursor ::new ( fourk ) . chain ( beginning . into_inner ( ) ) ,
2022-11-12 23:31:25 +00:00
) ) * /
2021-08-26 12:54:42 +00:00
}
2022-12-22 21:15:33 +00:00
/// Adds the given prefix to each line in an `AsyncRead`.
2022-11-12 23:31:25 +00:00
pub fn postproc_prefix ( line_prefix : & str , inp : impl AsyncRead + Send ) -> impl AsyncRead + Send {
let line_prefix_n = format! ( " \n {} " , line_prefix ) ; // clone since we need it later
let line_prefix_o = Bytes ::copy_from_slice ( line_prefix . as_bytes ( ) ) ;
let regex = regex ::bytes ::Regex ::new ( " \n " ) . unwrap ( ) ;
2022-12-22 21:15:33 +00:00
let inp_stream = ReaderStream ::new ( inp ) ;
2022-11-12 23:31:25 +00:00
let oup_stream = stream! {
yield Ok ( line_prefix_o ) ;
for await chunk in inp_stream {
match chunk {
Err ( e ) = > yield Err ( e ) ,
Ok ( chunk ) = > {
if chunk . contains ( & b '\n' ) {
yield Ok ( Bytes ::copy_from_slice ( & regex . replace_all ( & chunk , line_prefix_n . as_bytes ( ) ) ) ) ;
} else {
yield Ok ( chunk ) ;
}
}
}
}
} ;
2022-12-24 21:44:13 +00:00
Box ::pin ( StreamReader ::new ( oup_stream ) )
2020-09-30 14:22:54 +00:00
}
2022-12-25 16:37:31 +00:00
pub struct PostprocPageBreaks { }
2022-12-25 17:27:50 +00:00
impl PostprocPageBreaks {
pub fn new ( ) -> Self {
Self { }
}
}
2022-12-25 16:37:31 +00:00
impl GetMetadata for PostprocPageBreaks {
fn metadata ( & self ) -> & super ::AdapterMeta {
lazy_static ::lazy_static! {
static ref METADATA : AdapterMeta = AdapterMeta {
name : " postprocpagebreaks " . to_owned ( ) ,
version : 1 ,
description : " Adds the page number to each line for an input file that specifies page breaks as ascii page break character " . to_owned ( ) ,
recurses : false ,
2022-12-25 17:05:58 +00:00
fast_matchers : vec ! [ FastFileMatcher ::FileExtension ( " asciipagebreaks " . to_string ( ) ) ] ,
2022-12-25 16:37:31 +00:00
slow_matchers : None ,
keep_fast_matchers_if_accurate : false ,
disabled_by_default : false
} ;
}
& METADATA
}
}
impl FileAdapter for PostprocPageBreaks {
fn adapt < ' a > (
& self ,
a : super ::AdaptInfo ,
_detection_reason : & crate ::matching ::FileMatcher ,
) -> Result < AdaptedFilesIterBox > {
2022-12-25 17:44:52 +00:00
let read = postproc_pagebreaks ( " " , postproc_encoding ( & a . line_prefix , a . inp ) ? ) ;
2022-12-25 16:37:31 +00:00
// keep adapt info (filename etc) except replace inp
let ai = AdaptInfo {
inp : Box ::pin ( read ) ,
2022-12-25 17:44:52 +00:00
postprocess : true ,
2022-12-25 17:27:50 +00:00
archive_recursion_depth : a . archive_recursion_depth + 1 ,
filepath_hint : a
. filepath_hint
. parent ( )
. map ( PathBuf ::from )
. unwrap_or ( PathBuf ::new ( ) )
. join ( a . filepath_hint . file_stem ( ) . unwrap_or ( OsStr ::new ( " " ) ) ) ,
2022-12-25 16:37:31 +00:00
.. a
} ;
Ok ( Box ::pin ( tokio_stream ::once ( ai ) ) )
}
}
2022-12-22 21:15:33 +00:00
/// Adds the prefix "Page N:" to each line,
/// where N starts at one and is incremented for each ASCII Form Feed character in the input stream.
/// ASCII form feeds are the page delimiters output by `pdftotext`.
2022-12-25 03:56:57 +00:00
pub fn postproc_pagebreaks (
line_prefix : & str ,
input : impl AsyncRead + Send ,
) -> impl AsyncRead + Send {
let line_prefix_o : String = line_prefix . into ( ) ;
2022-12-24 00:35:37 +00:00
let regex_linefeed = regex ::bytes ::Regex ::new ( r "\x0c" ) . unwrap ( ) ;
let regex_newline = regex ::bytes ::Regex ::new ( " \n " ) . unwrap ( ) ;
let mut page_count : i32 = 1 ;
2022-12-25 03:56:57 +00:00
let mut page_prefix : String = format! ( " Page {} : {} " , page_count , line_prefix_o ) ;
2022-12-24 00:35:37 +00:00
let input_stream = ReaderStream ::new ( input ) ;
let output_stream = stream! {
for await chunk in input_stream {
2022-12-22 21:15:33 +00:00
match chunk {
Err ( e ) = > yield Err ( e ) ,
Ok ( chunk ) = > {
2022-12-24 00:35:37 +00:00
let sub_chunks = regex_linefeed . split ( & chunk ) ;
for sub_chunk in sub_chunks {
// println!("{}", String::from_utf8_lossy(page_prefix.as_bytes()));
yield Ok ( Bytes ::copy_from_slice ( page_prefix . as_bytes ( ) ) ) ;
2022-12-25 03:56:57 +00:00
page_prefix = format! ( " \n Page {} : {} " , page_count , line_prefix_o ) ;
2022-12-24 00:35:37 +00:00
yield Ok ( Bytes ::copy_from_slice ( & regex_newline . replace_all ( & sub_chunk , page_prefix . as_bytes ( ) ) ) ) ;
page_count + = 1 ;
2022-12-25 03:56:57 +00:00
page_prefix = format! ( " \n Page {} : {} " , page_count , line_prefix_o ) ;
2022-12-22 21:15:33 +00:00
}
}
2020-06-11 12:38:50 +00:00
}
2022-12-22 21:15:33 +00:00
}
} ;
2022-12-24 00:35:37 +00:00
Box ::pin ( StreamReader ::new ( output_stream ) )
2020-06-11 12:38:50 +00:00
}
#[ cfg(test) ]
mod tests {
2021-08-26 12:54:42 +00:00
use super ::* ;
2020-06-11 12:38:50 +00:00
use anyhow ::Result ;
2022-12-25 17:05:58 +00:00
use tokio ::pin ;
2022-12-24 00:35:37 +00:00
use tokio_test ::io ::Builder ;
use tokio_test ::io ::Mock ;
#[ tokio::test ]
async fn test_with_pagebreaks ( ) {
let mut output : Vec < u8 > = Vec ::new ( ) ;
let mock : Mock = Builder ::new ( )
. read ( b " Hello \n World \x0c Foo Bar \n \x0c Test " )
. build ( ) ;
2022-12-25 03:56:57 +00:00
let res = postproc_pagebreaks ( " " , mock ) . read_to_end ( & mut output ) . await ;
2022-12-24 00:35:37 +00:00
println! ( " {} " , String ::from_utf8_lossy ( & output ) ) ;
assert! ( matches! ( res , Ok ( _ ) ) ) ;
assert_eq! (
output ,
2022-12-24 21:41:16 +00:00
b " Page 1:Hello \n Page 1:World \n Page 2:Foo Bar \n Page 2: \n Page 3:Test "
2022-12-24 00:35:37 +00:00
) ;
}
2020-06-11 12:38:50 +00:00
2022-12-24 21:44:13 +00:00
#[ tokio::test ]
async fn test_postproc_prefix ( ) {
let mut output : Vec < u8 > = Vec ::new ( ) ;
let mock : Mock = Builder ::new ( ) . read ( b " Hello \n World " ) . build ( ) ;
let res = postproc_prefix ( " prefix: " , mock )
. read_to_end ( & mut output )
. await ;
println! ( " {} " , String ::from_utf8_lossy ( & output ) ) ;
assert! ( matches! ( res , Ok ( _ ) ) ) ;
assert_eq! ( output , b " prefix: Hello \n prefix: World " ) ;
}
2022-12-05 03:30:56 +00:00
async fn test_from_strs (
pagebreaks : bool ,
line_prefix : & str ,
a : & 'static str ,
b : & str ,
) -> Result < ( ) > {
2022-11-12 23:31:25 +00:00
test_from_bytes ( pagebreaks , line_prefix , a . as_bytes ( ) , b ) . await
2021-08-26 12:54:42 +00:00
}
2022-12-05 03:30:56 +00:00
async fn test_from_bytes (
pagebreaks : bool ,
line_prefix : & str ,
a : & 'static [ u8 ] ,
b : & str ,
) -> Result < ( ) > {
2020-06-11 12:38:50 +00:00
let mut oup = Vec ::new ( ) ;
2021-08-26 12:54:42 +00:00
let inp = postproc_encoding ( " " , a ) ? ;
if pagebreaks {
2022-12-05 03:30:56 +00:00
postproc_pagebreaks ( line_prefix , inp )
. read_to_end ( & mut oup )
. await ? ;
2021-08-26 12:54:42 +00:00
} else {
2022-11-12 23:31:25 +00:00
let x = postproc_prefix ( line_prefix , inp ) ;
pin! ( x ) ;
x . read_to_end ( & mut oup ) . await ? ;
2021-08-26 12:54:42 +00:00
}
2020-06-11 12:38:50 +00:00
let c = String ::from_utf8_lossy ( & oup ) ;
if b ! = c {
2021-08-26 12:54:42 +00:00
anyhow ::bail! (
" `{}` \n should be \n `{}` \n but is \n `{}` " ,
String ::from_utf8_lossy ( & a ) ,
b ,
c
) ;
2020-06-11 12:38:50 +00:00
}
Ok ( ( ) )
}
2022-11-12 23:31:25 +00:00
#[ tokio::test ]
async fn post1 ( ) -> Result < ( ) > {
2020-06-11 12:38:50 +00:00
let inp = " What is this \n This is a test \n Foo " ;
2020-09-30 14:22:54 +00:00
let oup = " Page 1:What is this \n Page 1:This is a test \n Page 1:Foo " ;
2020-06-11 12:38:50 +00:00
2022-11-12 23:31:25 +00:00
test_from_strs ( true , " " , inp , oup ) . await ? ;
2020-06-11 12:38:50 +00:00
println! ( " \n \n \n \n " ) ;
let inp = " What is this \n This is a test \n Foo \x0c \n Helloooo \n How are you? \x0c \n Great! " ;
2020-09-30 14:22:54 +00:00
let oup = " Page 1:What is this \n Page 1:This is a test \n Page 1:Foo \n Page 2: \n Page 2:Helloooo \n Page 2:How are you? \n Page 3: \n Page 3:Great! " ;
2020-06-11 12:38:50 +00:00
2022-11-12 23:31:25 +00:00
test_from_strs ( true , " " , inp , oup ) . await ? ;
2021-08-26 12:54:42 +00:00
let inp = " What is this \n This is a test \n Foo \x0c \n Helloooo \n How are you? \x0c \n Great! " ;
let oup = " foo.pdf:What is this \n foo.pdf:This is a test \n foo.pdf:Foo \x0c \n foo.pdf:Helloooo \n foo.pdf:How are you? \x0c \n foo.pdf:Great! " ;
2022-11-12 23:31:25 +00:00
test_from_strs ( false , " foo.pdf: " , inp , oup ) . await ? ;
2021-08-26 12:54:42 +00:00
2022-04-18 20:44:01 +00:00
test_from_strs (
false ,
" foo: " ,
" this is a test \n \n \0 foo " ,
" foo:[rga: binary data] " ,
2022-12-05 03:30:56 +00:00
)
. await ? ;
2022-11-12 23:31:25 +00:00
test_from_strs ( false , " foo: " , " \0 " , " foo:[rga: binary data] " ) . await ? ;
2020-06-11 12:38:50 +00:00
Ok ( ( ) )
}
2021-08-26 12:54:42 +00:00
/* #[test]
fn chardet ( ) -> Result < ( ) > {
let mut d = chardetng ::EncodingDetector ::new ( ) ;
let mut v = Vec ::new ( ) ;
std ::fs ::File ::open ( " /home/phire/passwords-2018.kdbx.old " ) . unwrap ( ) . read_to_end ( & mut v ) . unwrap ( ) ;
d . feed ( & v , false ) ;
println! ( " foo {:?} " , d . guess ( None , true ) ) ;
Ok ( ( ) )
} * /
2020-06-11 12:38:50 +00:00
}