@ -4,12 +4,10 @@ use anyhow::Result;
use async_stream ::stream ;
use async_stream ::stream ;
use lazy_static ::lazy_static ;
use lazy_static ::lazy_static ;
use mime2ext ::mime2ext ;
use mime2ext ::mime2ext ;
use regex ::bytes ::Regex ;
use tokio ::io ::AsyncReadExt ;
use tokio ::io ::AsyncReadExt ;
use std ::{
use std ::{ collections ::VecDeque , io ::Cursor } ;
collections ::VecDeque ,
io ::Cursor ,
} ;
/// File-name extensions handled by the mail adapter; the adapter metadata
/// iterates over this list to build its `fast_matchers`.
// NOTE(review): the declaration shows up twice in this view, apparently the
// old and new side of an unchanged diff line; a compilable file keeps one.
static EXTENSIONS : & [ & str ] = & [ "mbox" , "mbx" , "eml" ] ;
static EXTENSIONS : & [ & str ] = & [ "mbox" , "mbx" , "eml" ] ;
/// MIME type strings associated with mailbox and single-message files.
// NOTE(review): the use site is not visible here; presumably these feed the
// adapter's slow (content-based) matchers, verify against the metadata block.
// The duplicated declaration looks like diff residue (old/new side of one line).
static MIME_TYPES : & [ & str ] = & [ "application/mbox" , "message/rfc822" ] ;
static MIME_TYPES : & [ & str ] = & [ "application/mbox" , "message/rfc822" ] ;
@ -17,8 +15,9 @@ lazy_static! {
static ref METADATA : AdapterMeta = AdapterMeta {
static ref METADATA : AdapterMeta = AdapterMeta {
name : "mail" . to_owned ( ) ,
name : "mail" . to_owned ( ) ,
version : 1 ,
version : 1 ,
description : "Reads mailbox/mail files and runs extractors on the contents and attachments."
description :
. to_owned ( ) ,
"Reads mailbox/mail files and runs extractors on the contents and attachments."
. to_owned ( ) ,
recurses : true ,
recurses : true ,
fast_matchers : EXTENSIONS
fast_matchers : EXTENSIONS
. iter ( )
. iter ( )
@ -33,6 +32,7 @@ lazy_static! {
disabled_by_default : true ,
disabled_by_default : true ,
keep_fast_matchers_if_accurate : true
keep_fast_matchers_if_accurate : true
} ;
} ;
/// Byte-level pattern for a "From " separator line: an optional CR, a LF, the
/// literal `From `, one or more non-newline bytes, and the terminating LF.
/// The adapter splits the whole input buffer on it to get one piece per mail.
// The leading LF is required, so a "From " line at the very start of the input
// is not matched and stays at the head of the first piece.
// `unwrap` can only fail if this pattern literal itself does not compile.
// NOTE(review): a match consumes the entire separator line, so every piece
// after the first presumably begins at the mail's first header, yet the split
// loop still discards the first line of each piece; confirm that is intended.
static ref FROM_REGEX : Regex = Regex ::new ( "\r?\nFrom [^\n]+\n" ) . unwrap ( ) ;
}
}
/// Field-less adapter type for mailbox/mail files; the file implements
/// `FileAdapter` for it, and `Default` is derived so it can be built without
/// arguments.
// NOTE(review): attribute and declaration each appear twice in this view,
// apparently the old and new side of unchanged diff lines; keep a single copy.
#[ derive(Default) ]
#[ derive(Default) ]
pub struct MboxAdapter ;
pub struct MboxAdapter ;
@ -65,16 +65,18 @@ impl FileAdapter for MboxAdapter {
..
..
} = ai ;
} = ai ;
let mut content = String ::new ( ) ;
let mut content = Vec ::new ( ) ;
let s = stream ! {
let s = stream ! {
inp . read_to_ string ( & mut content ) . await ? ;
inp . read_to_ end ( & mut content ) . await ? ;
let mut ais = vec! [ ] ;
let mut ais = vec! [ ] ;
for mail in content . split ( "\nFrom " ) {
for mail_bytes in FROM_REGEX . splitn ( & content , usize ::MAX ) {
let mail_bytes = mail . as_bytes ( ) ; // &content[offset..offset2];
let mail_content = mail_bytes . splitn ( 2 , | x | * x = = b'\n' ) . skip ( 1 ) . next ( ) . unwrap ( ) ;
let mail_content = mail_bytes . splitn ( 2 , | x | * x = = b'\n' ) . skip ( 1 ) . next ( ) . unwrap ( ) ;
let mail = mailparse ::parse_mail ( mail_content ) ? ;
let mail = mailparse ::parse_mail ( mail_content ) ;
if mail . is_err ( ) {
continue ;
}
let mail = mail . unwrap ( ) ;
let mut todos = VecDeque ::new ( ) ;
let mut todos = VecDeque ::new ( ) ;
todos . push_back ( mail ) ;
todos . push_back ( mail ) ;
@ -101,11 +103,15 @@ impl FileAdapter for MboxAdapter {
let mut config = config . clone ( ) ;
let mut config = config . clone ( ) ;
config . accurate = true ;
config . accurate = true ;
let raw_body = mail . get_body_raw ( ) ;
if raw_body . is_err ( ) {
continue ;
}
let ai2 : AdaptInfo = AdaptInfo {
let ai2 : AdaptInfo = AdaptInfo {
filepath_hint : path ,
filepath_hint : path ,
is_real_file : false ,
is_real_file : false ,
archive_recursion_depth : archive_recursion_depth + 1 ,
archive_recursion_depth : archive_recursion_depth + 1 ,
inp : Box ::pin ( Cursor ::new ( mail. get_body_raw ( ) ? ) ) ,
inp : Box ::pin ( Cursor ::new ( raw_body. unwrap ( ) ) ) ,
line_prefix : line_prefix . to_string ( ) ,
line_prefix : line_prefix . to_string ( ) ,
config : config ,
config : config ,
postprocess ,
postprocess ,
@ -143,10 +149,18 @@ mod tests {
let mut file = file ? ;
let mut file = file ? ;
let mut buf = Vec ::new ( ) ;
let mut buf = Vec ::new ( ) ;
file . inp . read_to_end ( & mut buf ) . await ? ;
file . inp . read_to_end ( & mut buf ) . await ? ;
match file . filepath_hint . components ( ) . last ( ) . unwrap ( ) . as_os_str ( ) . to_str ( ) . unwrap ( ) {
match file
. filepath_hint
. components ( )
. last ( )
. unwrap ( )
. as_os_str ( )
. to_str ( )
. unwrap ( )
{
"data.txt" | "data.html" = > {
"data.txt" | "data.html" = > {
assert! ( String ::from_utf8 ( buf ) ? . contains ( "Thank you for your contribution" ) ) ;
assert! ( String ::from_utf8 ( buf ) ? . contains ( "Thank you for your contribution" ) ) ;
} ,
}
x = > panic! ( "unexpected filename {x:?}" ) ,
x = > panic! ( "unexpected filename {x:?}" ) ,
}
}
count + = 1 ;
count + = 1 ;
@ -181,6 +195,8 @@ mod tests {
#[ tokio::test ]
#[ tokio::test ]
async fn mbox_attachment ( ) -> Result < ( ) > {
async fn mbox_attachment ( ) -> Result < ( ) > {
init_logging ( ) ;
let adapter = MboxAdapter ;
let adapter = MboxAdapter ;
let filepath = test_data_dir ( ) . join ( "mail_with_attachment.mbox" ) ;
let filepath = test_data_dir ( ) . join ( "mail_with_attachment.mbox" ) ;
@ -202,10 +218,13 @@ mod tests {
file . inp . read_to_end ( & mut buf ) . await ? ;
file . inp . read_to_end ( & mut buf ) . await ? ;
match path {
match path {
"data.html.txt" = > {
"data.html.txt" = > {
assert_eq! ( "PREFIX:regular text\nPREFIX:\n" , String ::from_utf8 ( buf ) ? ) ;
assert_eq! (
"PREFIX:regular text\nPREFIX:\n" ,
String ::from_utf8 ( buf ) . unwrap_or ( "err" . to_owned ( ) )
) ;
}
}
"short.pdf.txt" = > {
"short.pdf.txt" = > {
assert_eq! ( "PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n" , String ::from_utf8 ( buf ) ? ) ;
assert_eq! ( "PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n" , String ::from_utf8 ( buf ) . unwrap_or ( "err" . to_owned ( ) ) ) ;
}
}
_ = > {
_ = > {
panic! ( "unrelated {path:?}" ) ;
panic! ( "unrelated {path:?}" ) ;