use super ::* ;
use anyhow ::Result ;
use async_stream ::stream ;
use lazy_static ::lazy_static ;
use mime2ext ::mime2ext ;
use tokio ::io ::AsyncReadExt ;
use std ::{
collections ::VecDeque ,
io ::Cursor ,
} ;
/// File extensions used to build the adapter's fast (extension-based) matchers.
static EXTENSIONS: &[&str] = &[
    "mbox",
    "mbx",
    "eml",
];

/// MIME types used to build the adapter's slow (content-based) matchers.
static MIME_TYPES: &[&str] = &[
    "application/mbox",
    "message/rfc822",
];
lazy_static! {
    /// Lazily built description of the mail adapter: its name, version, and
    /// the extension / MIME-type matchers derived from `EXTENSIONS` and
    /// `MIME_TYPES`.
    static ref METADATA: AdapterMeta = {
        // One fast matcher per known file extension.
        let fast_matchers = EXTENSIONS
            .iter()
            .map(|&ext| FastFileMatcher::FileExtension(String::from(ext)))
            .collect();
        // One slow matcher per known MIME type.
        let slow_matchers = MIME_TYPES
            .iter()
            .map(|&mime| FileMatcher::MimeType(String::from(mime)))
            .collect();

        AdapterMeta {
            name: String::from("mail"),
            version: 1,
            description: String::from(
                "Reads mailbox/mail files and runs extractors on the contents and attachments.",
            ),
            recurses: true,
            fast_matchers,
            slow_matchers: Some(slow_matchers),
            disabled_by_default: true,
            keep_fast_matchers_if_accurate: true,
        }
    };
}
/// Adapter for mailbox (`mbox`/`mbx`) and single-mail (`eml`) files.
///
/// The type carries no state; all behaviour lives in its trait impls.
#[derive(Default)]
pub struct MboxAdapter;

impl MboxAdapter {
    /// Creates a new adapter instance.
    pub fn new() -> Self {
        Self
    }
}
impl GetMetadata for MboxAdapter {
    /// Returns the shared, lazily initialised metadata of the mail adapter.
    fn metadata(&self) -> &AdapterMeta {
        let meta: &AdapterMeta = &METADATA;
        meta
    }
}
#[async_trait]
impl FileAdapter for MboxAdapter {
    /// Splits a mailbox (or single mail) into its leaf MIME parts and emits one
    /// `AdaptInfo` per part.
    ///
    /// The whole input is read into memory as UTF-8 text and split on the mbox
    /// separator `"\nFrom "`. For every resulting chunk the envelope line (the
    /// `From ...` line) is dropped and the rest is parsed with
    /// `mailparse::parse_mail`. `multipart/*` parts are expanded breadth-first;
    /// every other part becomes one output entry whose path hint is the input
    /// path plus either the attachment's file name, `data.<ext>` derived from
    /// the MIME type, or plain `data`.
    ///
    /// Differences from a naive split:
    /// * If the input does not start with `"From "` (e.g. a bare `.eml` file),
    ///   the first chunk has no envelope line, so its first line is kept.
    /// * Chunks with no content after the envelope line (including an empty
    ///   input) are skipped instead of panicking.
    /// * Only the final path component of an attachment's `filename` is used,
    ///   so an absolute or `..`-laden name cannot replace or escape the path
    ///   hint of the containing file.
    ///
    /// # Errors
    /// Failures from reading the input, from `mailparse::parse_mail` and from
    /// `get_body_raw` are propagated with `?` from inside the stream.
    async fn adapt(
        &self,
        ai: AdaptInfo,
        _detection_reason: &FileMatcher,
    ) -> Result<AdaptedFilesIterBox> {
        let AdaptInfo {
            filepath_hint,
            mut inp,
            line_prefix,
            archive_recursion_depth,
            config,
            postprocess,
            ..
        } = ai;

        let mut content = String::new();
        let s = stream! {
            inp.read_to_string(&mut content).await?;

            // `split("\nFrom ")` strips the separator from every chunk but the
            // first, so the first chunk only carries an envelope line when the
            // input itself begins with one.
            let starts_with_envelope = content.starts_with("From ");

            let mut ais = vec![];
            for (index, chunk) in content.split("\nFrom ").enumerate() {
                let mail_content = if index > 0 || starts_with_envelope {
                    // Drop the (rest of the) envelope line; a chunk without a
                    // newline has no message after it.
                    match chunk.split_once('\n') {
                        Some((_envelope, rest)) => rest,
                        None => continue,
                    }
                } else {
                    chunk
                };
                if mail_content.trim().is_empty() {
                    continue;
                }

                let mail = mailparse::parse_mail(mail_content.as_bytes())?;
                let mut todos = VecDeque::new();
                todos.push_back(mail);
                while let Some(mail) = todos.pop_front() {
                    let mut path = filepath_hint.clone();
                    let filename = mail
                        .get_content_disposition()
                        .params
                        .get("filename")
                        .cloned();
                    match &*mail.ctype.mimetype {
                        x if x.starts_with("multipart/") => {
                            // Containers have no content of their own; queue
                            // their children instead.
                            todos.extend(mail.subparts);
                            continue;
                        }
                        mime => {
                            // The file name comes from the mail itself, so keep
                            // only its last component before appending it.
                            let attachment_name = filename
                                .as_deref()
                                .and_then(|name| std::path::Path::new(name).file_name());
                            if let Some(name) = attachment_name {
                                path.push(name);
                            } else if let Some(extension) = mime2ext(mime) {
                                path.push(format!("data.{extension}"));
                            } else {
                                path.push("data");
                            }
                        }
                    }

                    // The part's type is known from the mail headers, so the
                    // nested adapter lookup can use accurate matching.
                    let mut config = config.clone();
                    config.accurate = true;
                    let ai2: AdaptInfo = AdaptInfo {
                        filepath_hint: path,
                        is_real_file: false,
                        archive_recursion_depth: archive_recursion_depth + 1,
                        inp: Box::pin(Cursor::new(mail.get_body_raw()?)),
                        line_prefix: line_prefix.clone(),
                        config,
                        postprocess,
                    };
                    ais.push(ai2);
                }
            }
            for a in ais {
                yield (Ok(a));
            }
        };
        Ok(Box::pin(s))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::preproc::loop_adapt;
    use crate::test_utils::*;
    use pretty_assertions::assert_eq;
    use tokio::fs::File;
    use tokio_stream::StreamExt;

    /// A single `.eml` file yields exactly two parts (plain text and HTML),
    /// both containing the expected sentence.
    #[tokio::test]
    async fn mail_simple() -> Result<()> {
        let filepath = test_data_dir().join("github_email.eml");
        let input = File::open(&filepath).await?;
        let (info, reason) = simple_adapt_info(&filepath, Box::pin(input));

        let adapter = MboxAdapter;
        let mut parts = adapter.adapt(info, &reason).await?;

        let mut seen = 0;
        while let Some(part) = parts.next().await {
            let mut part = part?;
            let mut body = Vec::new();
            part.inp.read_to_end(&mut body).await?;

            let name = part
                .filepath_hint
                .components()
                .next_back()
                .unwrap()
                .as_os_str()
                .to_str()
                .unwrap();
            match name {
                "data.txt" | "data.html" => {
                    let text = String::from_utf8(body)?;
                    assert!(text.contains("Thank you for your contribution"));
                }
                x => panic!("unexpected filename {x:?}"),
            }
            seen += 1;
        }
        assert_eq!(2, seen);
        Ok(())
    }

    /// An mbox with three messages yields three identical HTML bodies; the
    /// `>From` escaping inside the bodies is left untouched.
    #[tokio::test]
    async fn mbox_simple() -> Result<()> {
        let filepath = test_data_dir().join("test.mbx");
        let input = File::open(&filepath).await?;
        let (info, reason) = simple_adapt_info(&filepath, Box::pin(input));

        let adapter = MboxAdapter;
        let mut parts = adapter.adapt(info, &reason).await?;

        let mut seen = 0;
        while let Some(part) = parts.next().await {
            let mut part = part?;
            assert_eq!(
                "data.html",
                part.filepath_hint.components().next_back().unwrap().as_os_str()
            );

            let mut body = Vec::new();
            part.inp.read_to_end(&mut body).await?;
            let text = String::from_utf8(body)?;
            assert_eq!(
                "<html>\r\n <head>\r\n <meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\">\r\n </head>\r\n <body>\r\n <p>>From</p>\r\n <p>Another word >From<br>\r\n </p>\r\n </body>\r\n</html>",
                text.trim()
            );
            seen += 1;
        }
        assert_eq!(3, seen);
        Ok(())
    }

    /// Runs the full adapter loop over a mail with a PDF attachment and checks
    /// the extracted text of both the message body and the attachment.
    #[tokio::test]
    async fn mbox_attachment() -> Result<()> {
        let filepath = test_data_dir().join("mail_with_attachment.mbox");
        let input = File::open(&filepath).await?;
        let (info, reason) = simple_adapt_info(&filepath, Box::pin(input));

        let adapter = MboxAdapter;
        let mut outputs = loop_adapt(&adapter, reason, info).await?;

        let mut seen = 0;
        while let Some(output) = outputs.next().await {
            let mut output = output?;
            let path = output
                .filepath_hint
                .components()
                .next_back()
                .unwrap()
                .as_os_str()
                .to_str()
                .unwrap();

            let mut body = Vec::new();
            output.inp.read_to_end(&mut body).await?;
            let text = String::from_utf8(body)?;

            match path {
                "data.html.txt" => {
                    assert_eq!("PREFIX:regular text\nPREFIX:\n", text);
                }
                "short.pdf.txt" => {
                    assert_eq!(
                        "PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n",
                        text
                    );
                }
                _ => {
                    panic!("unrelated {path:?}");
                }
            }
            seen += 1;
        }
        assert_eq!(2, seen); // one message + one attachment
        Ok(())
    }
}