use super::*; use anyhow::Result; use async_stream::stream; use lazy_static::lazy_static; use mime2ext::mime2ext; use tokio::io::AsyncReadExt; use std::{ collections::VecDeque, io::Cursor, }; static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"]; lazy_static! { static ref METADATA: AdapterMeta = AdapterMeta { name: "mail".to_owned(), version: 1, description: "Reads mailbox/mail files and runs extractors on the contents and attachments." .to_owned(), recurses: true, fast_matchers: EXTENSIONS .iter() .map(|s| FastFileMatcher::FileExtension(s.to_string())) .collect(), slow_matchers: Some( MIME_TYPES .iter() .map(|s| FileMatcher::MimeType(s.to_string())) .collect() ), disabled_by_default: true, keep_fast_matchers_if_accurate: true }; } #[derive(Default)] pub struct MboxAdapter; impl MboxAdapter { pub fn new() -> MboxAdapter { MboxAdapter } } impl GetMetadata for MboxAdapter { fn metadata(&self) -> &AdapterMeta { &METADATA } } #[async_trait] impl FileAdapter for MboxAdapter { async fn adapt( &self, ai: AdaptInfo, _detection_reason: &FileMatcher, ) -> Result { let AdaptInfo { filepath_hint, mut inp, line_prefix, archive_recursion_depth, config, postprocess, .. } = ai; let mut content = String::new(); let s = stream! { inp.read_to_string(&mut content).await?; let mut ais = vec![]; for mail in content.split("\nFrom ") { let mail_bytes = mail.as_bytes(); // &content[offset..offset2]; let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').skip(1).next().unwrap(); let mail = mailparse::parse_mail(mail_content)?; let mut todos = VecDeque::new(); todos.push_back(mail); while let Some(mail) = todos.pop_front() { let mut path = filepath_hint.clone(); let filename = mail.get_content_disposition().params.get("filename").cloned(); match &*mail.ctype.mimetype { x if x.starts_with("multipart/") => { todos.extend(mail.subparts); continue; } mime => { if let Some(name) = filename { path.push(name); } else if let Some(extension) = mime2ext(mime) { path.push(format!("data.{extension}")); } else { path.push("data"); } } } let mut config = config.clone(); config.accurate = true; let ai2: AdaptInfo = AdaptInfo { filepath_hint: path, is_real_file: false, archive_recursion_depth: archive_recursion_depth + 1, inp: Box::pin(Cursor::new(mail.get_body_raw()?)), line_prefix: line_prefix.to_string(), config: config, postprocess, }; ais.push(ai2); } } for a in ais { yield(Ok(a)); } }; Ok(Box::pin(s)) } } #[cfg(test)] mod tests { use super::*; use crate::preproc::loop_adapt; use crate::test_utils::*; use pretty_assertions::assert_eq; use tokio::fs::File; use tokio_stream::StreamExt; #[tokio::test] async fn mail_simple() -> Result<()> { let adapter = MboxAdapter; let filepath = test_data_dir().join("github_email.eml"); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let mut r = adapter.adapt(a, &d).await?; let mut count = 0; while let Some(file) = r.next().await { let mut file = file?; let mut buf = Vec::new(); file.inp.read_to_end(&mut buf).await?; match file.filepath_hint.components().last().unwrap().as_os_str().to_str().unwrap() { "data.txt" | "data.html" => { assert!(String::from_utf8(buf)?.contains("Thank you for your contribution")); }, x => panic!("unexpected filename {x:?}"), } count += 1; } assert_eq!(2, count); Ok(()) } #[tokio::test] async fn mbox_simple() -> Result<()> { let adapter = MboxAdapter; let filepath = test_data_dir().join("test.mbx"); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let mut r = adapter.adapt(a, &d).await?; let mut count = 0; while let Some(file) = r.next().await { let mut file = file?; assert_eq!( "data.html", file.filepath_hint.components().last().unwrap().as_os_str() ); let mut buf = Vec::new(); file.inp.read_to_end(&mut buf).await?; assert_eq!("\r\n \r\n \r\n \r\n \r\n

>From

\r\n

Another word >From
\r\n

\r\n \r\n", String::from_utf8(buf)?.trim()); count += 1; } assert_eq!(3, count); Ok(()) } #[tokio::test] async fn mbox_attachment() -> Result<()> { let adapter = MboxAdapter; let filepath = test_data_dir().join("mail_with_attachment.mbox"); let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); let mut r = loop_adapt(&adapter, d, a).await?; let mut count = 0; while let Some(file) = r.next().await { let mut file = file?; let path = file .filepath_hint .components() .last() .unwrap() .as_os_str() .to_str() .unwrap(); let mut buf = Vec::new(); file.inp.read_to_end(&mut buf).await?; match path { "data.html.txt" => { assert_eq!("PREFIX:regular text\nPREFIX:\n", String::from_utf8(buf)?); } "short.pdf.txt" => { assert_eq!("PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", String::from_utf8(buf)?); } _ => { panic!("unrelated {path:?}"); } } count += 1; } assert_eq!(2, count); // one message + one attachment Ok(()) } }