You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
220 lines
7.1 KiB
Rust
220 lines
7.1 KiB
Rust
use super::*;
|
|
|
|
use anyhow::Result;
|
|
use async_stream::stream;
|
|
use lazy_static::lazy_static;
|
|
use mime2ext::mime2ext;
|
|
use tokio::io::AsyncReadExt;
|
|
|
|
use std::{
|
|
collections::VecDeque,
|
|
io::Cursor,
|
|
};
|
|
|
|
/// File name extensions that select this adapter through the fast (name-based) matchers.
static EXTENSIONS: &[&str] = &[
    "mbox",
    "mbx",
    "eml",
];

/// MIME types that select this adapter through the slow (content-based) matchers.
static MIME_TYPES: &[&str] = &[
    "application/mbox",
    "message/rfc822",
];
|
|
lazy_static! {
    /// Metadata describing the mail adapter: its name, version, description and the
    /// matchers (built from `EXTENSIONS` and `MIME_TYPES`) that select it.
    static ref METADATA: AdapterMeta = {
        // One fast matcher per known file extension.
        let fast_matchers = EXTENSIONS
            .iter()
            .map(|extension| FastFileMatcher::FileExtension(String::from(*extension)))
            .collect();
        // One slow matcher per known MIME type.
        let slow_matchers = MIME_TYPES
            .iter()
            .map(|mime_type| FileMatcher::MimeType(String::from(*mime_type)))
            .collect();

        AdapterMeta {
            name: String::from("mail"),
            version: 1,
            description: String::from(
                "Reads mailbox/mail files and runs extractors on the contents and attachments.",
            ),
            recurses: true,
            fast_matchers,
            slow_matchers: Some(slow_matchers),
            disabled_by_default: true,
            keep_fast_matchers_if_accurate: true,
        }
    };
}
|
|
/// Adapter for mailbox (`mbox`/`mbx`) and single-mail (`eml`) files.
///
/// The type carries no state; all behavior lives in its trait implementations.
#[derive(Default)]
pub struct MboxAdapter;

impl MboxAdapter {
    /// Creates a new adapter; equivalent to `MboxAdapter::default()`.
    pub fn new() -> Self {
        Self
    }
}
|
|
impl GetMetadata for MboxAdapter {
|
|
fn metadata(&self) -> &AdapterMeta {
|
|
&METADATA
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl FileAdapter for MboxAdapter {
|
|
async fn adapt(
|
|
&self,
|
|
ai: AdaptInfo,
|
|
_detection_reason: &FileMatcher,
|
|
) -> Result<AdaptedFilesIterBox> {
|
|
let AdaptInfo {
|
|
filepath_hint,
|
|
mut inp,
|
|
line_prefix,
|
|
archive_recursion_depth,
|
|
config,
|
|
postprocess,
|
|
..
|
|
} = ai;
|
|
|
|
let mut content = String::new();
|
|
let s = stream! {
|
|
inp.read_to_string(&mut content).await?;
|
|
|
|
let mut ais = vec![];
|
|
for mail in content.split("\nFrom ") {
|
|
|
|
let mail_bytes = mail.as_bytes(); // &content[offset..offset2];
|
|
let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').skip(1).next().unwrap();
|
|
let mail = mailparse::parse_mail(mail_content)?;
|
|
|
|
let mut todos = VecDeque::new();
|
|
todos.push_back(mail);
|
|
|
|
while let Some(mail) = todos.pop_front() {
|
|
let mut path = filepath_hint.clone();
|
|
let filename = mail.get_content_disposition().params.get("filename").cloned();
|
|
match &*mail.ctype.mimetype {
|
|
x if x.starts_with("multipart/") => {
|
|
todos.extend(mail.subparts);
|
|
continue;
|
|
}
|
|
mime => {
|
|
if let Some(name) = filename {
|
|
path.push(name);
|
|
} else if let Some(extension) = mime2ext(mime) {
|
|
path.push(format!("data.{extension}"));
|
|
} else {
|
|
path.push("data");
|
|
}
|
|
}
|
|
}
|
|
|
|
let mut config = config.clone();
|
|
config.accurate = true;
|
|
|
|
let ai2: AdaptInfo = AdaptInfo {
|
|
filepath_hint: path,
|
|
is_real_file: false,
|
|
archive_recursion_depth: archive_recursion_depth + 1,
|
|
inp: Box::pin(Cursor::new(mail.get_body_raw()?)),
|
|
line_prefix: line_prefix.to_string(),
|
|
config: config,
|
|
postprocess,
|
|
};
|
|
ais.push(ai2);
|
|
}
|
|
}
|
|
for a in ais {
|
|
yield(Ok(a));
|
|
}
|
|
};
|
|
Ok(Box::pin(s))
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::preproc::loop_adapt;
    use crate::test_utils::*;
    use pretty_assertions::assert_eq;
    use tokio::fs::File;
    use tokio_stream::StreamExt;

    /// Returns the final component of `path` as UTF-8 text.
    ///
    /// Panics when the path has no components or the last one is not valid UTF-8.
    fn last_component(path: &std::path::Path) -> &str {
        path.components()
            .next_back()
            .unwrap()
            .as_os_str()
            .to_str()
            .unwrap()
    }

    /// A single `.eml` file yields its text and HTML bodies.
    #[tokio::test]
    async fn mail_simple() -> Result<()> {
        let adapter = MboxAdapter;

        let filepath = test_data_dir().join("github_email.eml");

        let (info, reason) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
        let mut parts = adapter.adapt(info, &reason).await?;
        let mut seen = 0;
        while let Some(part) = parts.next().await {
            let mut part = part?;
            let mut body = Vec::new();
            part.inp.read_to_end(&mut body).await?;
            match last_component(&part.filepath_hint) {
                "data.txt" | "data.html" => {
                    assert!(String::from_utf8(body)?.contains("Thank you for your contribution"));
                }
                x => panic!("unexpected filename {x:?}"),
            }
            seen += 1;
        }
        assert_eq!(2, seen);
        Ok(())
    }

    /// An mbox with three messages yields three HTML bodies with identical content.
    #[tokio::test]
    async fn mbox_simple() -> Result<()> {
        let adapter = MboxAdapter;

        let filepath = test_data_dir().join("test.mbx");

        let (info, reason) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
        let mut parts = adapter.adapt(info, &reason).await?;
        let mut seen = 0;
        while let Some(part) = parts.next().await {
            let mut part = part?;
            assert_eq!("data.html", last_component(&part.filepath_hint));
            let mut body = Vec::new();
            part.inp.read_to_end(&mut body).await?;
            assert_eq!("<html>\r\n <head>\r\n <meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\">\r\n </head>\r\n <body>\r\n <p>>From</p>\r\n <p>Another word >From<br>\r\n </p>\r\n </body>\r\n</html>", String::from_utf8(body)?.trim());
            seen += 1;
        }
        assert_eq!(3, seen);
        Ok(())
    }

    /// A message with a PDF attachment is adapted recursively via `loop_adapt`.
    #[tokio::test]
    async fn mbox_attachment() -> Result<()> {
        let adapter = MboxAdapter;

        let filepath = test_data_dir().join("mail_with_attachment.mbox");

        let (info, reason) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?));
        let mut parts = loop_adapt(&adapter, reason, info).await?;
        let mut seen = 0;
        while let Some(part) = parts.next().await {
            let mut part = part?;
            let path = last_component(&part.filepath_hint);
            let mut body = Vec::new();
            part.inp.read_to_end(&mut body).await?;
            match path {
                "data.html.txt" => {
                    assert_eq!("PREFIX:regular text\nPREFIX:\n", String::from_utf8(body)?);
                }
                "short.pdf.txt" => {
                    assert_eq!("PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", String::from_utf8(body)?);
                }
                _ => {
                    panic!("unrelated {path:?}");
                }
            }
            seen += 1;
        }
        assert_eq!(2, seen); // one message + one attachment
        Ok(())
    }
}
|