You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
185 lines
5.2 KiB
Rust
185 lines
5.2 KiB
Rust
1 year ago
|
use crate::adapted_iter::one_file;
|
||
|
|
||
|
use super::*;
|
||
|
|
||
|
use anyhow::Result;
|
||
|
use async_stream::stream;
|
||
|
use lazy_static::lazy_static;
|
||
|
use tokio::io::{BufReader, AsyncReadExt};
|
||
|
|
||
|
use std::{path::{Path, PathBuf}, sync::Mutex, io::Cursor};
|
||
|
|
||
|
/// File extensions used to build the fast (extension-based) matchers of this adapter.
static EXTENSIONS: &[&str] = &["mbox", "mbx"];

/// MIME types used to build the slow (content-type-based) matchers of this adapter.
static MIME_TYPES: &[&str] = &["application/mbox"];
|
||
|
// Lazily-built adapter description. The fast matchers are derived from `EXTENSIONS`
// and the slow matchers from `MIME_TYPES`; the adapter is flagged as recursing and as
// disabled by default.
lazy_static! {
    static ref METADATA: AdapterMeta = AdapterMeta {
        name: String::from("mbox"),
        version: 1,
        description: String::from(
            "Reads mailbox files and runs extractors on the contents and attachments."
        ),
        recurses: true,
        fast_matchers: EXTENSIONS
            .iter()
            .map(|ext| FastFileMatcher::FileExtension(ext.to_string()))
            .collect(),
        slow_matchers: Some(
            MIME_TYPES
                .iter()
                .map(|mime| FileMatcher::MimeType(mime.to_string()))
                .collect()
        ),
        disabled_by_default: true,
        keep_fast_matchers_if_accurate: true
    };
}
|
||
|
/// Stateless adapter for mbox mailbox files.
///
/// Carries no data; all behavior lives in its trait implementations.
#[derive(Default)]
pub struct MboxAdapter;
|
||
|
|
||
|
impl MboxAdapter {
|
||
|
pub fn new() -> MboxAdapter {
|
||
|
MboxAdapter
|
||
|
}
|
||
|
}
|
||
|
impl GetMetadata for MboxAdapter {
|
||
|
fn metadata(&self) -> &AdapterMeta {
|
||
|
&METADATA
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// Computes the file name obtained by stripping the last extension from `filename`.
///
/// The directory part of the path is kept. The extensions `tgz`, `tbz` and `tbz2`
/// are treated as shorthand for a compressed tarball and are replaced by `.tar`;
/// any other extension is simply dropped (e.g. `hello.tar.gz` -> `hello.tar`).
/// A path without an extension is returned unchanged.
///
/// Paths that have no file name component at all (such as `/`, `..` or the empty
/// path) are returned unchanged instead of panicking.
///
/// The name is assembled from `OsStr` pieces, so file names that are not valid
/// UTF-8 keep their original bytes.
fn get_inner_filename(filename: &Path) -> PathBuf {
    let stem = match filename.file_stem() {
        Some(stem) => stem,
        // Nothing to strip: there is no final file name component.
        None => return filename.to_path_buf(),
    };

    // A non-UTF-8 extension can never equal one of the ASCII shorthands below,
    // so treating it like "no match" is equivalent to a lossy comparison.
    let new_extension = match filename.extension().and_then(|ext| ext.to_str()) {
        Some("tgz") | Some("tbz") | Some("tbz2") => ".tar",
        _ => "",
    };

    let mut inner_name = stem.to_os_string();
    inner_name.push(new_extension);
    filename.with_file_name(inner_name)
}
|
||
|
|
||
|
impl FileAdapter for MboxAdapter {
|
||
|
fn adapt(&self, ai: AdaptInfo, _detection_reason: &FileMatcher) -> Result<AdaptedFilesIterBox> {
|
||
|
println!("running mbox adapter");
|
||
|
let AdaptInfo {
|
||
|
filepath_hint,
|
||
|
mut inp,
|
||
|
line_prefix,
|
||
|
archive_recursion_depth,
|
||
|
config,
|
||
|
postprocess,
|
||
|
..
|
||
|
} = ai;
|
||
|
|
||
|
let mut content = String::new();
|
||
|
let s = stream! {
|
||
|
inp.read_to_string(&mut content).await?;
|
||
|
|
||
|
let mut ais = vec![];
|
||
|
for mail in content.split("\nFrom ") {
|
||
|
|
||
|
let mail_bytes = mail.as_bytes(); // &content[offset..offset2];
|
||
|
let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').skip(1).next().unwrap();
|
||
|
let mail = mailparse::parse_mail(mail_content)?;
|
||
|
let mail_body = mail.get_body()?;
|
||
|
println!("body {:?}", mail_body);
|
||
|
|
||
|
let mut path = filepath_hint.clone();
|
||
|
println!("{:?}", mail.ctype.mimetype);
|
||
|
match &*mail.ctype.mimetype {
|
||
|
"text/html" => {
|
||
|
path.push("mail.html");
|
||
|
},
|
||
|
_ => {
|
||
|
path.push("mail.txt");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
let mut config = config.clone();
|
||
|
config.accurate = true;
|
||
|
|
||
|
let ai2: AdaptInfo = AdaptInfo {
|
||
|
filepath_hint: path,
|
||
|
is_real_file: false,
|
||
|
archive_recursion_depth: archive_recursion_depth + 1,
|
||
|
inp: Box::pin(Cursor::new(mail_body.into_bytes())),
|
||
|
line_prefix: line_prefix.to_string(),
|
||
|
config: config,
|
||
|
postprocess,
|
||
|
};
|
||
|
ais.push(ai2);
|
||
|
}
|
||
|
for a in ais {
|
||
|
yield(Ok(a));
|
||
|
}
|
||
|
};
|
||
|
Ok(Box::pin(s))
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::preproc::loop_adapt;
    use crate::test_utils::*;
    use pretty_assertions::assert_eq;
    use tokio::fs::File;

    /// Checks the extension stripping / `.tar` mapping of `get_inner_filename`.
    #[test]
    fn test_inner_filename() {
        let cases = [
            ("hi/test.tgz", "hi/test.tar"),
            ("hi/hello.gz", "hi/hello"),
            ("a/b/initramfs", "a/b/initramfs"),
            ("hi/test.tbz2", "hi/test.tar"),
            ("hi/test.tbz", "hi/test.tar"),
            ("hi/test.hi.bz2", "hi/test.hi"),
            ("hello.tar.gz", "hello.tar"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(get_inner_filename(Path::new(input)), PathBuf::from(*expected));
        }
    }

    // NOTE(review): the two tests below feed gzip fixtures (`hello.gz`,
    // `short.pdf.gz`) to `MboxAdapter` and expect decompressed / PDF text.
    // They look carried over from a decompression adapter's tests -- confirm
    // and replace with mbox fixtures.

    /// Runs the adapter directly on `hello.gz` and compares the collected output.
    #[tokio::test]
    async fn gz() -> Result<()> {
        let adapter = MboxAdapter;

        let filepath = test_data_dir().join("hello.gz");
        let input = Box::pin(File::open(&filepath).await?);

        let (info, matcher) = simple_adapt_info(&filepath, input);
        let adapted = adapter.adapt(info, &matcher)?;
        let output = adapted_to_vec(adapted).await?;
        assert_eq!(String::from_utf8(output)?, "hello\n");
        Ok(())
    }

    /// Runs the adapter through `loop_adapt` on `short.pdf.gz` and compares the output.
    #[tokio::test]
    async fn pdf_gz() -> Result<()> {
        let adapter = MboxAdapter;

        let filepath = test_data_dir().join("short.pdf.gz");
        let input = Box::pin(File::open(&filepath).await?);

        let (info, matcher) = simple_adapt_info(&filepath, input);
        let adapted = loop_adapt(&adapter, matcher, info)?;
        let output = adapted_to_vec(adapted).await?;
        // Whitespace inside the expected literal is significant.
        assert_eq!(
            String::from_utf8(output)?,
            "PREFIX:Page 1: hello world
PREFIX:Page 1: this is just a test.
PREFIX:Page 1:
PREFIX:Page 1: 1
PREFIX:Page 1:
PREFIX:Page 1:
"
        );
        Ok(())
    }
}
|