mirror of https://github.com/terhechte/postsack
Cleanup
parent
e3a41158d0
commit
976004fbe3
@ -0,0 +1,20 @@
|
|||||||
|
use rusqlite::{self, Error, Row};
|
||||||
|
|
||||||
|
/// Conversion from a `rusqlite` result row into a value of the implementing type.
pub trait RowConversion: Sized {
|
||||||
|
/// Builds `Self` from `row`, reporting failures as a `rusqlite::Error`.
fn from_row<'stmt>(row: &Row<'stmt>) -> Result<Self, Error>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*impl RowConversion for EmailEntry {
|
||||||
|
fn from_row<'stmt>(row: &Row<'stmt>) -> Result<Self, Error> {
|
||||||
|
let path: String = row.get("path")?;
|
||||||
|
let domain: String = row.get("domain")?;
|
||||||
|
let local_part: String = row.get("local_part")?;
|
||||||
|
let year: usize = row.get("year")?;
|
||||||
|
let month: usize = row.get("month")?;
|
||||||
|
let day: usize = row.get("day")?;
|
||||||
|
let created = email_parser::time::DateTime::
|
||||||
|
Ok(EmailEntry {
|
||||||
|
path, domain, local_part, year, month, day
|
||||||
|
})
|
||||||
|
}
|
||||||
|
*/
|
@ -0,0 +1,16 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use eyre::Report;
|
||||||
|
|
||||||
|
use crate::types::EmailEntry;
|
||||||
|
|
||||||
|
/// Parameter for sending work to the database during `import`.
|
||||||
|
pub enum DBMessage {
|
||||||
|
/// Sent for a successfully parsed mail; carries the parsed entry.
|
||||||
|
Mail(EmailEntry),
|
||||||
|
/// Sent for any kind of error during reading / parsing.
/// Carries the error report and a path (presumably the file that failed; confirm against the sender).
|
||||||
|
Error(Report, PathBuf),
|
||||||
|
/// Sent once all parsing is done.
|
||||||
|
/// This is used to break out of the receiving loop.
|
||||||
|
Done,
|
||||||
|
}
|
@ -0,0 +1,8 @@
|
|||||||
|
mod conversion;
|
||||||
|
mod database;
|
||||||
|
mod db_message;
|
||||||
|
mod sql;
|
||||||
|
|
||||||
|
pub use conversion::RowConversion;
|
||||||
|
pub use database::Database;
|
||||||
|
pub use db_message::DBMessage;
|
@ -0,0 +1,30 @@
|
|||||||
|
/// SQL statement that creates the `emails` table unless it already exists.
/// Every column is declared `NOT NULL`.
pub const TBL_EMAILS: &str = r#"
|
||||||
|
CREATE TABLE IF NOT EXISTS emails (
|
||||||
|
path TEXT NOT NULL,
|
||||||
|
domain TEXT NOT NULL,
|
||||||
|
local_part TEXT NOT NULL,
|
||||||
|
year INTEGER NOT NULL,
|
||||||
|
month INTEGER NOT NULL,
|
||||||
|
day INTEGER NOT NULL,
|
||||||
|
subject TEXT NOT NULL
|
||||||
|
);"#;
|
||||||
|
|
||||||
|
/// SQL statement that creates the `errors` table (`message`, `path`) unless it already exists.
pub const TBL_ERRORS: &str = r#"
|
||||||
|
CREATE TABLE IF NOT EXISTS errors (
|
||||||
|
message TEXT NOT NULL,
|
||||||
|
path TEXT NOT NULL
|
||||||
|
);"#;
|
||||||
|
|
||||||
|
/// Parameterized `INSERT` for one row of the `emails` table.
/// The seven positional `?` placeholders follow the listed column order.
pub const QUERY_EMAILS: &str = r#"
|
||||||
|
INSERT INTO emails
|
||||||
|
(path, domain, local_part, year, month, day, subject)
|
||||||
|
VALUES
|
||||||
|
(?, ?, ?, ?, ?, ?, ?)
|
||||||
|
"#;
|
||||||
|
|
||||||
|
/// Parameterized `INSERT` for one row of the `errors` table.
/// The two positional `?` placeholders are `message` and `path`, in that order.
pub const QUERY_ERRORS: &str = r#"
|
||||||
|
INSERT INTO errors
|
||||||
|
(message, path)
|
||||||
|
VALUES
|
||||||
|
(?, ?)
|
||||||
|
"#;
|
@ -1,376 +0,0 @@
|
|||||||
use chrono::prelude::*;
|
|
||||||
use email_address_parser;
|
|
||||||
use eyre::{bail, eyre, Result, WrapErr};
|
|
||||||
use flate2;
|
|
||||||
use flate2::read::GzDecoder;
|
|
||||||
use rayon::prelude::*;
|
|
||||||
use serde::Deserialize;
|
|
||||||
use serde_json;
|
|
||||||
use strum_macros;
|
|
||||||
|
|
||||||
// Presumably the header names consulted to find a sender address.
// NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
const SENDER_HEADER_NAMES: &[&str] = &["Sender", "Reply-to", "From"];
|
|
||||||
// Presumably the header names consulted to find a message date.
// NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
const DATE_HEADER_NAMES: &[&str] = &["Received", "Date"];
|
|
||||||
|
|
||||||
use std::{
|
|
||||||
convert::{TryFrom, TryInto},
|
|
||||||
io::Read,
|
|
||||||
path::{Path, PathBuf},
|
|
||||||
};
|
|
||||||
|
|
||||||
/// We want to know which library was used to parse this email.
///
/// String conversions in both directions are derived through `strum_macros`.
/// NOTE(review): no code in the visible part of this file constructs or
/// reads this enum — confirm it is still used.
|
|
||||||
#[derive(Debug, strum_macros::EnumString, strum_macros::ToString)]
|
|
||||||
pub enum ParserKind {
|
|
||||||
// Presumably the `email_parser` crate — confirm.
EmailParser,
|
|
||||||
// Presumably an `eml` parsing crate — confirm.
Eml,
|
|
||||||
// Presumably the `rhymessage` crate — confirm.
Rhymessage,
|
|
||||||
// Presumably the `.meta` sidecar file — confirm.
Meta,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Representation of an email: where it was read from, who sent it,
/// when, and with which subject.
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct EmailEntry {
|
|
||||||
/// Path of the email file this entry was produced from.
pub path: PathBuf,
|
|
||||||
/// Domain part of the parsed address.
pub domain: String,
|
|
||||||
/// Local part of the parsed address.
pub local_part: String,
|
|
||||||
/// Timestamp of the email, expressed in UTC.
pub datetime: chrono::DateTime<Utc>,
|
|
||||||
/// Subject line; the header-based conversion stores an empty string when none is present.
pub subject: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Raw representation of an email.
///
/// Contains the paths to the relevant files as well
/// as the name of the folder the email was in. Nothing is read from disk
/// when an entry is created; the paths are derived purely lexically.
#[derive(Debug)]
pub struct RawEmailEntry {
    /// Name of the directory that directly contains the email file.
    folder_name: String,
    /// Path to the email file itself.
    eml_path: PathBuf,
    /// Path to the sibling `<name>.meta` file in the same directory.
    meta_path: PathBuf,
}

impl RawEmailEntry {
    /// Creates an entry for the email file at `path`.
    ///
    /// * `folder_name` is the final component of the parent directory, or
    ///   an empty string when there is none.
    /// * `meta_path` is `<parent>/<stem>.meta`, where `<stem>` is the file
    ///   stem of `path` with one trailing `.eml` removed
    ///   (`123.eml.gz` becomes `123.meta`).
    ///
    /// This never panics: components that are missing fall back to an empty
    /// value and non-UTF-8 names are converted lossily.
    pub fn new<P: AsRef<std::path::Path>>(path: P) -> RawEmailEntry {
        let path = path.as_ref();
        let parent = path.parent();

        let folder_name = parent
            .and_then(|dir| dir.file_name())
            .map(|name| name.to_string_lossy().into_owned())
            .unwrap_or_default();

        let stem = path
            .file_stem()
            .map(|stem| stem.to_string_lossy())
            .unwrap_or_default();
        // Only strip the trailing ".eml"; occurrences elsewhere in the name
        // are part of the identifier and must be preserved.
        let base = stem.strip_suffix(".eml").unwrap_or(&*stem);
        let meta_file = format!("{}.meta", base);
        let meta_path = match parent {
            Some(dir) => dir.join(&meta_file),
            None => PathBuf::from(&meta_file),
        };

        RawEmailEntry {
            folder_name,
            eml_path: path.to_path_buf(),
            meta_path,
        }
    }

    /// Returns an owned copy of the path to the email file.
    pub fn path(&self) -> PathBuf {
        self.eml_path.clone()
    }
}
|
|
||||||
|
|
||||||
pub struct Emails {
|
|
||||||
/// The current index in the Vec of emails
|
|
||||||
curr: usize,
|
|
||||||
/// The `Vec` with the `EmailEntry` entries
|
|
||||||
pub emails: Vec<RawEmailEntry>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Emails {
|
|
||||||
pub fn new<A: AsRef<Path>>(folder: A) -> Result<Self> {
|
|
||||||
let folder = folder.as_ref();
|
|
||||||
if !folder.exists() {
|
|
||||||
bail!("Folder {} does not exist", &folder.display());
|
|
||||||
}
|
|
||||||
let emails = read_folders(&folder)?;
|
|
||||||
Ok(Emails { curr: 0, emails })
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
self.emails.len()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//impl Iterator for Emails {
|
|
||||||
// // We can refer to this type using Self::Item
|
|
||||||
// type Item = Result<EmailEntry>;
|
|
||||||
//
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// let new_next = self.curr + 1;
|
|
||||||
// let entry = self.emails.get(self.curr)?;
|
|
||||||
// self.curr = new_next;
|
|
||||||
// let email = read_email(&entry);
|
|
||||||
// Some(email)
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
//impl ParallelIterator for Emails {
|
|
||||||
// type Item = Result<EmailEntry>;
|
|
||||||
//
|
|
||||||
// fn drive_unindexed<C>(self, consumer: C) -> C::Result
|
|
||||||
// where
|
|
||||||
// C: rayon::iter::plumbing::UnindexedConsumer<Self::Item>,
|
|
||||||
// {
|
|
||||||
// self.emails
|
|
||||||
// .into_par_iter()
|
|
||||||
// .map(|e| read_email(&e))
|
|
||||||
// .drive_unindexed(consumer)
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
fn read_folders(folder: &Path) -> Result<Vec<RawEmailEntry>> {
|
|
||||||
Ok(std::fs::read_dir(&folder)?
|
|
||||||
.into_iter()
|
|
||||||
.par_bridge()
|
|
||||||
.filter_map(|entry| {
|
|
||||||
let path = entry
|
|
||||||
.map_err(|e| tracing::error!("{} {:?}", &folder.display(), &e))
|
|
||||||
.ok()?
|
|
||||||
.path();
|
|
||||||
if !path.is_dir() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
read_emails(&path)
|
|
||||||
.map_err(|e| tracing::error!("{} {:?}", &path.display(), &e))
|
|
||||||
.ok()
|
|
||||||
})
|
|
||||||
.flatten()
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read_emails(folder_path: &Path) -> Result<Vec<RawEmailEntry>> {
|
|
||||||
Ok(std::fs::read_dir(folder_path)?
|
|
||||||
.into_iter()
|
|
||||||
.par_bridge()
|
|
||||||
.filter_map(|entry| {
|
|
||||||
let path = entry
|
|
||||||
.map_err(|e| tracing::error!("{} {:?}", &folder_path.display(), &e))
|
|
||||||
.ok()?
|
|
||||||
.path();
|
|
||||||
if path.is_dir() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
if !path.extension()?.eq("gz") {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
Some(RawEmailEntry {
|
|
||||||
folder_name: folder_path.file_name()?.to_str()?.to_string(),
|
|
||||||
eml_path: path.clone(),
|
|
||||||
meta_path: path
|
|
||||||
.parent()?
|
|
||||||
.join(format!(
|
|
||||||
"{}.meta",
|
|
||||||
&path.file_stem()?.to_str()?.replace(".eml", "")
|
|
||||||
))
|
|
||||||
.to_path_buf(),
|
|
||||||
})
|
|
||||||
})
|
|
||||||
//.take(50)
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn read_email(raw_entry: &RawEmailEntry) -> Result<EmailEntry> {
|
|
||||||
let content = unziped_content(&raw_entry.eml_path)?;
|
|
||||||
// We have to try multiple different email readers as each of them seems to fail in a different way
|
|
||||||
let email = parse_email_parser(&raw_entry, &content).or_else(|e| {
|
|
||||||
tracing::trace!("Parser Error: {:?}", &e);
|
|
||||||
parse_meta(&raw_entry, &content)
|
|
||||||
});
|
|
||||||
|
|
||||||
Ok(email.wrap_err_with(|| {
|
|
||||||
format!(
|
|
||||||
"{}\n{:?}",
|
|
||||||
String::from_utf8(content.clone()).unwrap(),
|
|
||||||
&raw_entry
|
|
||||||
)
|
|
||||||
})?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_email_parser(raw_entry: &RawEmailEntry, content: &Vec<u8>) -> Result<EmailEntry> {
|
|
||||||
match email_parser::email::Email::parse(&content) {
|
|
||||||
Ok(email) => (&raw_entry.eml_path, email).try_into(),
|
|
||||||
Err(error) => {
|
|
||||||
//let content_string = String::from_utf8(content.clone())?;
|
|
||||||
//println!("{}|{}", &error, &raw_entry.eml_path.display());
|
|
||||||
Err(eyre!("Could not parse email: {:?}", &error))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_meta(raw_entry: &RawEmailEntry, _content: &Vec<u8>) -> Result<EmailEntry> {
|
|
||||||
use chrono::prelude::*;
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Meta {
|
|
||||||
msg_id: String,
|
|
||||||
internal_date: i64,
|
|
||||||
subject: String,
|
|
||||||
}
|
|
||||||
let content = std::fs::read_to_string(&raw_entry.meta_path)?;
|
|
||||||
let meta: Meta = serde_json::from_str(&content)?;
|
|
||||||
let parsed = email_address_parser::EmailAddress::parse(&meta.msg_id, None)
|
|
||||||
.ok_or(eyre!("Cannot Parse Address: {}", &meta.msg_id))?;
|
|
||||||
let datetime = Utc.timestamp(meta.internal_date, 0);
|
|
||||||
Ok(EmailEntry {
|
|
||||||
path: raw_entry.eml_path.to_path_buf(),
|
|
||||||
domain: parsed.get_domain().to_owned(),
|
|
||||||
local_part: parsed.get_local_part().to_owned(),
|
|
||||||
datetime,
|
|
||||||
subject: meta.subject.clone(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> TryFrom<(&PathBuf, email_parser::email::Email<'a>)> for EmailEntry {
|
|
||||||
type Error = eyre::Report;
|
|
||||||
fn try_from(content: (&PathBuf, email_parser::email::Email)) -> Result<Self, Self::Error> {
|
|
||||||
let (path, email) = content;
|
|
||||||
let domain = email.sender.address.domain.to_string();
|
|
||||||
let local_part = email.sender.address.local_part.to_string();
|
|
||||||
let datetime = emaildatetime_to_chrono(&email.date);
|
|
||||||
let subject = email.subject.map(|e| e.to_string()).unwrap_or_default();
|
|
||||||
|
|
||||||
Ok(EmailEntry {
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
domain,
|
|
||||||
local_part,
|
|
||||||
datetime,
|
|
||||||
subject,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn emaildatetime_to_chrono(dt: &email_parser::time::DateTime) -> chrono::DateTime<Utc> {
|
|
||||||
Utc.ymd(
|
|
||||||
dt.date.year as i32,
|
|
||||||
dt.date.month_number() as u32,
|
|
||||||
dt.date.day as u32,
|
|
||||||
)
|
|
||||||
.and_hms(
|
|
||||||
dt.time.time.hour as u32,
|
|
||||||
dt.time.time.minute as u32,
|
|
||||||
dt.time.time.second as u32,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Opens the gzip-compressed file at `path` and returns its fully
/// decompressed bytes.
///
/// # Errors
///
/// Fails when the file cannot be opened or when decompression fails.
fn unziped_content(path: &Path) -> Result<Vec<u8>> {
    let mut bytes = Vec::new();
    let file = std::fs::File::open(path)?;
    GzDecoder::new(file).read_to_end(&mut bytes)?;
    Ok(bytes)
}
|
|
||||||
|
|
||||||
// Try to parse unstructured data into some sort of
|
|
||||||
// email address
|
|
||||||
//fn parse_unstructured(data: &str) -> Option<eml_parser::eml::EmailAddress> {
|
|
||||||
// use lazy_static::lazy_static;
|
|
||||||
// use regex::Regex;
|
|
||||||
// lazy_static! {
|
|
||||||
// static ref EMAIL_RE: Regex = Regex::new(r#"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"#).unwrap();
|
|
||||||
// }
|
|
||||||
// lazy_static! {
|
|
||||||
// static ref RE: Regex = Regex::new("<(.*?)>").unwrap();
|
|
||||||
// }
|
|
||||||
// if let Some(capture) = RE.captures(&data).and_then(|f| f.get(1)) {
|
|
||||||
// Some(eml_parser::eml::EmailAddress::AddressOnly {
|
|
||||||
// address: capture.as_str().to_string(),
|
|
||||||
// })
|
|
||||||
// } else {
|
|
||||||
// let capture = EMAIL_RE.captures(&data).and_then(|f| f.get(0))?;
|
|
||||||
// Some(eml_parser::eml::EmailAddress::AddressOnly {
|
|
||||||
// address: capture.as_str().to_string(),
|
|
||||||
// })
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
//fn extract_address(from: &eml_parser::eml::EmailAddress) -> String {
|
|
||||||
// use eml_parser::eml::EmailAddress::*;
|
|
||||||
// match from {
|
|
||||||
// AddressOnly { address } => address.clone(),
|
|
||||||
// NameAndEmailAddress { name: _, address } => address.clone(),
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::RawEmailEntry;

    // Both tests read files from an absolute path on the original author's
    // machine, so they are ignored by default. Run them explicitly with
    // `cargo test -- --ignored` on a machine that has the backup.

    /// Parses the `.meta` sidecar of a specific locally stored email.
    #[test]
    #[ignore = "requires the local gmail backup under /Users/terhechte"]
    fn test_weird_email3() {
        crate::setup();
        let eml_path = PathBuf::from(
            "/Users/terhechte/Documents/gmail_backup/db/2014-09/1479692635489080640.eml.gz",
        );
        let meta_path = PathBuf::from(
            "/Users/terhechte/Documents/gmail_backup/db/2014-09/1479692635489080640.meta",
        );
        let r = RawEmailEntry {
            folder_name: "2014-09".to_owned(),
            eml_path,
            meta_path,
        };
        let content = Vec::new();
        let result = super::parse_meta(&r, &content).expect("meta file should parse");
        dbg!(&result);
    }

    /// Runs the full read pipeline on a specific locally stored email.
    #[test]
    #[ignore = "requires the local gmail backup under /Users/terhechte"]
    fn test_weird_email4() {
        crate::setup();
        let eml_path = PathBuf::from(
            "/Users/terhechte/Documents/gmail_backup/db/2014-08/1475705321427236077.eml.gz",
        );
        let meta_path = PathBuf::from(
            "/Users/terhechte/Documents/gmail_backup/db/2014-08/1475705321427236077.meta",
        );
        let r = RawEmailEntry {
            folder_name: "2014-08".to_owned(),
            eml_path,
            meta_path,
        };
        let result = super::read_email(&r).expect("email should be readable");
        dbg!(&result);
    }
}
|
|
Loading…
Reference in New Issue