2017-04-15 20:23:26 +00:00
|
|
|
package sisyphus
|
2017-03-09 19:05:51 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"errors"
|
2017-09-09 22:53:11 +00:00
|
|
|
"fmt"
|
2017-03-16 20:13:39 +00:00
|
|
|
"math"
|
2017-03-09 19:05:51 +00:00
|
|
|
"mime/quotedprintable"
|
2017-06-05 13:33:32 +00:00
|
|
|
"net/mail"
|
2017-03-19 20:54:23 +00:00
|
|
|
"os"
|
2017-03-10 21:41:49 +00:00
|
|
|
"regexp"
|
2017-03-09 19:05:51 +00:00
|
|
|
"strings"
|
2017-09-09 22:53:11 +00:00
|
|
|
"unicode/utf8"
|
2017-03-09 19:05:51 +00:00
|
|
|
|
2017-05-18 19:21:46 +00:00
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
|
2017-06-05 13:33:32 +00:00
|
|
|
"github.com/carlostrub/maildir"
|
2017-03-09 19:05:51 +00:00
|
|
|
"github.com/kennygrant/sanitize"
|
|
|
|
)
|
|
|
|
|
2017-04-25 17:01:11 +00:00
|
|
|
// Maildir is the filesystem path of a Maildir directory, stored as a plain
// string. Its methods create the sub-directories this package expects
// (CreateDirs) and enumerate the mails found below the path (Index).
type Maildir string
|
|
|
|
|
2017-03-09 19:05:51 +00:00
|
|
|
// Mail describes a single message of a Maildir together with the optional,
// lazily loaded text used for classification.
type Mail struct {
	// Key is the key under which the message is stored in its Maildir.
	Key string
	// Subject holds the message's subject; it is nil until Load has run and
	// again after Unload.
	Subject *string
	// Body holds the message's body text; it is nil until Load has run and
	// again after Unload.
	Body *string
	// Junk marks a message that lives in the ".Junk" folder of its Maildir.
	Junk bool
	// New marks a message that is read from the "new" folder of its Maildir.
	New bool
}
|
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
// CreateDirs creates all the required dirs -- if not already there.
|
2017-04-26 11:20:21 +00:00
|
|
|
func (d Maildir) CreateDirs() error {
|
2017-04-25 17:01:11 +00:00
|
|
|
|
|
|
|
dir := string(d)
|
|
|
|
|
2017-05-23 20:43:17 +00:00
|
|
|
log.WithFields(log.Fields{
|
|
|
|
"dir": dir,
|
|
|
|
}).Info("Create missing directories")
|
2017-03-19 20:54:23 +00:00
|
|
|
|
2017-04-26 11:20:21 +00:00
|
|
|
err := os.MkdirAll(dir+"/.Junk/cur", 0700)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
err = os.MkdirAll(dir+"/new", 0700)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
err = os.MkdirAll(dir+"/cur", 0700)
|
2017-03-19 20:54:23 +00:00
|
|
|
|
2017-04-26 11:20:21 +00:00
|
|
|
return err
|
2017-03-19 20:54:23 +00:00
|
|
|
}
|
|
|
|
|
2017-03-09 19:05:51 +00:00
|
|
|
// Index loads all mail keys from the Maildir directory for processing.
|
2017-04-25 17:01:11 +00:00
|
|
|
func (d Maildir) Index() (m []*Mail, err error) {
|
2017-03-16 21:14:24 +00:00
|
|
|
|
2017-04-25 17:01:11 +00:00
|
|
|
dir := string(d)
|
|
|
|
|
2017-05-23 20:43:17 +00:00
|
|
|
log.WithFields(log.Fields{
|
|
|
|
"dir": dir,
|
|
|
|
}).Info("Start indexing mails")
|
|
|
|
|
2017-04-25 17:01:11 +00:00
|
|
|
dirs := []string{dir, dir + "/.Junk"}
|
|
|
|
for _, val := range dirs {
|
|
|
|
j, err := maildir.Dir(val).Keys()
|
2017-03-16 21:14:24 +00:00
|
|
|
if err != nil {
|
|
|
|
return m, err
|
|
|
|
}
|
2017-04-25 17:01:11 +00:00
|
|
|
for _, v := range j {
|
2017-03-16 21:14:24 +00:00
|
|
|
var new Mail
|
2017-04-25 17:01:11 +00:00
|
|
|
new.Key = v
|
|
|
|
if val == dir+"/.Junk" {
|
2017-03-16 21:14:24 +00:00
|
|
|
new.Junk = true
|
|
|
|
}
|
|
|
|
m = append(m, &new)
|
|
|
|
}
|
2017-03-09 19:05:51 +00:00
|
|
|
}
|
|
|
|
|
2017-05-23 20:43:17 +00:00
|
|
|
log.WithFields(log.Fields{
|
|
|
|
"dir": dir,
|
|
|
|
}).Info("All mails indexed")
|
2017-04-25 17:01:11 +00:00
|
|
|
|
2017-03-09 19:05:51 +00:00
|
|
|
return m, nil
|
|
|
|
}
|
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
// Load reads a mail's subject and body
|
2017-06-05 13:33:32 +00:00
|
|
|
func (m *Mail) Load(dir Maildir) (err error) {
|
2017-03-19 20:54:23 +00:00
|
|
|
|
2017-06-05 13:33:32 +00:00
|
|
|
var message *mail.Message
|
|
|
|
|
|
|
|
switch {
|
|
|
|
case m.Junk:
|
|
|
|
dir = dir + Maildir("/.Junk")
|
|
|
|
case m.New:
|
|
|
|
dir = dir + Maildir("/new")
|
2017-03-19 20:54:23 +00:00
|
|
|
}
|
2017-06-05 13:33:32 +00:00
|
|
|
|
|
|
|
message, err = maildir.Dir(dir).Message(m.Key)
|
2017-03-19 20:54:23 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// get Subject
|
|
|
|
if m.Subject != nil {
|
|
|
|
return errors.New("there is already a subject")
|
|
|
|
}
|
|
|
|
subject := message.Header.Get("Subject")
|
|
|
|
m.Subject = &subject
|
|
|
|
|
|
|
|
// get Body
|
|
|
|
bQ := quotedprintable.NewReader(message.Body)
|
|
|
|
var b []string
|
|
|
|
bScanner := bufio.NewScanner(bQ)
|
|
|
|
for bScanner.Scan() {
|
|
|
|
raw := bScanner.Text()
|
|
|
|
b = append(b, raw)
|
|
|
|
}
|
|
|
|
|
|
|
|
body := strings.Join(b, " ")
|
|
|
|
if m.Body != nil {
|
|
|
|
return errors.New("there is already a body")
|
|
|
|
}
|
|
|
|
m.Body = &body
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-09-16 22:56:17 +00:00
|
|
|
// Unload removes a mail's subject and body from the internal cache
|
|
|
|
func (m *Mail) Unload(dir Maildir) (err error) {
|
|
|
|
|
|
|
|
m.Subject = nil
|
|
|
|
m.Body = nil
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-03-09 19:05:51 +00:00
|
|
|
// trimStringFromBase64 cuts s off at the first occurrence of the marker
// "Content-Transfer-Encoding: base64", so that base64 payloads following the
// marker are discarded. The single byte directly in front of the marker is
// removed as well. Strings without the marker are returned unchanged.
func trimStringFromBase64(s string) string {
	idx := strings.Index(s, "Content-Transfer-Encoding: base64")

	switch {
	case idx < 0:
		return s
	case idx == 0:
		// Nothing precedes the marker. Slicing with idx-1 here would be
		// s[:-1] and panic, so return the empty prefix explicitly.
		return ""
	default:
		return s[:idx-1]
	}
}
|
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
func cleanString(i string) (s string) {
|
2017-03-09 19:05:51 +00:00
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
s = sanitize.Accents(i)
|
2017-03-09 19:05:51 +00:00
|
|
|
s = sanitize.HTML(s)
|
|
|
|
s = strings.ToLower(s)
|
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
bad := []string{
|
|
|
|
"boundary=", "charset", "content-transfer-encoding",
|
|
|
|
"content-type", "image/jpeg", "multipart/alternative",
|
|
|
|
"multipart/related", "name=", "nextpart", "quoted-printable",
|
|
|
|
"text/html", "text/plain", "this email must be viewed in html mode",
|
|
|
|
"this is a multi-part message in mime format",
|
|
|
|
"windows-1251", "windows-1252", "!", "#", "$", "%", "&", "'",
|
|
|
|
"(", ")", "*", "+", ",", ". ", "<", "=", ">", "?", "@", "[",
|
|
|
|
"\"", "\\", "\n", "\t", "]", "^", "_", "{", "|", "}",
|
|
|
|
}
|
|
|
|
for _, b := range bad {
|
|
|
|
s = strings.Replace(s, b, " ", -1)
|
|
|
|
}
|
2017-03-09 19:05:51 +00:00
|
|
|
for i := 0; i < 10; i++ {
|
|
|
|
s = strings.Replace(s, " ", " ", -1)
|
|
|
|
}
|
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
return s
|
|
|
|
}
|
|
|
|
|
|
|
|
// Clean cleans the mail's subject and body
|
|
|
|
func (m *Mail) Clean() error {
|
|
|
|
if m.Subject != nil {
|
|
|
|
s := trimStringFromBase64(*m.Subject)
|
|
|
|
s = cleanString(s)
|
|
|
|
m.Subject = &s
|
|
|
|
}
|
|
|
|
|
|
|
|
if m.Body != nil {
|
|
|
|
b := trimStringFromBase64(*m.Body)
|
|
|
|
b = cleanString(b)
|
|
|
|
m.Body = &b
|
|
|
|
}
|
2017-09-09 22:53:11 +00:00
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
return nil
|
2017-03-09 19:05:51 +00:00
|
|
|
}
|
|
|
|
|
2017-03-10 21:41:49 +00:00
|
|
|
// wordlistPattern matches tokens that consist solely of the letters a-z. It
// is compiled once instead of on every wordlist call.
var wordlistPattern = regexp.MustCompile("(^[a-z]+$)")

// wordlist splits the space separated text s into tokens and returns the
// unique words worth keeping, in order of first appearance.
//
// A token is kept as a word when it is 4 to 10 bytes long and made up only of
// the letters a-z. In addition, every rune of a token that is encoded in more
// than two bytes is kept as a single-character word of its own; these are
// emitted back to front, before the token itself. Only the first 200 kept
// entries are counted, and an entry that occurs more than 10 times among them
// is dropped from the result.
//
// The returned error is always nil; it is part of the signature for callers
// that already handle it.
func wordlist(s string) (l []string, err error) {
	var clean []string

	for _, w := range strings.Split(s, " ") {
		// Walk the token from its end and pick up every rune that needs
		// three or more bytes.
		for str := w; len(str) > 0; {
			r, size := utf8.DecodeLastRuneInString(str)
			if size > 2 {
				clean = append(clean, fmt.Sprintf("%c", r))
			}
			str = str[:len(str)-size]
		}

		// no long or too short words
		if length := len(w); length < 4 || length > 10 {
			continue
		}

		// no numbers, special characters, etc. -- only words
		if wordlistPattern.MatchString(w) {
			clean = append(clean, w)
		}
	}

	// only the first 200 words count
	maxWords := int(math.Min(200, float64(len(clean))))

	// Count occurrences and remember the order of first appearance, so the
	// result does not depend on map iteration order.
	counts := make(map[string]int, maxWords)
	order := make([]string, 0, maxWords)
	for _, w := range clean[:maxWords] {
		if counts[w] == 0 {
			order = append(order, w)
		}
		counts[w]++
	}

	for _, word := range order {
		if counts[word] > 10 {
			continue
		}
		l = append(l, word)
	}

	return l, nil
}
|
|
|
|
|
2017-03-19 21:43:14 +00:00
|
|
|
// Wordlist prepares the mail for training
|
2017-05-10 17:23:25 +00:00
|
|
|
func (m *Mail) Wordlist() (w []string, err error) {
|
2017-03-19 21:43:14 +00:00
|
|
|
var s string
|
|
|
|
|
2017-03-10 21:41:49 +00:00
|
|
|
if m.Subject != nil {
|
2017-03-19 21:43:14 +00:00
|
|
|
s = s + " " + *m.Subject
|
2017-03-10 21:41:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if m.Body != nil {
|
2017-03-19 21:43:14 +00:00
|
|
|
s = s + " " + *m.Body
|
2017-03-10 21:41:49 +00:00
|
|
|
}
|
|
|
|
|
2017-05-10 17:23:25 +00:00
|
|
|
w, err = wordlist(s)
|
2017-03-19 21:43:14 +00:00
|
|
|
|
2017-05-10 17:23:25 +00:00
|
|
|
return w, err
|
2017-03-10 21:41:49 +00:00
|
|
|
}
|
|
|
|
|
2017-05-13 22:34:54 +00:00
|
|
|
// cleanWordlist combines Clean and Wordlist in one internal function
|
|
|
|
func (m *Mail) cleanWordlist() (w []string, err error) {
|
|
|
|
err = m.Clean()
|
|
|
|
if err != nil {
|
|
|
|
return w, err
|
|
|
|
}
|
|
|
|
|
|
|
|
w, err = m.Wordlist()
|
|
|
|
|
|
|
|
return w, err
|
|
|
|
}
|
|
|
|
|
2017-06-05 16:20:03 +00:00
|
|
|
// LoadMails loads all mails from a given slice of Maildirs
|
2017-05-08 03:29:25 +00:00
|
|
|
func LoadMails(d []Maildir) (mails map[Maildir][]*Mail, err error) {
|
|
|
|
mails = make(map[Maildir][]*Mail)
|
|
|
|
|
|
|
|
// create missing directories and write index
|
|
|
|
for _, val := range d {
|
|
|
|
var m []*Mail
|
|
|
|
m, err = val.Index()
|
|
|
|
if err != nil {
|
|
|
|
return mails, err
|
|
|
|
}
|
|
|
|
|
|
|
|
mails[val] = m
|
|
|
|
}
|
|
|
|
|
|
|
|
return mails, nil
|
|
|
|
}
|
2017-06-05 16:20:03 +00:00
|
|
|
|
|
|
|
// LoadMaildirs creates Maildirs and required directories, if missing
|
|
|
|
func LoadMaildirs(d []Maildir) (err error) {
|
|
|
|
|
|
|
|
// create missing directories and write index
|
|
|
|
for _, val := range d {
|
|
|
|
err := val.CreateDirs()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|