2
0
mirror of https://github.com/carlostrub/sisyphus synced 2024-11-16 06:12:51 +00:00
sisyphus/mail.go

291 lines
5.8 KiB
Go
Raw Normal View History

2017-04-15 20:23:26 +00:00
package sisyphus
2017-03-09 19:05:51 +00:00
import (
"bufio"
"errors"
"log"
"math"
2017-03-09 19:05:51 +00:00
"mime/quotedprintable"
"os"
2017-03-10 21:41:49 +00:00
"regexp"
2017-03-19 21:43:14 +00:00
"strconv"
2017-03-09 19:05:51 +00:00
"strings"
2017-03-19 21:43:14 +00:00
"github.com/boltdb/bolt"
2017-03-09 19:05:51 +00:00
"github.com/kennygrant/sanitize"
"github.com/luksen/maildir"
)
const (
// Good is the value stored in the database for a processed mail that was
// classified as good (i.e. not junk).
Good = "0"
// Junk is the value stored in the database for a processed mail that was
// classified as junk.
Junk = "1"
)
2017-04-25 17:01:11 +00:00
// Maildir is the filesystem path of a Maildir directory. Its methods treat
// the string value as the root under which the mail folders live.
type Maildir string
2017-03-09 19:05:51 +00:00
// Mail describes a single mail inside a Maildir: the key identifying it,
// its (optionally loaded) content and its classification state.
type Mail struct {
	// Key is the Maildir key used to look the message up.
	Key string
	// Subject holds the mail's subject; it is nil until the mail is loaded.
	Subject *string
	// Body holds the mail's body text; it is nil until the mail is loaded.
	Body *string
	// Junk reports whether the mail is located in (or was classified into)
	// the junk folder.
	Junk bool
	// New marks a mail that still sits in the "new" folder.
	New bool
}
// CreateDirs creates all the required dirs -- if not already there.
2017-04-25 17:01:11 +00:00
func (d Maildir) CreateDirs() {
dir := string(d)
log.Println("create missing directories for Maildir " + dir)
2017-04-25 17:01:11 +00:00
os.MkdirAll(dir+"/.Junk/cur", 0700)
os.MkdirAll(dir+"/new", 0700)
os.MkdirAll(dir+"/cur", 0700)
return
}
2017-03-09 19:05:51 +00:00
// Index loads all mail keys from the Maildir directory for processing.
2017-04-25 17:01:11 +00:00
func (d Maildir) Index() (m []*Mail, err error) {
2017-04-25 17:01:11 +00:00
dir := string(d)
log.Println("start indexing mails in " + dir)
dirs := []string{dir, dir + "/.Junk"}
for _, val := range dirs {
j, err := maildir.Dir(val).Keys()
if err != nil {
return m, err
}
2017-04-25 17:01:11 +00:00
for _, v := range j {
var new Mail
2017-04-25 17:01:11 +00:00
new.Key = v
if val == dir+"/.Junk" {
new.Junk = true
}
m = append(m, &new)
}
2017-03-09 19:05:51 +00:00
}
2017-04-25 17:01:11 +00:00
log.Println("all mails in " + dir + " indexed")
2017-03-09 19:05:51 +00:00
return m, nil
}
// Load reads a mail's subject and body.
//
// d is the Maildir root; for mails flagged as Junk the message is looked up
// in d + "/.Junk" instead. The body is read through a quoted-printable
// decoder line by line and the lines are joined with single spaces.
//
// Load returns an error if the subject or body has already been set, or if
// the message cannot be opened. In both cases the mail is left unmodified.
func (m *Mail) Load(d string) error {
	// Check both guards before touching anything so that a failure never
	// leaves the mail half-loaded.
	if m.Subject != nil {
		return errors.New("there is already a subject")
	}
	if m.Body != nil {
		return errors.New("there is already a body")
	}

	if m.Junk {
		d = d + "/.Junk"
	}
	message, err := maildir.Dir(d).Message(m.Key)
	if err != nil {
		return err
	}

	subject := message.Header.Get("Subject")

	var lines []string
	scanner := bufio.NewScanner(quotedprintable.NewReader(message.Body))
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	if scanErr := scanner.Err(); scanErr != nil {
		// Keep the best-effort behaviour (whatever was read is used), but
		// make the truncation visible instead of dropping the error.
		log.Print("body of " + m.Key + " only partially read: " + scanErr.Error())
	}
	body := strings.Join(lines, " ")

	m.Subject = &subject
	m.Body = &body
	return nil
}
2017-03-09 19:05:51 +00:00
// trimStringFromBase64 cuts s off at the first occurrence of the marker
// "Content-Transfer-Encoding: base64". The marker, everything after it and
// the single byte directly in front of it are removed. Strings without the
// marker are returned unchanged; a string that starts with the marker
// yields the empty string.
func trimStringFromBase64(s string) string {
	idx := strings.Index(s, "Content-Transfer-Encoding: base64")
	switch {
	case idx < 0:
		return s
	case idx == 0:
		// Nothing precedes the marker; s[:idx-1] would panic here.
		return ""
	default:
		return s[:idx-1]
	}
}
func cleanString(i string) (s string) {
2017-03-09 19:05:51 +00:00
s = sanitize.Accents(i)
2017-03-09 19:05:51 +00:00
s = sanitize.HTML(s)
s = strings.ToLower(s)
bad := []string{
"boundary=", "charset", "content-transfer-encoding",
"content-type", "image/jpeg", "multipart/alternative",
"multipart/related", "name=", "nextpart", "quoted-printable",
"text/html", "text/plain", "this email must be viewed in html mode",
"this is a multi-part message in mime format",
"windows-1251", "windows-1252", "!", "#", "$", "%", "&", "'",
"(", ")", "*", "+", ",", ". ", "<", "=", ">", "?", "@", "[",
"\"", "\\", "\n", "\t", "]", "^", "_", "{", "|", "}",
}
for _, b := range bad {
s = strings.Replace(s, b, " ", -1)
}
2017-03-09 19:05:51 +00:00
for i := 0; i < 10; i++ {
s = strings.Replace(s, " ", " ", -1)
}
return s
}
// Clean cleans the mail's subject and body
func (m *Mail) Clean() error {
if m.Subject != nil {
s := trimStringFromBase64(*m.Subject)
s = cleanString(s)
m.Subject = &s
}
if m.Body != nil {
b := trimStringFromBase64(*m.Body)
b = cleanString(b)
m.Body = &b
}
return nil
2017-03-09 19:05:51 +00:00
}
2017-03-10 21:41:49 +00:00
// wordlistPattern matches tokens consisting solely of lowercase ASCII
// letters. It is compiled once instead of once per token.
var wordlistPattern = regexp.MustCompile(`^[a-z]+$`)

// wordlist takes a string of space separated text and returns the unique
// words it contains as a slice.
//
// A token counts as a word when it is 4 to 10 bytes long and consists only
// of the letters a-z. Only the first 200 such words are considered, and a
// word occurring more than 10 times among them is dropped. The result lists
// the words in order of first appearance.
func wordlist(s string) (l []string) {
	var clean []string
	for _, w := range strings.Split(s, " ") {
		// no long or too short words
		if length := len(w); length < 4 || length > 10 {
			continue
		}
		// no numbers, special characters, etc. -- only words
		if !wordlistPattern.MatchString(w) {
			continue
		}
		clean = append(clean, w)
	}

	// only the first 200 words count
	maxWords := int(math.Min(200, float64(len(clean))))

	counts := make(map[string]int, maxWords)
	order := make([]string, 0, maxWords)
	for _, w := range clean[:maxWords] {
		if counts[w] == 0 {
			// Remember first appearance so the output does not depend on
			// map iteration order.
			order = append(order, w)
		}
		counts[w]++
	}

	for _, w := range order {
		if counts[w] > 10 {
			continue
		}
		l = append(l, w)
	}
	return l
}
2017-03-19 21:43:14 +00:00
// Wordlist prepares the mail for training
func (m *Mail) Wordlist() (w []string) {
var s string
2017-03-10 21:41:49 +00:00
if m.Subject != nil {
2017-03-19 21:43:14 +00:00
s = s + " " + *m.Subject
2017-03-10 21:41:49 +00:00
}
if m.Body != nil {
2017-03-19 21:43:14 +00:00
s = s + " " + *m.Body
2017-03-10 21:41:49 +00:00
}
2017-03-19 21:43:14 +00:00
w = wordlist(s)
return w
2017-03-10 21:41:49 +00:00
}
// Classify analyses the mail and decides whether it is Junk or Good
2017-03-19 21:43:14 +00:00
func (m *Mail) Classify(db *bolt.DB) error {
err := m.Clean()
if err != nil {
return err
}
list := m.Wordlist()
2017-03-19 22:35:13 +00:00
scoreG, scoreJ, ju := LogScores(db, list)
2017-03-19 21:43:14 +00:00
log.Print("Classified " + m.Key + " as Junk=" + strconv.FormatBool(m.Junk) +
" (good: " + strconv.FormatFloat(scoreG, 'f', 4, 64) +
", junk: " + strconv.FormatFloat(scoreJ, 'f', 4, 64) + ")")
2017-03-09 19:05:51 +00:00
2017-03-19 22:35:13 +00:00
// Move mails around after classification
if m.New && ju {
m.Junk = ju
err := os.Rename("./new/"+m.Key, "./.Junk/cur/"+m.Key)
if err != nil {
return err
}
log.Print("Moved " + m.Key + " from new to Junk folder")
}
if m.New == false && m.Junk && ju == false {
err := os.Rename("./.Junk/cur/"+m.Key, "./cur/"+m.Key)
if err != nil {
return err
}
m.Junk = ju
log.Print("Moved " + m.Key + " from Junk to Good folder")
}
if m.New == false && ju && m.Junk == false {
err := os.Rename("./cur/"+m.Key, "./.Junk/cur/"+m.Key)
if err != nil {
return err
}
m.Junk = ju
log.Print("Moved " + m.Key + " from Good to Junk folder")
}
// Inform the DB about a processed mail
db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Processed"))
bMails := b.Bucket([]byte("Mails"))
if ju {
err := bMails.Put([]byte(m.Key), []byte(Junk))
2017-03-19 22:35:13 +00:00
if err != nil {
return err
}
} else {
err := bMails.Put([]byte(m.Key), []byte(Good))
2017-03-19 22:35:13 +00:00
if err != nil {
return err
}
}
return err
})
2017-03-09 19:05:51 +00:00
return nil
}
// Learn adds the words to the respective list and unlearns on the other, if
// the mail has been moved from there.
2017-03-19 22:35:13 +00:00
func (m *Mail) Learn(db *bolt.DB) error {
// Placeholder: no learning is performed yet. db is not used and nil is
// returned unconditionally.
return nil
}