mirror of
https://github.com/carlostrub/sisyphus
synced 2024-10-31 09:20:15 +00:00
208 lines
4.6 KiB
Go
208 lines
4.6 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"mime/quotedprintable"
|
|
"os"
|
|
"strings"
|
|
|
|
"github.com/jbrukh/bayesian"
|
|
"github.com/kennygrant/sanitize"
|
|
"github.com/luksen/maildir"
|
|
)
|
|
|
|
const (
|
|
// good is the class of good mails that are not supposed to be Spam
|
|
good bayesian.Class = "Good"
|
|
// junk is the class of Spam mails
|
|
junk bayesian.Class = "Junk"
|
|
)
|
|
|
|
// Processed maps e-mail IDs to a flag that is true when the mail is Junk.
// It is declared without a value, so it is nil until a map is assigned to it.
var Processed map[string]bool
|
|
|
|
// Mail describes a single message of a Maildir.
type Mail struct {
	// Key is the message's key inside the Maildir.
	Key string
	// Subject points to the subject text; nil while nothing has been loaded.
	Subject *string
	// Body points to the body text; nil while nothing has been loaded.
	Body *string
	// Junk is true for mails that are considered spam.
	Junk bool
}
|
|
|
|
// Classifiers contains the classifiers for mail subjects and bodies
|
|
type Classifiers struct {
|
|
Subject, Body *bayesian.Classifier
|
|
}
|
|
|
|
// Index loads all mail keys from the Maildir directory for processing.
|
|
func Index(d string) (m []*Mail, err error) {
|
|
|
|
g, err := maildir.Dir(d).Keys()
|
|
if err != nil {
|
|
return m, err
|
|
}
|
|
for _, val := range g {
|
|
var new Mail
|
|
new.Key = val
|
|
m = append(m, &new)
|
|
}
|
|
|
|
j, err := maildir.Dir(d + "/.Junk").Keys()
|
|
if err != nil {
|
|
return m, err
|
|
}
|
|
for _, val := range j {
|
|
var new Mail
|
|
new.Key = val
|
|
new.Junk = true
|
|
m = append(m, &new)
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
// Learn initially classifies all mails and returns the respective classifiers.
|
|
func (m *Mail) Learn() (c Classifiers, err error) {
|
|
return
|
|
}
|
|
|
|
// trimStringFromBase64 cuts s off at the first occurrence of the header line
// "Content-Transfer-Encoding: base64".
//
// Everything from the marker onwards is dropped, together with the single
// byte directly in front of it. If the marker is the very first thing in s
// there is nothing in front of it, so the empty string is returned. If the
// marker does not occur, s is returned unchanged.
func trimStringFromBase64(s string) string {
	const marker = "Content-Transfer-Encoding: base64"

	idx := strings.Index(s, marker)
	switch {
	case idx < 0:
		// No base64 part present.
		return s
	case idx == 0:
		// s[:idx-1] would be s[:-1] and panic; nothing precedes the marker.
		return ""
	default:
		return s[:idx-1]
	}
}
|
|
|
|
func cleanString(i string) (s string, err error) {
|
|
|
|
s = trimStringFromBase64(i)
|
|
s = sanitize.Accents(s)
|
|
s = sanitize.HTML(s)
|
|
s = strings.ToLower(s)
|
|
s = strings.Replace(s, "!", " ", -1)
|
|
s = strings.Replace(s, "#", " ", -1)
|
|
s = strings.Replace(s, "$", " ", -1)
|
|
s = strings.Replace(s, "%", " ", -1)
|
|
s = strings.Replace(s, "&", " ", -1)
|
|
s = strings.Replace(s, "(", " ", -1)
|
|
s = strings.Replace(s, ")", " ", -1)
|
|
s = strings.Replace(s, "*", " ", -1)
|
|
s = strings.Replace(s, "+", " ", -1)
|
|
s = strings.Replace(s, ",", " ", -1)
|
|
s = strings.Replace(s, "-", " ", -1)
|
|
s = strings.Replace(s, ".", " ", -1)
|
|
s = strings.Replace(s, "/", " ", -1)
|
|
s = strings.Replace(s, ":", " ", -1)
|
|
s = strings.Replace(s, ";", " ", -1)
|
|
s = strings.Replace(s, "<", " ", -1)
|
|
s = strings.Replace(s, "=", " ", -1)
|
|
s = strings.Replace(s, ">", " ", -1)
|
|
s = strings.Replace(s, "@", " ", -1)
|
|
s = strings.Replace(s, "[", " ", -1)
|
|
s = strings.Replace(s, "\"", " ", -1)
|
|
s = strings.Replace(s, "\\", " ", -1)
|
|
s = strings.Replace(s, "\n", " ", -1)
|
|
s = strings.Replace(s, "\t", " ", -1)
|
|
s = strings.Replace(s, "]", " ", -1)
|
|
s = strings.Replace(s, "^", " ", -1)
|
|
s = strings.Replace(s, "_", " ", -1)
|
|
s = strings.Replace(s, "{", " ", -1)
|
|
s = strings.Replace(s, "|", " ", -1)
|
|
s = strings.Replace(s, "}", " ", -1)
|
|
|
|
s = strings.Replace(s, "this is a multi part message in mime format", " ", -1)
|
|
s = strings.Replace(s, "nextpart", " ", -1)
|
|
s = strings.Replace(s, "content type", " ", -1)
|
|
s = strings.Replace(s, "text plain", " ", -1)
|
|
s = strings.Replace(s, "charset", " ", -1)
|
|
s = strings.Replace(s, "content transfer encoding", " ", -1)
|
|
s = strings.Replace(s, "quoted printable", " ", -1)
|
|
s = strings.Replace(s, "text html", " ", -1)
|
|
s = strings.Replace(s, "cp 850", " ", -1)
|
|
|
|
for i := 0; i < 10; i++ {
|
|
s = strings.Replace(s, " ", " ", -1)
|
|
}
|
|
|
|
return s, nil
|
|
}
|
|
|
|
// Clean prepares the mail's subject and body for training
|
|
func (m *Mail) Clean() error {
|
|
if m.Subject != nil {
|
|
s, err := cleanString(*m.Subject)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
m.Subject = &s
|
|
}
|
|
|
|
if m.Body != nil {
|
|
b, err := cleanString(*m.Body)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
m.Body = &b
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Load reads a mail's subject and body
|
|
func (m *Mail) Load(d string) error {
|
|
|
|
message, err := maildir.Dir(d).Message(m.Key)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// get Subject
|
|
if m.Subject != nil {
|
|
return errors.New("there is already a subject")
|
|
}
|
|
subject := message.Header.Get("Subject")
|
|
m.Subject = &subject
|
|
|
|
// get Body
|
|
bQ := quotedprintable.NewReader(message.Body)
|
|
var b []string
|
|
bScanner := bufio.NewScanner(bQ)
|
|
for bScanner.Scan() {
|
|
raw := bScanner.Text()
|
|
b = append(b, raw)
|
|
}
|
|
|
|
body := strings.Join(b, " ")
|
|
if m.Body != nil {
|
|
return errors.New("there is already a body")
|
|
}
|
|
m.Body = &body
|
|
|
|
return nil
|
|
}
|
|
|
|
func main() {
|
|
// Get the Maildir to be handled
|
|
wd, err := os.Getwd()
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
maildir := flag.String("d", wd+"/Maildir", "Path of the Maildir to be handled")
|
|
flag.Parse()
|
|
|
|
// Load the Maildir content
|
|
mails, err := Index(*maildir)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
|
|
fmt.Println(mails)
|
|
|
|
// Create a classifier
|
|
//classifier := bayesian.NewClassifier(Good, Junk)
|
|
}
|