add a counter bucket to processed, move some stuff out of main, clean up

mail, create bayesian updater
master
Carlo Strub 7 years ago
parent ca967c197a
commit 0afac8f4c8

@ -0,0 +1,100 @@
/*
Part of this code is borrowed from github.com/jbrukh/bayesian, which is
published under a BSD 3-Clause License.
*/
package main
import (
"math"
"strconv"
"github.com/boltdb/bolt"
)
// classificationPriors returns the prior probabilities for the good and junk
// classes. Each prior is that class's share of the keys stored in the "Good"
// and "Junk" buckets nested under the "Wordlists" bucket.
//
// A missing bucket is counted as empty. When there is no data at all, both
// classes are considered equally likely (0.5 each) rather than dividing zero
// by zero, which previously produced NaN priors.
func classificationPriors(db *bolt.DB) (g, j float64) {
	// The closure never returns an error, so View can only fail when the
	// read transaction cannot be started. In that case g and j keep their
	// zero values, which matches the behaviour before this check existed.
	_ = db.View(func(tx *bolt.Tx) error {
		var gN, jN float64
		if b := tx.Bucket([]byte("Wordlists")); b != nil {
			if good := b.Bucket([]byte("Good")); good != nil {
				gN = float64(good.Stats().KeyN)
			}
			if junk := b.Bucket([]byte("Junk")); junk != nil {
				jN = float64(junk.Stats().KeyN)
			}
		}

		total := gN + jN
		if total == 0 {
			// Nothing learned yet: fall back to uninformative priors.
			g, j = 0.5, 0.5
			return nil
		}

		g = gN / total
		j = jN / total
		return nil
	})

	return g, j
}
// classificationWordProb returns P(W|C_j) -- the probability of seeing
// a particular word W in a document of this class.
//
// g is the count stored for word in Wordlists/Good divided by the number of
// good mails, where the number of good mails is the key count of
// Processed/Mails minus the "Junk" value in Processed/Counters. j is the count
// stored for word in Wordlists/Junk divided by that same "Junk" counter.
//
// A missing bucket, a missing key or an unparsable value counts as zero. A
// class without any mails yields probability 0 instead of NaN or Inf from a
// division by zero.
func classificationWordProb(db *bolt.DB, word string) (g, j float64) {
	// nested returns the named sub-bucket and tolerates a missing parent.
	nested := func(parent *bolt.Bucket, name string) *bolt.Bucket {
		if parent == nil {
			return nil
		}
		return parent.Bucket([]byte(name))
	}

	// counter reads a number stored as text under key in b.
	counter := func(b *bolt.Bucket, key string) float64 {
		if b == nil {
			return 0
		}
		n, err := strconv.ParseFloat(string(b.Get([]byte(key))), 64)
		if err != nil {
			// An absent key reads as "" and fails to parse; treat it as zero.
			return 0
		}
		return n
	}

	// The closure never returns an error; if the transaction cannot be
	// started, g and j simply keep their zero values.
	_ = db.View(func(tx *bolt.Tx) error {
		wordlists := tx.Bucket([]byte("Wordlists"))
		gN := counter(nested(wordlists, "Good"), word)
		jN := counter(nested(wordlists, "Junk"), word)

		processed := tx.Bucket([]byte("Processed"))

		// This local used to be called j, which shadowed the named result and
		// made the function always return j == 0.
		junkMails := counter(nested(processed, "Counters"), "Junk")

		var totalMails float64
		if mails := nested(processed, "Mails"); mails != nil {
			totalMails = float64(mails.Stats().KeyN)
		}
		goodMails := totalMails - junkMails

		if goodMails > 0 {
			g = gN / goodMails
		}
		if junkMails > 0 {
			j = jN / junkMails
		}
		return nil
	})

	return g, j
}
// LogScores computes log-domain scores for the good and the junk class of a
// mail that is represented by its word list.
//
// Each score starts as the natural logarithm of the class prior obtained from
// classificationPriors. For every word in wordlist, the logarithm of the
// per-class word probability from classificationWordProb is then added.
// Summing logarithms instead of multiplying probabilities keeps the result
// away from floating point underflow. Scores are therefore usually negative.
//
// junk is true when scoreJ equals the larger of the two scores, so a tie is
// reported as junk.
func LogScores(db *bolt.DB, wordlist []string) (scoreG, scoreJ float64, junk bool) {
	goodPrior, junkPrior := classificationPriors(db)

	// Start from the log priors and accumulate the per-word evidence.
	scoreG, scoreJ = math.Log(goodPrior), math.Log(junkPrior)
	for i := range wordlist {
		pGood, pJunk := classificationWordProb(db, wordlist[i])
		scoreG += math.Log(pGood)
		scoreJ += math.Log(pJunk)
	}

	junk = scoreJ == math.Max(scoreG, scoreJ)

	return scoreG, scoreJ, junk
}

@ -1,12 +1,15 @@
package main
import (
"log"
"github.com/boltdb/bolt"
)
// openDB creates and opens a new database and its respective buckets (if required)
func openDB(maildir string) (db *bolt.DB, err error) {
log.Println("loading database")
// Open the sisyphus.db data file in your current directory.
// It will be created if it doesn't exist.
db, err = bolt.Open(maildir+"/sisyphus.db", 0600, nil)
@ -26,6 +29,32 @@ func openDB(maildir string) (db *bolt.DB, err error) {
return db, err
}
// Create DB bucket for Mails inside bucket Processed
err = db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Processed"))
_, err := b.CreateBucketIfNotExists([]byte("Mails"))
if err != nil {
return err
}
return nil
})
if err != nil {
return db, err
}
// Create DB bucket for Counters inside bucket Processed
err = db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Processed"))
_, err := b.CreateBucketIfNotExists([]byte("Counters"))
if err != nil {
return err
}
return nil
})
if err != nil {
return db, err
}
// Create DB bucket for word lists
err = db.Update(func(tx *bolt.Tx) error {
_, err := tx.CreateBucketIfNotExists([]byte("Wordlists"))
@ -61,5 +90,6 @@ func openDB(maildir string) (db *bolt.DB, err error) {
return nil
})
log.Println("database loaded")
return db, err
}

@ -3,8 +3,10 @@ package main
import (
"bufio"
"errors"
"log"
"math"
"mime/quotedprintable"
"os"
"regexp"
"strings"
@ -19,9 +21,21 @@ type Mail struct {
Junk bool
}
// CreateDirs creates all the required dirs -- if not already there.
//
// The directories ".Junk/cur", "new" and "cur" are created below maildir with
// mode 0700. The function has no error result, so a directory that cannot be
// created is reported through the standard logger instead of being silently
// ignored. The remaining directories are still attempted.
func CreateDirs(maildir string) {
	log.Println("create missing directories")

	for _, sub := range []string{"/.Junk/cur", "/new", "/cur"} {
		if err := os.MkdirAll(maildir+sub, 0700); err != nil {
			log.Printf("create directory %q: %v", maildir+sub, err)
		}
	}
}
// Index loads all mail keys from the Maildir directory for processing.
func Index(d string) (m []*Mail, err error) {
log.Println("loading mails")
dirs := []string{d, d + "/.Junk"}
for _, dir := range dirs {
j, err := maildir.Dir(dir).Keys()
@ -38,9 +52,46 @@ func Index(d string) (m []*Mail, err error) {
}
}
log.Println("mails loaded")
return m, nil
}
// Load fetches the message identified by m.Key from the Maildir rooted at d
// (or from its ".Junk" folder when m.Junk is set) and stores the subject and
// the body on the mail.
//
// The body is decoded as quoted-printable and its lines are joined with single
// spaces. The scanner's error is not inspected, so reading stops silently at
// the first read or decoding problem and the body then holds only what was
// read up to that point.
//
// An error is returned when the message cannot be opened, or when the subject
// or the body has already been set. The body check happens after the subject
// has been stored.
func (m *Mail) Load(d string) error {
	dir := d
	if m.Junk {
		dir += "/.Junk"
	}

	message, err := maildir.Dir(dir).Message(m.Key)
	if err != nil {
		return err
	}

	// Subject
	if m.Subject != nil {
		return errors.New("there is already a subject")
	}
	subject := message.Header.Get("Subject")
	m.Subject = &subject

	// Body
	var lines []string
	for sc := bufio.NewScanner(quotedprintable.NewReader(message.Body)); sc.Scan(); {
		lines = append(lines, sc.Text())
	}

	if m.Body != nil {
		return errors.New("there is already a body")
	}
	body := strings.Join(lines, " ")
	m.Body = &body

	return nil
}
func trimStringFromBase64(s string) string {
if idx := strings.Index(s, "Content-Transfer-Encoding: base64"); idx != -1 {
return s[:idx-1]
@ -48,64 +99,46 @@ func trimStringFromBase64(s string) string {
return s
}
func cleanString(i string) (s string, err error) {
func cleanString(i string) (s string) {
s = trimStringFromBase64(i)
s = sanitize.Accents(s)
s = sanitize.Accents(i)
s = sanitize.HTML(s)
s = strings.ToLower(s)
s = strings.Replace(s, "boundary=", " ", -1)
s = strings.Replace(s, "charset", " ", -1)
s = strings.Replace(s, "content-transfer-encoding", " ", -1)
s = strings.Replace(s, "content-type", " ", -1)
s = strings.Replace(s, "image/jpeg", " ", -1)
s = strings.Replace(s, "multipart/alternative", " ", -1)
s = strings.Replace(s, "multipart/related", " ", -1)
s = strings.Replace(s, "name=", " ", -1)
s = strings.Replace(s, "nextpart", " ", -1)
s = strings.Replace(s, "quoted-printable", " ", -1)
s = strings.Replace(s, "text/html", " ", -1)
s = strings.Replace(s, "text/plain", " ", -1)
s = strings.Replace(s, "this email must be viewed in html mode", " ", -1)
s = strings.Replace(s, "this is a multi-part message in mime format", " ", -1)
s = strings.Replace(s, "windows-1251", " ", -1)
s = strings.Replace(s, "windows-1252", " ", -1)
s = strings.Replace(s, "!", " ", -1)
s = strings.Replace(s, "#", " ", -1)
s = strings.Replace(s, "$", " ", -1)
s = strings.Replace(s, "%", " ", -1)
s = strings.Replace(s, "&", " ", -1)
s = strings.Replace(s, "'", "", -1)
s = strings.Replace(s, "(", " ", -1)
s = strings.Replace(s, ")", " ", -1)
s = strings.Replace(s, "*", " ", -1)
s = strings.Replace(s, "+", " ", -1)
s = strings.Replace(s, ",", " ", -1)
s = strings.Replace(s, ". ", " ", -1)
s = strings.Replace(s, "<", " ", -1)
s = strings.Replace(s, "=", " ", -1)
s = strings.Replace(s, ">", " ", -1)
s = strings.Replace(s, "?", " ", -1)
s = strings.Replace(s, "@", " ", -1)
s = strings.Replace(s, "[", " ", -1)
s = strings.Replace(s, "\"", " ", -1)
s = strings.Replace(s, "\\", " ", -1)
s = strings.Replace(s, "\n", " ", -1)
s = strings.Replace(s, "\t", " ", -1)
s = strings.Replace(s, "]", " ", -1)
s = strings.Replace(s, "^", " ", -1)
s = strings.Replace(s, "_", " ", -1)
s = strings.Replace(s, "{", " ", -1)
s = strings.Replace(s, "|", " ", -1)
s = strings.Replace(s, "}", " ", -1)
bad := []string{
"boundary=", "charset", "content-transfer-encoding",
"content-type", "image/jpeg", "multipart/alternative",
"multipart/related", "name=", "nextpart", "quoted-printable",
"text/html", "text/plain", "this email must be viewed in html mode",
"this is a multi-part message in mime format",
"windows-1251", "windows-1252", "!", "#", "$", "%", "&", "'",
"(", ")", "*", "+", ",", ". ", "<", "=", ">", "?", "@", "[",
"\"", "\\", "\n", "\t", "]", "^", "_", "{", "|", "}",
}
for _, b := range bad {
s = strings.Replace(s, b, " ", -1)
}
for i := 0; i < 10; i++ {
s = strings.Replace(s, " ", " ", -1)
}
return s, nil
return s
}
// Clean normalises the mail's subject and body in place. Each field that is
// set is first passed through trimStringFromBase64 and then through
// cleanString. Fields that are nil are left untouched. The returned error is
// always nil.
func (m *Mail) Clean() error {
	// scrub runs the two cleaning steps and hands back a fresh pointer.
	scrub := func(text string) *string {
		cleaned := cleanString(trimStringFromBase64(text))
		return &cleaned
	}

	if m.Subject != nil {
		m.Subject = scrub(*m.Subject)
	}
	if m.Body != nil {
		m.Body = scrub(*m.Body)
	}

	return nil
}
// wordlist takes a string of space separated text and returns a list of unique
@ -151,26 +184,6 @@ func wordlist(s string) (l []string, err error) {
return l, nil
}
// Clean runs cleanString over the mail's subject and then its body, replacing
// each field that is set with the cleaned text. Fields that are nil are
// skipped. The first error reported by cleanString is returned immediately,
// and the field it occurred on is left unchanged.
func (m *Mail) Clean() error {
	for _, field := range []**string{&m.Subject, &m.Body} {
		if *field == nil {
			continue
		}

		cleaned, err := cleanString(**field)
		if err != nil {
			return err
		}
		*field = &cleaned
	}

	return nil
}
// Wordlists prepares the mail's subject and body for training
func (m *Mail) Wordlists() (subject, body []string, err error) {
if m.Subject != nil {
@ -190,44 +203,14 @@ func (m *Mail) Wordlists() (subject, body []string, err error) {
return subject, body, nil
}
// Load reads a mail's subject and body
func (m *Mail) Load(d string) error {
if m.Junk {
d = d + "/.Junk"
}
message, err := maildir.Dir(d).Message(m.Key)
if err != nil {
return err
}
// get Subject
if m.Subject != nil {
return errors.New("there is already a subject")
}
subject := message.Header.Get("Subject")
m.Subject = &subject
// get Body
bQ := quotedprintable.NewReader(message.Body)
var b []string
bScanner := bufio.NewScanner(bQ)
for bScanner.Scan() {
raw := bScanner.Text()
b = append(b, raw)
}
body := strings.Join(b, " ")
if m.Body != nil {
return errors.New("there is already a body")
}
m.Body = &body
// Classify analyses the mail and decides whether it is Junk or Good.
//
// NOTE(review): this is currently a placeholder. It performs no analysis,
// leaves the mail unchanged and always returns nil.
func (m *Mail) Classify() error {
return nil
}
// Classify identifies whether a mail is junk and then learns its words for the
// respective category
func (m *Mail) Classify() error {
// Learn adds the words to the respective list and unlearns on the other, if
// the mail has been moved from there.
//
// NOTE(review): this is currently a placeholder. It touches no word list and
// always returns nil.
func (m *Mail) Learn() error {
return nil
}

@ -419,7 +419,7 @@ var _ = Describe("Mail", func() {
Ω(subject).Should(Equal(
[]string{"eyes", "glasses", "headed", "serious", "trouble", "wear", "your"}))
Ω(body).Should(Equal(
[]string{"about", "associated", "aylesbury", "baron", "became", "being", "below", "brill", "bscribe", "buckingham", "building", "buildings", "built", "canada", "central", "clearing", "closure", "contacts", "converted", "despite", "discover", "duke", "email", "estate", "even", "extended", "ferdinand", "floor", "from", "full", "glasses", "goodness", "hour", "house", "improve", "improved", "initially", "junction", "know", "limited", "line", "link", "london", "manor", "marie", "marketing", "miles", "montreal", "near", "need", "next", "only", "other", "over", "ownership", "part", "passenger", "pictured", "place", "poor", "public", "quainton", "quality", "quebec", "railway", "renamed", "running", "self", "served", "short", "slow", "station", "success", "survive", "taken", "than", "that", "think", "today", "tramway", "trick", "unsu", "until", "very", "village", "ville", "vision", "wear", "weird", "were", "westcott", "will", "year", "youll", "your"}))
[]string{"about", "associated", "aylesbury", "baron", "became", "being", "below", "brill", "bscribe", "buckingham", "building", "buildings", "built", "canada", "central", "clearing", "closure", "contacts", "converted", "despite", "discover", "duke", "email", "estate", "even", "extended", "ferdinand", "floor", "from", "full", "glasses", "goodness", "hour", "house", "improve", "improved", "initially", "junction", "know", "limited", "line", "link", "london", "manor", "marie", "marketing", "miles", "montreal", "near", "need", "next", "only", "other", "over", "ownership", "part", "passenger", "pictured", "place", "poor", "public", "quainton", "quality", "quebec", "railway", "renamed", "rothschild", "running", "self", "served", "short", "slow", "station", "success", "survive", "taken", "than", "that", "think", "today", "tramway", "trick", "unsu", "until", "very", "village", "ville", "vision", "wear", "weird", "were", "westcott", "will", "year", "your"}))
})
It("Wordlist 6", func() {
@ -444,7 +444,7 @@ var _ = Describe("Mail", func() {
Ω(subject).Should(Equal(
[]string{"always", "form", "good", "super", "viagra", "with"}))
Ω(body).Should(Equal(
[]string{"amazon", "antiviral", "blockquote", "blood", "body", "canada", "cant", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "table", "terry", "these", "this", "time", "trademark", "united", "various", "view", "when", "wish", "with", "your"}))
[]string{"amazon", "antiviral", "blockquote", "blood", "body", "canada", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "table", "terry", "these", "this", "time", "trademark", "united", "various", "view", "when", "wish", "with", "your"}))
})
})
})

@ -5,6 +5,7 @@ import (
"log"
"os"
"os/signal"
"strings"
"syscall"
"github.com/boltdb/bolt"
@ -115,40 +116,46 @@ func main() {
log.Fatal("Sorry... only one Maildir supported as of today.")
}
log.Println("create directories if missing")
os.MkdirAll(maildirPaths[0]+"/.Junk/cur", 0700)
os.MkdirAll(maildirPaths[0]+"/new", 0700)
os.MkdirAll(maildirPaths[0]+"/cur", 0700)
CreateDirs(maildirPaths[0])
log.Println("loading mails")
mails, err := Index(maildirPaths[0])
if err != nil {
log.Fatal("Wrong path to Maildir")
}
log.Println("mails loaded")
// Open the database
log.Println("loading database")
db, err := openDB(maildirPaths[0])
if err != nil {
log.Fatal(err)
}
defer db.Close()
log.Println("database loaded")
// Handle all mails initially
// Handle all mails after startup
for i := range mails {
db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Processed"))
v := b.Get([]byte(mails[i].Key))
if len(v) == 0 {
mails[i].Classify()
err = mails[i].Classify()
if err != nil {
log.Print(err)
}
err = mails[i].Learn()
if err != nil {
log.Print(err)
}
}
if string(v) == good && mails[i].Junk == true {
mails[i].Classify()
err = mails[i].Learn()
if err != nil {
log.Print(err)
}
}
if string(v) == junk && mails[i].Junk == false {
mails[i].Classify()
err = mails[i].Learn()
if err != nil {
log.Print(err)
}
}
return nil
})
@ -167,12 +174,19 @@ func main() {
select {
case event := <-watcher.Events:
if event.Op&fsnotify.Create == fsnotify.Create {
log.Println("new mail:", event.Name)
m := s.Mail{
Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730",
mailName := strings.Split(event.Name, "/")
m := Mail{
Key: mailName[len(mailName)-1],
}
err := m.Classify()
err = m.Classify()
if err != nil {
log.Print(err)
}
err = m.Learn()
if err != nil {
log.Print(err)
}
}
case err := <-watcher.Errors:

Loading…
Cancel
Save