2
0
mirror of https://github.com/carlostrub/sisyphus synced 2024-10-31 09:20:15 +00:00
sisyphus/classify.go

199 lines
4.6 KiB
Go
Raw Normal View History

2017-04-15 20:23:26 +00:00
package sisyphus
import (
2017-05-08 03:29:25 +00:00
"errors"
"log"
"os"
"strconv"
"github.com/boltdb/bolt"
2017-05-08 03:29:25 +00:00
"github.com/gonum/stat"
"github.com/retailnext/hllpp"
)
2017-05-08 03:29:25 +00:00
// classificationPrior returns the prior probabilities for good and junk
// classes.
2017-05-08 03:29:25 +00:00
func classificationPrior(db *bolt.DB) (g float64, err error) {
2017-05-08 03:29:25 +00:00
err = db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Wordlists"))
2017-05-08 03:29:25 +00:00
good := b.Bucket([]byte("Good"))
2017-03-19 21:43:14 +00:00
gN := float64(good.Stats().KeyN)
2017-05-08 03:29:25 +00:00
junk := b.Bucket([]byte("Junk"))
2017-03-19 21:43:14 +00:00
jN := float64(junk.Stats().KeyN)
2017-05-08 03:29:25 +00:00
// division by zero means there are no learned mails so far
if (gN + jN) == 0 {
return errors.New("no mails have been classified so far")
}
2017-03-19 21:43:14 +00:00
g = gN / (gN + jN)
return nil
})
2017-05-08 03:29:25 +00:00
return g, err
}
2017-05-13 22:34:54 +00:00
// classificationLikelihoodWordcounts gets wordcounts from database to be used
// in Likelihood calculation
func classificationLikelihoodWordcounts(db *bolt.DB, word string) (gN, jN float64, err error) {
2017-05-08 03:29:25 +00:00
err = db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Wordlists"))
2017-05-08 03:29:25 +00:00
good := b.Bucket([]byte("Good"))
2017-05-08 03:29:25 +00:00
gWordRaw := good.Get([]byte(word))
2017-05-13 21:22:23 +00:00
if len(gWordRaw) > 0 {
2017-05-08 03:29:25 +00:00
gWordHLL, err := hllpp.Unmarshal(gWordRaw)
if err != nil {
return err
}
2017-05-13 22:34:54 +00:00
gN = float64(gWordHLL.Count())
2017-05-08 03:29:25 +00:00
}
junk := b.Bucket([]byte("Junk"))
2017-05-08 03:29:25 +00:00
jWordRaw := junk.Get([]byte(word))
2017-05-13 21:22:23 +00:00
if len(jWordRaw) > 0 {
2017-05-08 03:29:25 +00:00
jWordHLL, err := hllpp.Unmarshal(jWordRaw)
if err != nil {
return err
}
2017-05-13 22:34:54 +00:00
jN = float64(jWordHLL.Count())
2017-05-08 03:29:25 +00:00
}
2017-05-13 22:34:54 +00:00
return nil
})
return gN, jN, err
}
// classificationLikelihoodStatistics gets global statistics from database to
// be used in Likelihood calculation
func classificationLikelihoodStatistics(db *bolt.DB, word string) (gTotal, jTotal float64, err error) {
err = db.View(func(tx *bolt.Tx) error {
2017-05-08 03:29:25 +00:00
p := tx.Bucket([]byte("Statistics"))
2017-05-13 21:22:23 +00:00
gRaw := p.Get([]byte("ProcessedGood"))
if len(gRaw) > 0 {
gHLL, err := hllpp.Unmarshal(gRaw)
if err != nil {
return err
}
2017-05-13 22:34:54 +00:00
gTotal = float64(gHLL.Count())
2017-05-08 03:29:25 +00:00
}
2017-05-13 21:22:23 +00:00
jRaw := p.Get([]byte("ProcessedJunk"))
if len(jRaw) > 0 {
jHLL, err := hllpp.Unmarshal(jRaw)
if err != nil {
return err
}
2017-05-13 22:34:54 +00:00
jTotal = float64(jHLL.Count())
2017-05-08 03:29:25 +00:00
}
if gTotal == 0 {
2017-05-13 21:22:23 +00:00
return errors.New("no good mails have yet been classified")
2017-05-08 03:29:25 +00:00
}
if jTotal == 0 {
2017-05-13 21:22:23 +00:00
return errors.New("no junk mails have yet been classified")
2017-05-08 03:29:25 +00:00
}
return nil
})
2017-05-13 22:34:54 +00:00
return gTotal, jTotal, err
}
// classificationLikelihood returns P(W|C_j) -- the probability of seeing a
// particular word W in a document of this class.
func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error) {
gN, jN, err := classificationLikelihoodWordcounts(db, word)
if err != nil {
return g, j, err
}
gTotal, jTotal, err := classificationLikelihoodStatistics(db, word)
if err != nil {
return g, j, err
}
g = gN / gTotal
j = jN / jTotal
2017-05-10 17:23:25 +00:00
return g, j, err
2017-05-08 03:29:25 +00:00
}
// classificationWord produces the conditional probability of a word belonging
// to good or junk using the classic Bayes' rule.
//
// The returned g is P(good|word):
//
//	P(word|good)*P(good) / (P(word|good)*P(good) + P(word|junk)*P(junk))
//
// When the denominator is zero -- e.g. the word has never been seen in either
// class -- the word carries no evidence and the prior P(good) is returned
// instead of the NaN that a 0/0 division would produce.
func classificationWord(db *bolt.DB, word string) (g float64, err error) {
	priorG, err := classificationPrior(db)
	if err != nil {
		return 0, err
	}

	likelihoodG, likelihoodJ, err := classificationLikelihood(db, word)
	if err != nil {
		return 0, err
	}

	evidence := likelihoodG*priorG + likelihoodJ*(1-priorG)
	if evidence == 0 {
		// No information about this word: stay with the prior so that a
		// single unknown word does not turn the whole result into NaN.
		return priorG, nil
	}

	return likelihoodG * priorG / evidence, nil
}
// Classify analyses a new mail (a mail that arrived in the "new" directory),
// decides whether it is junk and -- if so -- moves it to the Junk folder. If
// it is not junk, the mail is untouched so it can be handled by the mail
// client.
func (m *Mail) Classify(db *bolt.DB) error {
2017-05-13 22:34:54 +00:00
list, err := m.cleanWordlist()
2017-05-10 17:23:25 +00:00
if err != nil {
return err
}
2017-05-13 21:22:23 +00:00
junk, _, err := Junk(db, list)
if err != nil {
return err
}
log.Print("Classified " + m.Key + " as Junk=" + strconv.FormatBool(m.Junk))
// Move mail around if junk.
if junk {
m.Junk = junk
err := os.Rename("./new/"+m.Key, "./.Junk/cur/"+m.Key)
if err != nil {
return err
}
log.Print("Moved " + m.Key + " from new to Junk folder")
}
return nil
}
2017-05-08 03:29:25 +00:00
// Junk returns true if the wordlist is classified as a junk mail using Bayes'
2017-05-13 21:22:23 +00:00
// rule. If required, it also returns the calculated probability of being junk,
// but this is typically not needed.
func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) {
2017-05-08 03:29:25 +00:00
var probabilities []float64
for _, val := range wordlist {
p, err := classificationWord(db, val)
if err != nil {
2017-05-13 21:22:23 +00:00
return false, prob, err
2017-05-08 03:29:25 +00:00
}
probabilities = append(probabilities, p)
}
2017-05-13 21:22:23 +00:00
prob = stat.HarmonicMean(probabilities, nil)
if prob < 0.5 {
return true, (1 - prob), nil
}
2017-05-13 21:22:23 +00:00
return false, (1 - prob), nil
}