2017-04-15 20:23:26 +00:00
|
|
|
package sisyphus
|
2017-03-19 20:54:23 +00:00
|
|
|
|
|
|
|
import (
|
2017-05-10 03:45:11 +00:00
|
|
|
"os"
|
2018-01-11 20:35:27 +00:00
|
|
|
"path/filepath"
|
2017-03-19 20:54:23 +00:00
|
|
|
|
2017-05-18 19:21:46 +00:00
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
"github.com/boltdb/bolt"
|
2017-05-08 03:29:25 +00:00
|
|
|
"github.com/gonum/stat"
|
|
|
|
"github.com/retailnext/hllpp"
|
2017-03-19 20:54:23 +00:00
|
|
|
)
|
|
|
|
|
2017-05-08 03:29:25 +00:00
|
|
|
// classificationPrior returns the prior probabilities for good and junk
|
2017-03-19 20:54:23 +00:00
|
|
|
// classes.
|
2017-05-08 03:29:25 +00:00
|
|
|
func classificationPrior(db *bolt.DB) (g float64, err error) {
|
2017-03-19 20:54:23 +00:00
|
|
|
|
2017-05-15 22:26:44 +00:00
|
|
|
gTotal, jTotal, err := classificationStatistics(db)
|
|
|
|
if err != nil {
|
|
|
|
return g, err
|
|
|
|
}
|
2017-03-19 20:54:23 +00:00
|
|
|
|
2017-05-15 22:26:44 +00:00
|
|
|
return gTotal / (gTotal + jTotal), err
|
2017-03-19 20:54:23 +00:00
|
|
|
}
|
|
|
|
|
2017-05-13 22:34:54 +00:00
|
|
|
// classificationLikelihoodWordcounts gets wordcounts from database to be used
|
|
|
|
// in Likelihood calculation
|
|
|
|
func classificationLikelihoodWordcounts(db *bolt.DB, word string) (gN, jN float64, err error) {
|
2017-05-08 03:29:25 +00:00
|
|
|
|
|
|
|
err = db.View(func(tx *bolt.Tx) error {
|
2017-03-19 20:54:23 +00:00
|
|
|
b := tx.Bucket([]byte("Wordlists"))
|
2017-05-08 03:29:25 +00:00
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
good := b.Bucket([]byte("Good"))
|
2017-05-08 03:29:25 +00:00
|
|
|
gWordRaw := good.Get([]byte(word))
|
2017-05-13 21:22:23 +00:00
|
|
|
if len(gWordRaw) > 0 {
|
2017-09-17 17:17:43 +00:00
|
|
|
var gWordHLL *hllpp.HLLPP
|
|
|
|
gWordHLL, err = hllpp.Unmarshal(gWordRaw)
|
2017-05-08 03:29:25 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-05-13 22:34:54 +00:00
|
|
|
gN = float64(gWordHLL.Count())
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
2017-03-19 20:54:23 +00:00
|
|
|
junk := b.Bucket([]byte("Junk"))
|
2017-05-08 03:29:25 +00:00
|
|
|
jWordRaw := junk.Get([]byte(word))
|
2017-05-13 21:22:23 +00:00
|
|
|
if len(jWordRaw) > 0 {
|
2017-09-17 17:17:43 +00:00
|
|
|
var jWordHLL *hllpp.HLLPP
|
|
|
|
jWordHLL, err = hllpp.Unmarshal(jWordRaw)
|
2017-05-08 03:29:25 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-05-13 22:34:54 +00:00
|
|
|
jN = float64(jWordHLL.Count())
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
|
|
|
|
2017-05-13 22:34:54 +00:00
|
|
|
return nil
|
|
|
|
})
|
|
|
|
|
|
|
|
return gN, jN, err
|
|
|
|
}
|
|
|
|
|
2017-05-15 22:26:44 +00:00
|
|
|
// classificationStatistics gets global statistics from database to
|
2017-05-13 22:34:54 +00:00
|
|
|
// be used in Likelihood calculation
|
2017-05-15 22:26:44 +00:00
|
|
|
func classificationStatistics(db *bolt.DB) (gTotal, jTotal float64, err error) {
|
2017-05-13 22:34:54 +00:00
|
|
|
|
|
|
|
err = db.View(func(tx *bolt.Tx) error {
|
2017-05-08 03:29:25 +00:00
|
|
|
p := tx.Bucket([]byte("Statistics"))
|
2017-05-13 21:22:23 +00:00
|
|
|
gRaw := p.Get([]byte("ProcessedGood"))
|
|
|
|
if len(gRaw) > 0 {
|
2017-09-17 17:17:43 +00:00
|
|
|
var gHLL *hllpp.HLLPP
|
|
|
|
gHLL, err = hllpp.Unmarshal(gRaw)
|
2017-05-13 21:22:23 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-05-13 22:34:54 +00:00
|
|
|
gTotal = float64(gHLL.Count())
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
2017-05-13 21:22:23 +00:00
|
|
|
jRaw := p.Get([]byte("ProcessedJunk"))
|
|
|
|
if len(jRaw) > 0 {
|
2017-09-17 17:17:43 +00:00
|
|
|
var jHLL *hllpp.HLLPP
|
|
|
|
jHLL, err = hllpp.Unmarshal(jRaw)
|
2017-05-13 21:22:23 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-05-13 22:34:54 +00:00
|
|
|
jTotal = float64(jHLL.Count())
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
|
|
|
|
2017-05-28 20:53:43 +00:00
|
|
|
if gTotal == 0 && jTotal == 0 {
|
2017-06-05 13:33:32 +00:00
|
|
|
log.Warning("no mails have yet been learned")
|
|
|
|
return nil
|
2017-05-28 20:53:43 +00:00
|
|
|
}
|
2017-05-08 03:29:25 +00:00
|
|
|
if gTotal == 0 {
|
2017-06-05 13:33:32 +00:00
|
|
|
log.Warning("no good mails have yet been learned")
|
|
|
|
return nil
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
|
|
|
if jTotal == 0 {
|
2017-06-05 13:33:32 +00:00
|
|
|
log.Warning("no junk mails have yet been learned")
|
|
|
|
return nil
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
|
|
|
|
2017-03-19 20:54:23 +00:00
|
|
|
return nil
|
|
|
|
})
|
|
|
|
|
2017-05-13 22:34:54 +00:00
|
|
|
return gTotal, jTotal, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// classificationLikelihood returns P(W|C_j) -- the probability of seeing a
|
|
|
|
// particular word W in a document of this class.
|
|
|
|
func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error) {
|
|
|
|
|
|
|
|
gN, jN, err := classificationLikelihoodWordcounts(db, word)
|
|
|
|
if err != nil {
|
|
|
|
return g, j, err
|
|
|
|
}
|
|
|
|
|
2017-05-15 22:26:44 +00:00
|
|
|
gTotal, jTotal, err := classificationStatistics(db)
|
2017-05-13 22:34:54 +00:00
|
|
|
if err != nil {
|
|
|
|
return g, j, err
|
|
|
|
}
|
|
|
|
|
|
|
|
g = gN / gTotal
|
|
|
|
j = jN / jTotal
|
|
|
|
|
2017-05-10 17:23:25 +00:00
|
|
|
return g, j, err
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// classificationWord produces the conditional probability of a word belonging
|
|
|
|
// to good or junk using the classic Bayes' rule.
|
|
|
|
func classificationWord(db *bolt.DB, word string) (g float64, err error) {
|
|
|
|
|
|
|
|
priorG, err := classificationPrior(db)
|
|
|
|
if err != nil {
|
|
|
|
return g, err
|
|
|
|
}
|
|
|
|
|
|
|
|
likelihoodG, likelihoodJ, err := classificationLikelihood(db, word)
|
|
|
|
if err != nil {
|
|
|
|
return g, err
|
|
|
|
}
|
|
|
|
|
|
|
|
g = (likelihoodG * priorG) / (likelihoodG*priorG + likelihoodJ*(1-priorG))
|
|
|
|
|
|
|
|
return g, nil
|
2017-03-19 20:54:23 +00:00
|
|
|
}
|
|
|
|
|
2017-05-10 03:45:11 +00:00
|
|
|
// Classify analyses a new mail (a mail that arrived in the "new" directory),
|
|
|
|
// decides whether it is junk and -- if so -- moves it to the Junk folder. If
|
|
|
|
// it is not junk, the mail is untouched so it can be handled by the mail
|
|
|
|
// client.
|
2017-06-05 13:33:32 +00:00
|
|
|
func (m *Mail) Classify(db *bolt.DB, dir Maildir) (err error) {
|
|
|
|
|
|
|
|
m.New = true
|
|
|
|
|
|
|
|
err = m.Load(dir)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-05-10 03:45:11 +00:00
|
|
|
|
2017-05-13 22:34:54 +00:00
|
|
|
list, err := m.cleanWordlist()
|
2017-05-10 17:23:25 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2017-06-05 13:33:32 +00:00
|
|
|
junk, prob, err := Junk(db, list)
|
2017-05-10 03:45:11 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2017-06-05 13:33:32 +00:00
|
|
|
m.Junk = junk
|
|
|
|
|
2017-05-23 20:43:17 +00:00
|
|
|
log.WithFields(log.Fields{
|
2017-06-05 13:33:32 +00:00
|
|
|
"mail": m.Key,
|
|
|
|
"junk": m.Junk,
|
|
|
|
"probability": prob,
|
|
|
|
"dir": string(dir),
|
2017-05-23 20:43:17 +00:00
|
|
|
}).Info("Classified")
|
2017-05-10 03:45:11 +00:00
|
|
|
|
|
|
|
// Move mail around if junk.
|
|
|
|
if junk {
|
2018-01-11 20:35:27 +00:00
|
|
|
err = os.Rename(filepath.Join(string(dir), "new", m.Key), filepath.Join(string(dir), ".Junk", "cur", m.Key))
|
2017-05-10 03:45:11 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-05-23 20:43:17 +00:00
|
|
|
log.WithFields(log.Fields{
|
|
|
|
"mail": m.Key,
|
|
|
|
}).Info("Moved to Junk folder")
|
2017-05-10 03:45:11 +00:00
|
|
|
}
|
|
|
|
|
2017-09-16 22:56:17 +00:00
|
|
|
err = m.Unload(dir)
|
|
|
|
|
|
|
|
return err
|
2017-05-10 03:45:11 +00:00
|
|
|
}
|
|
|
|
|
2017-05-08 03:29:25 +00:00
|
|
|
// Junk returns true if the wordlist is classified as a junk mail using Bayes'
|
2017-05-13 21:22:23 +00:00
|
|
|
// rule. If required, it also returns the calculated probability of being junk,
|
|
|
|
// but this is typically not needed.
|
|
|
|
func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) {
|
2017-05-08 03:29:25 +00:00
|
|
|
var probabilities []float64
|
|
|
|
|
2017-06-05 13:33:32 +00:00
|
|
|
// initial value should be no junk
|
|
|
|
prob = 1.0
|
|
|
|
|
2017-05-08 03:29:25 +00:00
|
|
|
for _, val := range wordlist {
|
2017-05-28 20:53:43 +00:00
|
|
|
var p float64
|
|
|
|
p, err = classificationWord(db, val)
|
2017-05-08 03:29:25 +00:00
|
|
|
if err != nil {
|
2017-05-28 20:53:43 +00:00
|
|
|
return false, 0.0, err
|
2017-05-08 03:29:25 +00:00
|
|
|
}
|
|
|
|
probabilities = append(probabilities, p)
|
2017-03-19 20:54:23 +00:00
|
|
|
}
|
|
|
|
|
2017-05-27 20:32:23 +00:00
|
|
|
if len(probabilities) > 0 {
|
|
|
|
prob = stat.HarmonicMean(probabilities, nil)
|
2017-05-28 20:53:43 +00:00
|
|
|
}
|
|
|
|
if prob < 0.5 {
|
|
|
|
return true, (1 - prob), err
|
2017-03-19 20:54:23 +00:00
|
|
|
}
|
|
|
|
|
2017-05-28 20:53:43 +00:00
|
|
|
return false, (1 - prob), err
|
2017-03-19 20:54:23 +00:00
|
|
|
}
|