sisyphus/classify.go

package sisyphus

import (
	"os"
	"path/filepath"

	log "github.com/sirupsen/logrus"

	"github.com/boltdb/bolt"
	"github.com/gonum/stat"
	"github.com/retailnext/hllpp"
)

// classificationPrior computes the prior probability of the good class as the
// share of learned good mails among all learned mails. The prior of the junk
// class is the complement (1 - g).
//
// When no mails have been learned at all both totals are zero and the
// division yields NaN; callers have to cope with that value.
func classificationPrior(db *bolt.DB) (g float64, err error) {
	var good, junk float64

	if good, junk, err = classificationStatistics(db); err != nil {
		return 0, err
	}

	g = good / (good + junk)

	return g, nil
}

// classificationLikelihoodWordcounts reads the counters stored for word in the
// "Good" and "Junk" sub-buckets of the "Wordlists" bucket. Each stored value is
// a serialized hllpp sketch; its Count() is returned as gN (good) and jN
// (junk). A word without a stored value in a class counts as zero for that
// class.
//
// An error is returned when a sketch cannot be unmarshalled or when one of the
// buckets does not exist (bolt.ErrBucketNotFound). On error the returned
// counts must not be used.
func classificationLikelihoodWordcounts(db *bolt.DB, word string) (gN, jN float64, err error) {
	key := []byte(word)

	// count decodes one stored sketch into its cardinality; an empty value
	// means nothing has been recorded under the key.
	count := func(raw []byte) (float64, error) {
		if len(raw) == 0 {
			return 0, nil
		}
		sketch, uerr := hllpp.Unmarshal(raw)
		if uerr != nil {
			return 0, uerr
		}
		return float64(sketch.Count()), nil
	}

	err = db.View(func(tx *bolt.Tx) error {
		// Bucket lookups return nil for missing buckets; using such a nil
		// bucket would panic, so report the problem as an error instead.
		wordlists := tx.Bucket([]byte("Wordlists"))
		if wordlists == nil {
			return bolt.ErrBucketNotFound
		}
		good := wordlists.Bucket([]byte("Good"))
		if good == nil {
			return bolt.ErrBucketNotFound
		}
		junk := wordlists.Bucket([]byte("Junk"))
		if junk == nil {
			return bolt.ErrBucketNotFound
		}

		var cerr error
		if gN, cerr = count(good.Get(key)); cerr != nil {
			return cerr
		}
		jN, cerr = count(junk.Get(key))

		return cerr
	})

	return gN, jN, err
}

// classificationStatistics reads the global counters "ProcessedGood" and
// "ProcessedJunk" from the "Statistics" bucket. Each stored value is a
// serialized hllpp sketch; its Count() is returned as gTotal and jTotal. A
// missing value counts as zero.
//
// A warning is logged when either total is zero, because nothing has been
// learned for that class yet; this is not treated as an error.
//
// An error is returned when a sketch cannot be unmarshalled or when the
// "Statistics" bucket does not exist (bolt.ErrBucketNotFound). On error the
// returned totals must not be used.
func classificationStatistics(db *bolt.DB) (gTotal, jTotal float64, err error) {

	// count decodes one stored sketch into its cardinality; an empty value
	// means nothing has been recorded under the key.
	count := func(raw []byte) (float64, error) {
		if len(raw) == 0 {
			return 0, nil
		}
		sketch, uerr := hllpp.Unmarshal(raw)
		if uerr != nil {
			return 0, uerr
		}
		return float64(sketch.Count()), nil
	}

	err = db.View(func(tx *bolt.Tx) error {
		// tx.Bucket returns nil for a missing bucket; calling Get on it
		// would panic, so report the problem as an error instead.
		stats := tx.Bucket([]byte("Statistics"))
		if stats == nil {
			return bolt.ErrBucketNotFound
		}

		var cerr error
		if gTotal, cerr = count(stats.Get([]byte("ProcessedGood"))); cerr != nil {
			return cerr
		}
		if jTotal, cerr = count(stats.Get([]byte("ProcessedJunk"))); cerr != nil {
			return cerr
		}

		switch {
		case gTotal == 0 && jTotal == 0:
			log.Warning("no mails have yet been learned")
		case gTotal == 0:
			log.Warning("no good mails have yet been learned")
		case jTotal == 0:
			log.Warning("no junk mails have yet been learned")
		}

		return nil
	})

	return gTotal, jTotal, err
}

// classificationLikelihood estimates P(W|C) for both classes: g is the count
// stored for the word in the good class divided by the total of learned good
// mails, j is the same ratio for the junk class.
//
// No guard is applied to the divisions: if a class total is zero the
// corresponding result is NaN (0/0) or +Inf.
func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error) {

	goodHits, junkHits, err := classificationLikelihoodWordcounts(db, word)
	if err != nil {
		return 0, 0, err
	}

	goodMails, junkMails, err := classificationStatistics(db)
	if err != nil {
		return 0, 0, err
	}

	return goodHits / goodMails, junkHits / junkMails, nil
}

// classificationWord applies Bayes' rule to a single word and returns g, the
// posterior probability that a mail containing the word is good:
//
//	g = P(W|good)*P(good) / (P(W|good)*P(good) + P(W|junk)*P(junk))
//
// The result is NaN whenever the denominator is zero or one of the inputs is
// NaN, e.g. for a word that has no count in either class.
func classificationWord(db *bolt.DB, word string) (g float64, err error) {

	priorGood, err := classificationPrior(db)
	if err != nil {
		return g, err
	}

	likeGood, likeJunk, err := classificationLikelihood(db, word)
	if err != nil {
		return g, err
	}

	priorJunk := 1 - priorGood
	g = (likeGood * priorGood) / (likeGood*priorGood + likeJunk*priorJunk)

	return g, nil
}

// Classify handles a freshly arrived mail, i.e. one located in the "new"
// directory of the Maildir. The mail is loaded, its cleaned word list is
// scored by Junk and the verdict is stored in m.Junk and logged. A junk mail
// is renamed from "new" into ".Junk/cur"; any other mail stays where it is so
// that the mail client can pick it up. Finally the mail is unloaded again.
//
// The first error encountered is returned immediately; in that case the
// remaining steps (including the final unload) are not executed.
func (m *Mail) Classify(db *bolt.DB, dir Maildir) (err error) {

	m.New = true

	if err = m.Load(dir); err != nil {
		return err
	}

	words, err := m.cleanWordlist()
	if err != nil {
		return err
	}

	isJunk, probability, err := Junk(db, words)
	if err != nil {
		return err
	}
	m.Junk = isJunk

	base := string(dir)

	log.WithFields(log.Fields{
		"mail":        m.Key,
		"junk":        m.Junk,
		"probability": probability,
		"dir":         base,
	}).Info("Classified")

	// Only junk mails are relocated.
	if isJunk {
		from := filepath.Join(base, "new", m.Key)
		to := filepath.Join(base, ".Junk", "cur", m.Key)
		if err = os.Rename(from, to); err != nil {
			return err
		}
		log.WithFields(log.Fields{
			"mail": m.Key,
		}).Info("Moved to Junk folder")
	}

	return m.Unload(dir)
}

// Junk decides whether a mail, given as its list of words, is junk. For every
// word the probability of the mail being good is computed with Bayes' rule
// (classificationWord); the harmonic mean of these values is the overall
// probability of being good. The mail is junk when that mean is below 0.5.
//
// prob is the calculated probability of being junk (1 - mean); it is
// typically only needed for logging.
//
// Words whose probability is undefined (NaN, which classificationWord yields
// when its Bayes quotient is 0/0, e.g. for a word without a count in either
// class) carry no information and are skipped. Without this a single such
// word would turn the mean into NaN and the mail could never be classified as
// junk. If no word contributes, the mail is treated as good (prob = 0).
//
// On error junk is false and prob is 0.
func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) {
	probabilities := make([]float64, 0, len(wordlist))

	for _, word := range wordlist {
		var p float64
		p, err = classificationWord(db, word)
		if err != nil {
			return false, 0.0, err
		}
		// Both comparisons are false for NaN, so this keeps exactly the
		// well-defined probabilities.
		if p >= 0 && p <= 1 {
			probabilities = append(probabilities, p)
		}
	}

	// initial value should be no junk
	good := 1.0
	if len(probabilities) > 0 {
		good = stat.HarmonicMean(probabilities, nil)
	}

	return good < 0.5, 1 - good, nil
}
separate out command and package 2017-04-15 20:23:26 +00:00			`package sisyphus`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00
			`import (`
implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00			`"os"`
Use path/filepath for cleaner and safer path generation 2018-01-11 20:35:27 +00:00			`"path/filepath"`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00
replace log by logrus 2017-05-18 19:21:46 +00:00			`log "github.com/sirupsen/logrus"`

add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`"github.com/boltdb/bolt"`
many new things... 2017-05-08 03:29:25 +00:00			`"github.com/gonum/stat"`
			`"github.com/retailnext/hllpp"`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`)`

many new things... 2017-05-08 03:29:25 +00:00			`// classificationPrior returns the prior probabilities for good and junk`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`// classes.`
many new things... 2017-05-08 03:29:25 +00:00			`func classificationPrior(db *bolt.DB) (g float64, err error) {`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00
wrong prior calculation 2017-05-15 22:26:44 +00:00			`gTotal, jTotal, err := classificationStatistics(db)`
			`if err != nil {`
			`return g, err`
			`}`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00
wrong prior calculation 2017-05-15 22:26:44 +00:00			`return gTotal / (gTotal + jTotal), err`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`}`

make gometalinter happier 2017-05-13 22:34:54 +00:00			`// classificationLikelihoodWordcounts gets wordcounts from database to be used`
			`// in Likelihood calculation`
			`func classificationLikelihoodWordcounts(db *bolt.DB, word string) (gN, jN float64, err error) {`
many new things... 2017-05-08 03:29:25 +00:00
			`err = db.View(func(tx *bolt.Tx) error {`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`b := tx.Bucket([]byte("Wordlists"))`
many new things... 2017-05-08 03:29:25 +00:00
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`good := b.Bucket([]byte("Good"))`
many new things... 2017-05-08 03:29:25 +00:00			`gWordRaw := good.Get([]byte(word))`
add classify test and fix other tests 2017-05-13 21:22:23 +00:00			`if len(gWordRaw) > 0 {`
please gometalinter 2017-09-17 17:17:43 +00:00			`var gWordHLL *hllpp.HLLPP`
			`gWordHLL, err = hllpp.Unmarshal(gWordRaw)`
many new things... 2017-05-08 03:29:25 +00:00			`if err != nil {`
			`return err`
			`}`
make gometalinter happier 2017-05-13 22:34:54 +00:00			`gN = float64(gWordHLL.Count())`
many new things... 2017-05-08 03:29:25 +00:00			`}`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`junk := b.Bucket([]byte("Junk"))`
many new things... 2017-05-08 03:29:25 +00:00			`jWordRaw := junk.Get([]byte(word))`
add classify test and fix other tests 2017-05-13 21:22:23 +00:00			`if len(jWordRaw) > 0 {`
please gometalinter 2017-09-17 17:17:43 +00:00			`var jWordHLL *hllpp.HLLPP`
			`jWordHLL, err = hllpp.Unmarshal(jWordRaw)`
many new things... 2017-05-08 03:29:25 +00:00			`if err != nil {`
			`return err`
			`}`
make gometalinter happier 2017-05-13 22:34:54 +00:00			`jN = float64(jWordHLL.Count())`
many new things... 2017-05-08 03:29:25 +00:00			`}`

make gometalinter happier 2017-05-13 22:34:54 +00:00			`return nil`
			`})`

			`return gN, jN, err`
			`}`

wrong prior calculation 2017-05-15 22:26:44 +00:00			`// classificationStatistics gets global statistics from database to`
make gometalinter happier 2017-05-13 22:34:54 +00:00			`// be used in Likelihood calculation`
wrong prior calculation 2017-05-15 22:26:44 +00:00			`func classificationStatistics(db *bolt.DB) (gTotal, jTotal float64, err error) {`
make gometalinter happier 2017-05-13 22:34:54 +00:00
			`err = db.View(func(tx *bolt.Tx) error {`
many new things... 2017-05-08 03:29:25 +00:00			`p := tx.Bucket([]byte("Statistics"))`
add classify test and fix other tests 2017-05-13 21:22:23 +00:00			`gRaw := p.Get([]byte("ProcessedGood"))`
			`if len(gRaw) > 0 {`
please gometalinter 2017-09-17 17:17:43 +00:00			`var gHLL *hllpp.HLLPP`
			`gHLL, err = hllpp.Unmarshal(gRaw)`
add classify test and fix other tests 2017-05-13 21:22:23 +00:00			`if err != nil {`
			`return err`
			`}`
make gometalinter happier 2017-05-13 22:34:54 +00:00			`gTotal = float64(gHLL.Count())`
many new things... 2017-05-08 03:29:25 +00:00			`}`
add classify test and fix other tests 2017-05-13 21:22:23 +00:00			`jRaw := p.Get([]byte("ProcessedJunk"))`
			`if len(jRaw) > 0 {`
please gometalinter 2017-09-17 17:17:43 +00:00			`var jHLL *hllpp.HLLPP`
			`jHLL, err = hllpp.Unmarshal(jRaw)`
add classify test and fix other tests 2017-05-13 21:22:23 +00:00			`if err != nil {`
			`return err`
			`}`
make gometalinter happier 2017-05-13 22:34:54 +00:00			`jTotal = float64(jHLL.Count())`
many new things... 2017-05-08 03:29:25 +00:00			`}`

classify also if nothing have ever been learned 2017-05-28 20:53:43 +00:00			`if gTotal == 0 && jTotal == 0 {`
glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`log.Warning("no mails have yet been learned")`
			`return nil`
classify also if nothing have ever been learned 2017-05-28 20:53:43 +00:00			`}`
many new things... 2017-05-08 03:29:25 +00:00			`if gTotal == 0 {`
glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`log.Warning("no good mails have yet been learned")`
			`return nil`
many new things... 2017-05-08 03:29:25 +00:00			`}`
			`if jTotal == 0 {`
glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`log.Warning("no junk mails have yet been learned")`
			`return nil`
many new things... 2017-05-08 03:29:25 +00:00			`}`

add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`return nil`
			`})`

make gometalinter happier 2017-05-13 22:34:54 +00:00			`return gTotal, jTotal, err`
			`}`

			`// classificationLikelihood returns P(W\|C_j) -- the probability of seeing a`
			`// particular word W in a document of this class.`
			`func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error) {`

			`gN, jN, err := classificationLikelihoodWordcounts(db, word)`
			`if err != nil {`
			`return g, j, err`
			`}`

wrong prior calculation 2017-05-15 22:26:44 +00:00			`gTotal, jTotal, err := classificationStatistics(db)`
make gometalinter happier 2017-05-13 22:34:54 +00:00			`if err != nil {`
			`return g, j, err`
			`}`

			`g = gN / gTotal`
			`j = jN / jTotal`

fix some static code errors 2017-05-10 17:23:25 +00:00			`return g, j, err`
many new things... 2017-05-08 03:29:25 +00:00			`}`

			`// classificationWord produces the conditional probability of a word belonging`
			`// to good or junk using the classic Bayes' rule.`
			`func classificationWord(db *bolt.DB, word string) (g float64, err error) {`

			`priorG, err := classificationPrior(db)`
			`if err != nil {`
			`return g, err`
			`}`

			`likelihoodG, likelihoodJ, err := classificationLikelihood(db, word)`
			`if err != nil {`
			`return g, err`
			`}`

			`g = (likelihoodG * priorG) / (likelihoodGpriorG + likelihoodJ(1-priorG))`

			`return g, nil`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`}`

implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00			`// Classify analyses a new mail (a mail that arrived in the "new" directory),`
			`// decides whether it is junk and -- if so -- moves it to the Junk folder. If`
			`// it is not junk, the mail is untouched so it can be handled by the mail`
			`// client.`
glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`func (m Mail) Classify(db bolt.DB, dir Maildir) (err error) {`

			`m.New = true`

			`err = m.Load(dir)`
			`if err != nil {`
			`return err`
			`}`
implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00
make gometalinter happier 2017-05-13 22:34:54 +00:00			`list, err := m.cleanWordlist()`
fix some static code errors 2017-05-10 17:23:25 +00:00			`if err != nil {`
			`return err`
			`}`

glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`junk, prob, err := Junk(db, list)`
implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00			`if err != nil {`
			`return err`
			`}`

glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`m.Junk = junk`

use logrus properly (fixes #1) 2017-05-23 20:43:17 +00:00			`log.WithFields(log.Fields{`
glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`"mail": m.Key,`
			`"junk": m.Junk,`
			`"probability": prob,`
			`"dir": string(dir),`
use logrus properly (fixes #1) 2017-05-23 20:43:17 +00:00			`}).Info("Classified")`
implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00
			`// Move mail around if junk.`
			`if junk {`
Use path/filepath for cleaner and safer path generation 2018-01-11 20:35:27 +00:00			`err = os.Rename(filepath.Join(string(dir), "new", m.Key), filepath.Join(string(dir), ".Junk", "cur", m.Key))`
implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00			`if err != nil {`
			`return err`
			`}`
use logrus properly (fixes #1) 2017-05-23 20:43:17 +00:00			`log.WithFields(log.Fields{`
			`"mail": m.Key,`
			`}).Info("Moved to Junk folder")`
implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00			`}`

improve memory footprint 2017-09-16 22:56:17 +00:00			`err = m.Unload(dir)`

			`return err`
implement learning and classifying using Bayes' rule and Hyperloglog data structures -- still way to go though. 2017-05-10 03:45:11 +00:00			`}`

many new things... 2017-05-08 03:29:25 +00:00			`// Junk returns true if the wordlist is classified as a junk mail using Bayes'`
add classify test and fix other tests 2017-05-13 21:22:23 +00:00			`// rule. If required, it also returns the calculated probability of being junk,`
			`// but this is typically not needed.`
			`func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) {`
many new things... 2017-05-08 03:29:25 +00:00			`var probabilities []float64`

glide up. load mails correctly before learning or classifying them. improve some logging messages 2017-06-05 13:33:32 +00:00			`// initial value should be no junk`
			`prob = 1.0`

many new things... 2017-05-08 03:29:25 +00:00			`for _, val := range wordlist {`
classify also if nothing have ever been learned 2017-05-28 20:53:43 +00:00			`var p float64`
			`p, err = classificationWord(db, val)`
many new things... 2017-05-08 03:29:25 +00:00			`if err != nil {`
classify also if nothing have ever been learned 2017-05-28 20:53:43 +00:00			`return false, 0.0, err`
many new things... 2017-05-08 03:29:25 +00:00			`}`
			`probabilities = append(probabilities, p)`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`}`

Do not calculate harmonic mean if there is no probability -- otherwise it crashes 2017-05-27 20:32:23 +00:00			`if len(probabilities) > 0 {`
			`prob = stat.HarmonicMean(probabilities, nil)`
classify also if nothing have ever been learned 2017-05-28 20:53:43 +00:00			`}`
			`if prob < 0.5 {`
			`return true, (1 - prob), err`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`}`

classify also if nothing have ever been learned 2017-05-28 20:53:43 +00:00			`return false, (1 - prob), err`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`}`