sisyphus/bayesian.go

/*
Part of this code is borrowed from github.com/jbrukh/bayesian, published under a BSD 3-Clause License.
*/

package sisyphus

import (
	"math"
	"strconv"

	"github.com/boltdb/bolt"
)

// classificationPriors returns the prior probabilities for the good (g) and
// junk (j) classes.
//
// Each prior is the share of that class's wordlist in the combined
// vocabulary: the key count of the Wordlists/Good (or Wordlists/Junk) bucket
// divided by the sum of both key counts. A missing bucket counts as empty.
//
// When the priors cannot be determined -- nothing has been learned yet, or
// the read transaction fails -- both results are NaN. The signature has no
// error return, so NaN is the only way to signal "undefined"; taking its
// logarithm keeps it NaN instead of silently producing a usable score.
func classificationPriors(db *bolt.DB) (g, j float64) {
	// Start out undefined so that a failed View is never mistaken for
	// priors of zero.
	g, j = math.NaN(), math.NaN()

	// The error is deliberately dropped: it cannot be returned from here and
	// the NaN results already mark the failure.
	_ = db.View(func(tx *bolt.Tx) error {
		wordlists := tx.Bucket([]byte("Wordlists"))
		if wordlists == nil {
			return nil
		}

		var gN, jN float64
		if good := wordlists.Bucket([]byte("Good")); good != nil {
			gN = float64(good.Stats().KeyN)
		}
		if junk := wordlists.Bucket([]byte("Junk")); junk != nil {
			jN = float64(junk.Stats().KeyN)
		}

		// With an empty vocabulary the ratio would be 0/0; leave the
		// results at NaN explicitly rather than by accident.
		total := gN + jN
		if total == 0 {
			return nil
		}

		g = gN / total
		j = jN / total

		return nil
	})

	return g, j
}

// classificationWordProb returns P(W|C) for both classes: g is the
// probability of seeing word in a good mail and j the probability of seeing
// it in a junk mail.
//
// The per-class counts for word are read from the Wordlists/Good and
// Wordlists/Junk buckets. The junk total is the value stored under the "Junk"
// key of Processed/Counters (presumably the number of junk mails processed so
// far -- confirm against the code that writes it), and the good total is the
// key count of Processed/Mails minus that junk total.
//
// A missing bucket, a missing key or an unparsable value counts as zero.
// Whenever a probability would be zero or undefined (word never seen in the
// class, or a class total that is not positive) a tiny positive constant is
// returned instead, so that callers can safely take the logarithm without
// ending up with -Inf or NaN.
func classificationWordProb(db *bolt.DB, word string) (g, j float64) {
	// unseenWordProb stands in for a zero or undefined probability. The
	// value mirrors the default probability used by
	// github.com/jbrukh/bayesian for words it has not seen.
	const unseenWordProb = 1e-11

	// count reads the decimal number stored under key in b.
	count := func(b *bolt.Bucket, key string) float64 {
		if b == nil {
			return 0
		}
		raw := b.Get([]byte(key))
		if raw == nil {
			// The key was never written: nothing counted yet.
			return 0
		}
		n, err := strconv.ParseFloat(string(raw), 64)
		if err != nil {
			// A corrupt counter carries no usable evidence.
			return 0
		}
		return n
	}

	// prob turns a count and a class total into a strictly positive
	// probability.
	prob := func(n, total float64) float64 {
		if n <= 0 || total <= 0 {
			return unseenWordProb
		}
		return n / total
	}

	// If the transaction cannot run at all, the word is treated as unseen in
	// both classes, which leaves the comparison between the classes neutral.
	g, j = unseenWordProb, unseenWordProb

	// The error is deliberately dropped: the signature has no way to report
	// it and the neutral defaults above already cover the failure.
	_ = db.View(func(tx *bolt.Tx) error {
		var goodN, junkN, junkTotal, mailTotal float64

		if wordlists := tx.Bucket([]byte("Wordlists")); wordlists != nil {
			goodN = count(wordlists.Bucket([]byte("Good")), word)
			junkN = count(wordlists.Bucket([]byte("Junk")), word)
		}

		if processed := tx.Bucket([]byte("Processed")); processed != nil {
			junkTotal = count(processed.Bucket([]byte("Counters")), "Junk")
			if mails := processed.Bucket([]byte("Mails")); mails != nil {
				mailTotal = float64(mails.Stats().KeyN)
			}
		}

		// Everything in Processed/Mails that is not counted as junk is
		// treated as good.
		g = prob(goodN, mailTotal-junkTotal)
		j = prob(junkN, junkTotal)

		return nil
	})

	return g, j
}

// LogScores computes log-likelihood-style scores for a mail, given the words
// it contains, and decides whether the mail is junk.
//
// scoreG and scoreJ are the scores for the good and the junk class. Each one
// is the logarithm of the class prior plus the sum of the logarithms of the
// per-word probabilities for that class. Because they are logs of
// probabilities the scores are normally negative; the larger score marks the
// more likely class.
//
// junk is true when scoreJ equals the maximum of the two scores, so a tie is
// reported as junk.
//
// Summing logarithms instead of multiplying the raw probabilities avoids
// floating-point underflow on long wordlists.
func LogScores(db *bolt.DB, wordlist []string) (scoreG, scoreJ float64, junk bool) {
	// Seed both accumulators with the log of the class priors.
	goodPrior, junkPrior := classificationPriors(db)
	scoreG, scoreJ = math.Log(goodPrior), math.Log(junkPrior)

	// Add every word's log-probability under each class.
	for i := range wordlist {
		pGood, pJunk := classificationWordProb(db, wordlist[i])
		scoreG += math.Log(pGood)
		scoreJ += math.Log(pJunk)
	}

	// Junk wins when its score is the maximum of the pair.
	junk = math.Max(scoreG, scoreJ) == scoreJ

	return scoreG, scoreJ, junk
}
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`/*`
			`Part of this code is borrowed from github.com/jbrukh/bayesian published under a BSD3CLAUSE License`
			`*/`

separate out command and package 2017-04-15 20:23:26 +00:00			`package sisyphus`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00
			`import (`
			`"math"`
			`"strconv"`

			`"github.com/boltdb/bolt"`
			`)`

			`// classificationPriors returns the prior probabilities for good and junk`
			`// classes.`
			`func classificationPriors(db *bolt.DB) (g, j float64) {`

			`db.View(func(tx *bolt.Tx) error {`
			`b := tx.Bucket([]byte("Wordlists"))`
			`good := b.Bucket([]byte("Good"))`
classify 2017-03-19 21:43:14 +00:00			`gN := float64(good.Stats().KeyN)`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`junk := b.Bucket([]byte("Junk"))`
classify 2017-03-19 21:43:14 +00:00			`jN := float64(junk.Stats().KeyN)`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00
classify 2017-03-19 21:43:14 +00:00			`g = gN / (gN + jN)`
			`j = jN / (gN + jN)`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00
			`return nil`
			`})`

			`return`
			`}`

			`// classificationWordProb returns P(W\|C_j) -- the probability of seeing`
			`// a particular word W in a document of this class.`
			`func classificationWordProb(db *bolt.DB, word string) (g, j float64) {`

			`db.View(func(tx *bolt.Tx) error {`
			`b := tx.Bucket([]byte("Wordlists"))`
			`good := b.Bucket([]byte("Good"))`
			`gNString := string(good.Get([]byte(word)))`
			`gN, _ := strconv.ParseFloat(gNString, 64)`
			`junk := b.Bucket([]byte("Junk"))`
			`jNString := string(junk.Get([]byte(word)))`
			`jN, _ := strconv.ParseFloat(jNString, 64)`

			`p := tx.Bucket([]byte("Processed"))`
			`counters := p.Bucket([]byte("Counters"))`
			`jString := string(counters.Get([]byte("Junk")))`
typo 2017-04-19 07:49:06 +00:00			`j, _ = strconv.ParseFloat(jString, 64)`
add a counter bucket to processed, move some stuff out of main, clean up mail, create bayesian updater 2017-03-19 20:54:23 +00:00			`mails := p.Bucket([]byte("Mails"))`
			`pN := mails.Stats().KeyN`

			`g = gN / (float64(pN) - j)`
			`j = jN / j`

			`return nil`
			`})`

			`return g, j`
			`}`

			`// LogScores produces "log-likelihood"-like scores that can`
			`// be used to classify documents into classes.`
			`//`
			`// The value of the score is proportional to the likelihood,`
			`// as determined by the classifier, that the given document`
			`// belongs to the given class. This is true even when scores`
			`// returned are negative, which they will be (since we are`
			`// taking logs of probabilities).`
			`//`
			`// The index j of the score corresponds to the class given`
			`// by c.Classes[j].`
			`//`
			`// Additionally returned are "inx" and "strict" values. The`
			`// inx corresponds to the maximum score in the array. If more`
			`// than one of the scores holds the maximum values, then`
			`// strict is false.`
			`//`
			`// Unlike c.Probabilities(), this function is not prone to`
			`// floating point underflow and is relatively safe to use.`
			`func LogScores(db *bolt.DB, wordlist []string) (scoreG, scoreJ float64, junk bool) {`

			`priorG, priorJ := classificationPriors(db)`

			`// calculate the scores`
			`scoreG = math.Log(priorG)`
			`scoreJ = math.Log(priorJ)`
			`for _, word := range wordlist {`
			`gP, jP := classificationWordProb(db, word)`
			`scoreG += math.Log(gP)`
			`scoreJ += math.Log(jP)`
			`}`

			`if scoreJ == math.Max(scoreG, scoreJ) {`
			`junk = true`
			`}`

			`return scoreG, scoreJ, junk`
			`}`