if word list is too long, only take a random subsample (fixes #11)

master
Carlo Strub 6 years ago
parent de13cdbcd5
commit 54af109661
No known key found for this signature in database
GPG Key ID: 34EF3FF33C29811A

@ -1,8 +1,10 @@
package sisyphus
import (
"math/rand"
"os"
"path/filepath"
"time"
log "github.com/sirupsen/logrus"
@ -202,6 +204,26 @@ func (m *Mail) Classify(db *bolt.DB, dir Maildir) (err error) {
func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) {
var probabilities []float64
// If the wordlist is too long, let us only select a random sample
// for analysis. This prevents cheating by adding lots of good text
// to a Junk mail
if len(wordlist) > 50 {
wordlistTemp := make(map[string]interface{})
rand.Seed(time.Now().UnixNano())
for len(wordlistTemp) < 50 {
wordlistTemp[wordlist[rand.Intn(len(wordlist)-1)]] = nil
}
var wordlistTempSlice []string
for key, _ := range wordlistTemp {
wordlistTempSlice = append(wordlistTempSlice, key)
}
wordlist = wordlistTempSlice
}
// initial value should be no junk
prob = 1.0

@ -124,4 +124,97 @@ var _ = Describe("Classify Mails", func() {
})
})
// Calls Junk with a word list of 61 entries ("Carlo" plus the numbers "0"
// through "59") against the database of the test/Maildir2 fixture, which the
// setup comments below describe as empty.
Context("Only classify a random subset of the words in overly long mails", func() {
// Create the Maildir fixture and open its database before every spec.
BeforeEach(func() {
// Load empty Maildir2
err = LoadMaildirs([]Maildir{
"test/Maildir2",
})
Ω(err).ShouldNot(HaveOccurred())
// Load db
dbs, err = LoadDatabases([]Maildir{
"test/Maildir2",
})
Ω(err).ShouldNot(HaveOccurred())
})
// Close the databases and delete the fixture directory after every spec.
AfterEach(func() {
// Cleanup
CloseDatabases(dbs)
err = os.RemoveAll("test/Maildir2")
Ω(err).ShouldNot(HaveOccurred())
})
// NOTE(review): the junk flag and the probability returned by Junk are
// discarded; only the error is asserted. The "return always good" part of the
// description is therefore not verified by this spec — consider asserting on
// the first two return values as well.
It("learned nothing and thus return always good", func() {
_, _, err := Junk(dbs["test/Maildir2"], []string{
"Carlo",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30",
"31",
"32",
"33",
"34",
"35",
"36",
"37",
"38",
"39",
"40",
"41",
"42",
"43",
"44",
"45",
"46",
"47",
"48",
"49",
"50",
"51",
"52",
"53",
"54",
"55",
"56",
"57",
"58",
"59",
})
Ω(err).ShouldNot(HaveOccurred())
})
})
})

@ -225,18 +225,14 @@ func wordlist(s string) (l []string, err error) {
}
}
// only the first 200 words count
maxWords := int(math.Min(200, float64(len(clean))))
// only the first 1000 words count
maxWords := int(math.Min(1000, float64(len(clean))))
for i := 0; i < maxWords; i++ {
w := clean[i]
list[w]++
}
for word, count := range list {
if count > 10 {
continue
}
for word := range list {
l = append(l, word)
}

Loading…
Cancel
Save