diff --git a/classify.go b/classify.go
index 9ce6e04..ffded99 100644
--- a/classify.go
+++ b/classify.go
@@ -1,8 +1,10 @@
 package sisyphus
 
 import (
+	"math/rand"
 	"os"
 	"path/filepath"
+	"time"
 
 	log "github.com/sirupsen/logrus"
 
@@ -202,6 +204,23 @@ func (m *Mail) Classify(db *bolt.DB, dir Maildir) (err error) {
 func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) {
 	var probabilities []float64
 
+	// Overly long mails are classified on a random sample of their words
+	// only. This prevents cheating by padding a junk mail with lots of
+	// good text.
+	const sampleSize = 50
+	if len(wordlist) > sampleSize {
+		rand.Seed(time.Now().UnixNano())
+
+		// rand.Perm yields every index exactly once, so each word
+		// (including the last one) can be drawn and the selection always
+		// terminates, even when the wordlist contains duplicates.
+		sample := make([]string, 0, sampleSize)
+		for _, i := range rand.Perm(len(wordlist))[:sampleSize] {
+			sample = append(sample, wordlist[i])
+		}
+		wordlist = sample
+	}
+
 	// initial value should be no junk
 	prob = 1.0
 
diff --git a/classify_test.go b/classify_test.go
index 3002310..90d4116 100644
--- a/classify_test.go
+++ b/classify_test.go
@@ -124,4 +124,97 @@ var _ = Describe("Classify Mails", func() {
 
 		})
 	})
+
+	Context("Only classify a random subset of the words in overly long mails", func() {
+		BeforeEach(func() {
+			// Load empty Maildir2
+			err = LoadMaildirs([]Maildir{
+				"test/Maildir2",
+			})
+			Ω(err).ShouldNot(HaveOccurred())
+
+			// Load db
+			dbs, err = LoadDatabases([]Maildir{
+				"test/Maildir2",
+			})
+			Ω(err).ShouldNot(HaveOccurred())
+
+		})
+		AfterEach(func() {
+			// Cleanup
+			CloseDatabases(dbs)
+
+			err = os.RemoveAll("test/Maildir2")
+			Ω(err).ShouldNot(HaveOccurred())
+		})
+
+		It("learned nothing and thus return always good", func() {
+
+			_, _, err := Junk(dbs["test/Maildir2"], []string{
+				"Carlo",
+				"0",
+				"1",
+				"2",
+				"3",
+				"4",
+				"5",
+				"6",
+				"7",
+				"8",
+				"9",
+				"10",
+				"11",
+				"12",
+				"13",
+				"14",
+				"15",
+				"16",
+				"17",
+				"18",
+				"19",
+				"20",
+				"21",
+				"22",
+				"23",
+				"24",
+				"25",
+				"26",
+				"27",
+				"28",
+				"29",
+				"30",
+				"31",
+				"32",
+				"33",
+				"34",
+				"35",
+				"36",
+				"37",
+				"38",
+				"39",
+				"40",
+				"41",
+				"42",
+				"43",
+				"44",
+				"45",
+				"46",
+				"47",
+				"48",
+				"49",
+				"50",
+				"51",
+				"52",
+				"53",
+				"54",
+				"55",
+				"56",
+				"57",
+				"58",
+				"59",
+			})
+
+			Ω(err).ShouldNot(HaveOccurred())
+		})
+	})
 })
diff --git a/mail.go b/mail.go
index 3489d4d..ec88157 100644
--- a/mail.go
+++ b/mail.go
@@ -225,18 +225,14 @@ func wordlist(s string) (l []string, err error) {
 		}
 	}
 
-	// only the first 200 words count
-	maxWords := int(math.Min(200, float64(len(clean))))
+	// only the first 1000 words count
+	maxWords := int(math.Min(1000, float64(len(clean))))
 	for i := 0; i < maxWords; i++ {
 		w := clean[i]
 		list[w]++
 	}
 
-	for word, count := range list {
-		if count > 10 {
-			continue
-		}
-
+	for word := range list {
 		l = append(l, word)
 	}
 