produce word lists

master
Carlo Strub 7 years ago
parent 0a6934b3ba
commit eb24f37ae7

4
glide.lock generated

@ -1,5 +1,5 @@
hash: ec8efbbfd183cdfa97087fa1ff3e6b3a1b7eb77eb129d66473ffbc8c97db2238
updated: 2017-03-09T19:12:30.674604839Z
hash: 05d37c1ae8f818b05d066dce22d2e3fa625b8cce35ddfe4d746eaf49b11320b2
updated: 2017-03-10T19:22:40.441940049Z
imports:
- name: github.com/jbrukh/bayesian
version: bf3f261f9a9c61145c60d47665b0518cc32c774f

@ -1,8 +1,8 @@
package: github.com/carlostrub/sisyphus
import:
- package: github.com/jbrukh/bayesian
- package: github.com/luksen/maildir
- package: github.com/kennygrant/sanitize
- package: github.com/luksen/maildir
testImport:
- package: github.com/onsi/ginkgo
- package: github.com/onsi/gomega

@ -4,6 +4,7 @@ import (
"bufio"
"errors"
"mime/quotedprintable"
"regexp"
"strings"
"github.com/kennygrant/sanitize"
@ -112,7 +113,42 @@ func cleanString(i string) (s string, err error) {
return s, nil
}
// Clean prepares the mail's subject and body for training
// wordlistPattern matches tokens that consist solely of lowercase ASCII
// letters. It is compiled once at package initialisation rather than once per
// token inside wordlist.
var wordlistPattern = regexp.MustCompile(`^[a-z]+$`)

// wordlist takes a string of space separated text and returns the unique words
// found in it as a slice.
//
// A token is kept only if it is between 4 and 10 bytes long, consists solely
// of the letters a-z, and occurs at most 10 times in s. The order of the
// returned slice is unspecified because it is produced by ranging over a map;
// callers that need a stable order must sort the result. The returned error is
// always nil.
func wordlist(s string) (l []string, err error) {
	const (
		minLen   = 4  // shorter tokens are dropped
		maxLen   = 10 // longer tokens are dropped
		maxCount = 10 // tokens occurring more often than this are dropped
	)

	counts := make(map[string]int)
	for _, token := range strings.Split(s, " ") {
		// no long or too short words
		if n := len(token); n < minLen || n > maxLen {
			continue
		}

		// no numbers, special characters, etc. -- only words
		if !wordlistPattern.MatchString(token) {
			continue
		}

		counts[token]++
	}

	for word, count := range counts {
		if count > maxCount {
			continue
		}
		l = append(l, word)
	}

	return l, nil
}
// Clean cleans the mail's subject and body
func (m *Mail) Clean() error {
if m.Subject != nil {
s, err := cleanString(*m.Subject)
@ -132,6 +168,25 @@ func (m *Mail) Clean() error {
return nil
}
// Wordlists prepares the mail's subject and body for training by running each
// of them through wordlist. A nil Subject or Body is skipped and yields a nil
// slice for that part. The first error reported by wordlist is returned
// together with whatever has been computed up to that point.
func (m *Mail) Wordlists() (subject, body []string, err error) {
	if m.Subject != nil {
		if subject, err = wordlist(*m.Subject); err != nil {
			return subject, body, err
		}
	}

	if m.Body != nil {
		if body, err = wordlist(*m.Body); err != nil {
			return subject, body, err
		}
	}

	return subject, body, nil
}
// Load reads a mail's subject and body
func (m *Mail) Load(d string) error {

@ -1,13 +1,15 @@
package main_test
import (
"sort"
s "github.com/carlostrub/sisyphus"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
var _ = Describe("Main", func() {
var _ = Describe("Mail", func() {
Context("Maildir", func() {
It("Create a slice of mail keys", func() {
@ -262,5 +264,155 @@ var _ = Describe("Main", func() {
Junk: true,
}))
})
// Wordlist 1: load a mail flagged as junk from the .Junk test Maildir, clean
// it, and compare the word lists of its subject and body with the expected
// words.
It("Wordlist 1", func() {
m := s.Mail{
Key: "1488181583.M633084P4781.mail.carlostrub.ch,S=708375,W=720014:2,a",
Subject: nil,
Body: nil,
Junk: true,
}
err := m.Load("test/Maildir" + "/.Junk")
Ω(err).ShouldNot(HaveOccurred())
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
// Sort both lists so the comparison below does not depend on the order in
// which Wordlists returns the words.
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
Ω(subject).Should(Equal(
[]string{"confirm", "remittance"}))
Ω(body).Should(Equal(
[]string{"accuracy", "addressed", "admin", "alliance", "alone", "bank", "been", "belong", "best", "boltas", "cobantur", "computer", "confirm", "contained", "copy", "copying", "date", "deleted", "detail", "director", "entity", "excludes", "expressed", "files", "forwarding", "hereby", "individual", "intended", "kind", "known", "liability", "makes", "message", "notified", "opinions", "payment", "prohibited", "reception", "recipient", "reflect", "regards", "scanned", "sender", "should", "solely", "storage", "strictly", "such", "thanks", "that", "therein", "they", "this", "value", "viruses", "warranty", "whatsoever", "whom", "with"}))
})
// Wordlist 2: load a mail flagged as junk from the .Junk test Maildir, clean
// it, and compare the word lists of its subject and body with the expected
// words.
It("Wordlist 2", func() {
m := s.Mail{
Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
}
err := m.Load("test/Maildir" + "/.Junk")
Ω(err).ShouldNot(HaveOccurred())
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
// Sort both lists so the comparison below does not depend on the order in
// which Wordlists returns the words.
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
Ω(subject).Should(Equal(
[]string{"hello"}))
Ω(body).Should(Equal(
[]string{"best", "company", "dear", "distance", "employees", "from", "home", "interested", "kari", "large", "looking", "manager", "most", "name", "offer", "personnel", "please", "regards", "remotely", "salary", "site", "that", "this", "visit", "work", "working"}))
})
// Wordlist 3: load a mail flagged as junk from the .Junk test Maildir, clean
// it, and compare the word lists of its subject and body with the expected
// words.
It("Wordlist 3", func() {
m := s.Mail{
Key: "1488226337.M327824P8269.mail.carlostrub.ch,S=8044,W=8167:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
}
err := m.Load("test/Maildir" + "/.Junk")
Ω(err).ShouldNot(HaveOccurred())
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
// Sort both lists so the comparison below does not depend on the order in
// which Wordlists returns the words.
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
Ω(subject).Should(Equal(
[]string{"herpes", "medical", "shocks", "world"}))
Ω(body).Should(Equal(
[]string{"alongside", "anxiety", "appointed", "authority", "awarded", "bacteria", "beard", "been", "came", "capital", "causes", "city", "civilian", "club", "combated", "creams", "crown", "cure", "cured", "dark", "devalued", "domed", "doukas", "doux", "dreamstime", "drug", "drugs", "earlier", "emperor", "erly", "exclusive", "extracts", "fast", "february", "finally", "forked", "from", "full", "genital", "girl", "give", "golden", "governors", "guard", "have", "held", "herpes", "history", "image", "influence", "instituted", "john", "largesse", "little", "local", "manuscript", "many", "members", "mental", "mice", "military", "mostly", "nicaea", "notables", "only", "other", "people", "portrait", "prevent", "provincial", "rachael", "relief", "remove", "rettner", "sebastos", "secure", "senior", "size", "starting", "studies", "such", "suggest", "that", "theodore", "there", "these", "this", "times", "title", "titles", "today", "topical", "treatment", "treatments", "tzakones", "under", "unlike", "used", "vatatzes", "view", "virus", "wearing", "were", "will", "with", "writer", "your", "zonaras"}))
})
// Wordlist 4: load a mail flagged as junk from the .Junk test Maildir, clean
// it, and compare the word lists of its subject and body with the expected
// words.
It("Wordlist 4", func() {
m := s.Mail{
Key: "1488226337.M327825P8269.mail.carlostrub.ch,S=802286,W=812785:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
}
err := m.Load("test/Maildir" + "/.Junk")
Ω(err).ShouldNot(HaveOccurred())
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
// Sort both lists so the comparison below does not depend on the order in
// which Wordlists returns the words.
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
Ω(subject).Should(Equal(
[]string{"cosan", "friday", "march", "york"}))
Ω(body).Should(Equal(
[]string{"ampudia", "avenue", "below", "between", "briget", "call", "cannot", "closing", "cosan", "download", "email", "friday", "here", "hyatt", "image", "invitation", "level", "limited", "listed", "lunch", "march", "mercado", "novo", "nyse", "online", "onyx", "park", "please", "program", "rafferty", "register", "room", "rsvp", "rumo", "second", "street", "taylor", "view", "west", "york"}))
})
// Wordlist 5: load a mail flagged as junk from the .Junk test Maildir, clean
// it, and compare the word lists of its subject and body with the expected
// words.
It("Wordlist 5", func() {
m := s.Mail{
Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
}
err := m.Load("test/Maildir" + "/.Junk")
Ω(err).ShouldNot(HaveOccurred())
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
// Sort both lists so the comparison below does not depend on the order in
// which Wordlists returns the words.
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
Ω(subject).Should(Equal(
[]string{"eyes", "glasses", "headed", "serious", "trouble", "wear", "your"}))
Ω(body).Should(Equal(
[]string{"about", "associated", "aylesbury", "baron", "became", "being", "below", "brill", "bscribe", "buckingham", "building", "buildings", "built", "canada", "central", "clearing", "closure", "contacts", "converted", "despite", "discover", "duke", "email", "estate", "even", "extended", "ferdinand", "floor", "from", "full", "glasses", "goodness", "hour", "house", "improve", "improved", "initially", "junction", "know", "limited", "line", "link", "london", "manor", "marie", "marketing", "miles", "montreal", "near", "need", "next", "only", "other", "over", "ownership", "part", "passenger", "pictured", "place", "poor", "public", "quainton", "quality", "quebec", "railway", "renamed", "running", "self", "served", "short", "slow", "station", "success", "survive", "taken", "than", "that", "think", "today", "tramway", "trick", "unsu", "until", "very", "village", "ville", "vision", "wear", "weird", "were", "westcott", "will", "year", "youll", "your"}))
})
// Wordlist 6: load a mail flagged as junk from the .Junk test Maildir, clean
// it, and compare the word lists of its subject and body with the expected
// words.
It("Wordlist 6", func() {
m := s.Mail{
Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
}
err := m.Load("test/Maildir" + "/.Junk")
Ω(err).ShouldNot(HaveOccurred())
err = m.Clean()
Ω(err).ShouldNot(HaveOccurred())
subject, body, err := m.Wordlists()
// Sort both lists so the comparison below does not depend on the order in
// which Wordlists returns the words.
sort.Strings(subject)
sort.Strings(body)
Ω(err).ShouldNot(HaveOccurred())
Ω(subject).Should(Equal(
[]string{"always", "form", "good", "super", "viagra", "with"}))
Ω(body).Should(Equal(
[]string{"amazon", "antiviral", "blockquote", "blood", "body", "canada", "cant", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "table", "terry", "these", "this", "time", "trademark", "united", "various", "view", "when", "wish", "with", "your"}))
})
})
})

Loading…
Cancel
Save