Prevent duplicates (enough this time?)

pull/62/head
Aloïs Micard 3 years ago
parent 87b8615d2f
commit de4779724f
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -38,6 +38,10 @@ func GetApp() *cli.App {
Usage: "List of API users. (Format user:password)",
Required: false,
},
&cli.StringFlag{
Name: "refresh-delay",
Usage: "Duration before allowing indexation of existing resource (none = never)",
},
},
Action: execute,
}

@ -3,9 +3,11 @@ package api
import (
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/api/database"
"github.com/creekorful/trandoshan/internal/duration"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"time"
)
type service interface {
@ -16,8 +18,9 @@ type service interface {
}
type svc struct {
db database.Database
pub messaging.Publisher
db database.Database
pub messaging.Publisher
refreshDelay time.Duration
}
func newService(c *cli.Context) (service, error) {
@ -35,9 +38,12 @@ func newService(c *cli.Context) (service, error) {
return nil, err
}
refreshDelay := duration.ParseDuration(c.String("refresh-delay"))
return &svc{
db: db,
pub: pub,
db: db,
pub: pub,
refreshDelay: refreshDelay,
}, nil
}
@ -70,6 +76,34 @@ func (s *svc) searchResources(params *database.ResSearchParams) ([]api.ResourceD
func (s *svc) addResource(res api.ResourceDto) (api.ResourceDto, error) {
log.Debug().Str("url", res.URL).Msg("Saving resource")
// Hacky workaround to avoid adding duplicate resources.
// Even though the scheduler tries to prevent crawling duplicate URLs by applying a refresh period
// and checking whether the resource is already indexed, that check does not work if the URL was published
// before the resource was saved. And this happens a LOT.
// Therefore the best thing to do is to make the API check whether the resource should **really** be added by
// verifying that it isn't already present in the database. This may sound hacky, but it's the best solution I've come up with so far.
endDate := time.Time{}
if s.refreshDelay != -1 {
endDate = time.Now().Add(-s.refreshDelay)
}
count, err := s.db.CountResources(&database.ResSearchParams{
URL: res.URL,
EndDate: endDate,
PageSize: 1,
PageNumber: 1,
})
if err != nil {
log.Err(err).Msg("error while searching for resource")
return api.ResourceDto{}, nil
}
if count > 0 {
// Not an error
log.Debug().Str("url", res.URL).Msg("Skipping duplicate resource")
return res, nil
}
// Create Elasticsearch document
doc := database.ResourceIdx{
URL: res.URL,

@ -55,6 +55,12 @@ func TestAddResource(t *testing.T) {
dbMock := database_mock.NewMockDatabase(mockCtrl)
dbMock.EXPECT().CountResources(&searchParamsMatcher{target: database.ResSearchParams{
URL: "https://example.onion",
PageSize: 1,
PageNumber: 1,
}}).Return(int64(0), nil)
dbMock.EXPECT().AddResource(database.ResourceIdx{
URL: "https://example.onion",
Body: "TheBody",
@ -65,7 +71,7 @@ func TestAddResource(t *testing.T) {
Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
})
s := svc{db: dbMock}
s := svc{db: dbMock, refreshDelay: 5 * time.Hour}
res, err := s.addResource(api.ResourceDto{
URL: "https://example.onion",
@ -106,6 +112,63 @@ func TestAddResource(t *testing.T) {
}
}
// TestAddResourceDuplicateNotAllowed checks that, when no refresh delay is configured (refreshDelay == -1),
// addResource queries the database with a zero EndDate and, if a matching resource is already counted,
// reports no error and hands the submitted resource back.
// No AddResource expectation is registered on the mock, so an attempt to store the duplicate is expected
// to be reported by gomock as an unexpected call.
func TestAddResourceDuplicateNotAllowed(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	const url = "https://example.onion"

	dbMock := database_mock.NewMockDatabase(mockCtrl)
	dbMock.EXPECT().CountResources(&searchParamsMatcher{target: database.ResSearchParams{
		URL:        url,
		PageSize:   1,
		PageNumber: 1,
	}, endDateZero: true}).Return(int64(1), nil)

	s := svc{db: dbMock, refreshDelay: -1}

	res, err := s.addResource(api.ResourceDto{
		URL:         url,
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
		Headers:     map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
	})
	if err != nil {
		t.Fatalf("addResource() returned unexpected error: %v", err)
	}
	// A skipped duplicate is not an error: the submitted resource is returned as-is.
	if res.URL != url {
		t.Fatalf("addResource() returned URL %q, want %q", res.URL, url)
	}
}
// TestAddResourceTooYoung checks that, when a refresh delay is configured, addResource queries the database
// with a non-zero EndDate and, if a matching resource is already counted, reports no error and hands the
// submitted resource back.
// No AddResource expectation is registered on the mock, so an attempt to store the resource is expected
// to be reported by gomock as an unexpected call.
func TestAddResourceTooYoung(t *testing.T) {
	mockCtrl := gomock.NewController(t)
	defer mockCtrl.Finish()

	const url = "https://example.onion"

	// The service computes its cut-off as time.Now().Add(-refreshDelay), so the configured delay must be
	// positive for the cut-off to lie in the past (a negative delay would move it into the future).
	const refreshDelay = 10 * time.Minute

	dbMock := database_mock.NewMockDatabase(mockCtrl)
	dbMock.EXPECT().CountResources(&searchParamsMatcher{target: database.ResSearchParams{
		URL: url,
		// Informational only: the matcher checks whether EndDate is zero, not its exact value.
		EndDate:    time.Now().Add(-refreshDelay),
		PageSize:   1,
		PageNumber: 1,
	}}).Return(int64(1), nil)

	s := svc{db: dbMock, refreshDelay: refreshDelay}

	res, err := s.addResource(api.ResourceDto{
		URL:         url,
		Body:        "TheBody",
		Title:       "Example",
		Time:        time.Time{},
		Meta:        map[string]string{"content": "content-meta"},
		Description: "the description",
		Headers:     map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
	})
	if err != nil {
		t.Fatalf("addResource() returned unexpected error: %v", err)
	}
	// A skipped resource is not an error: the submitted resource is returned as-is.
	if res.URL != url {
		t.Fatalf("addResource() returned URL %q, want %q", res.URL, url)
	}
}
func TestScheduleURL(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
@ -120,3 +183,20 @@ func TestScheduleURL(t *testing.T) {
t.FailNow()
}
}
// searchParamsMatcher is a custom matcher that compares search params while ignoring the exact value
// of the time field: URL, PageSize and PageNumber are compared against target, and for EndDate only
// its zero-ness is compared against endDateZero.
// TODO: find a cleaner way to do this.
type searchParamsMatcher struct {
target database.ResSearchParams
endDateZero bool
}
// Matches reports whether x is a non-nil *database.ResSearchParams whose URL, PageSize and PageNumber
// equal those of the target, and whose EndDate is zero exactly when endDateZero is set.
// The precise EndDate value is not compared. Any other argument type (or a nil pointer) does not match,
// instead of panicking on the type assertion.
func (sm *searchParamsMatcher) Matches(x interface{}) bool {
	arg, ok := x.(*database.ResSearchParams)
	if !ok || arg == nil {
		return false
	}

	return arg.URL == sm.target.URL &&
		arg.PageSize == sm.target.PageSize &&
		arg.PageNumber == sm.target.PageNumber &&
		arg.EndDate.IsZero() == sm.endDateZero
}
// String returns a fixed, human-readable description of this matcher.
// NOTE(review): presumably surfaced by gomock in mismatch messages — confirm against the gomock Matcher docs.
func (sm *searchParamsMatcher) String() string {
return "is valid search params"
}

@ -0,0 +1,21 @@
package duration
import (
"github.com/xhit/go-str2duration/v2"
"time"
)
// ParseDuration parses the given textual duration into a time.Duration.
// It returns -1 when the input is empty or cannot be parsed.
func ParseDuration(duration string) time.Duration {
	if duration != "" {
		if parsed, err := str2duration.ParseDuration(duration); err == nil {
			return parsed
		}
	}

	// Empty or invalid input.
	return -1
}

@ -0,0 +1,24 @@
package duration
import (
"testing"
"time"
)
// TestParseDuration verifies that ParseDuration returns -1 for an empty input and the matching
// time.Duration for inputs expressed in seconds, minutes, hours and days.
func TestParseDuration(t *testing.T) {
	tests := []struct {
		input string
		want  time.Duration
	}{
		{input: "", want: -1},
		{input: "50s", want: 50 * time.Second},
		{input: "50m", want: 50 * time.Minute},
		{input: "50h", want: 50 * time.Hour},
		{input: "50d", want: 50 * 24 * time.Hour},
	}

	for _, tt := range tests {
		// Errorf (not Fatalf) so that every failing case is reported in a single run.
		if got := ParseDuration(tt.input); got != tt.want {
			t.Errorf("ParseDuration(%q) = %v, want %v", tt.input, got, tt.want)
		}
	}
}

@ -3,12 +3,12 @@ package scheduler
import (
"fmt"
"github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/duration"
"github.com/creekorful/trandoshan/internal/logging"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"github.com/xhit/go-str2duration/v2"
"io"
"net/url"
"strings"
@ -42,7 +42,7 @@ func GetApp() *cli.App {
func execute(ctx *cli.Context) error {
logging.ConfigureLogger(ctx)
refreshDelay := parseRefreshDelay(ctx.String("refresh-delay"))
refreshDelay := duration.ParseDuration(ctx.String("refresh-delay"))
log.Info().
Str("ver", ctx.App.Version).
@ -134,16 +134,3 @@ func handleMessage(apiClient api.Client, refreshDelay time.Duration, forbiddenEx
return nil
}
}
// parseRefreshDelay converts the textual refresh delay into a time.Duration.
// It returns -1 when the input is empty or cannot be parsed.
func parseRefreshDelay(delay string) time.Duration {
	const fallback time.Duration = -1

	if delay == "" {
		return fallback
	}

	parsed, err := str2duration.ParseDuration(delay)
	if err != nil {
		return fallback
	}
	return parsed
}

@ -12,24 +12,6 @@ import (
"time"
)
// TestParseRefreshDelay verifies that parseRefreshDelay returns -1 for an empty input and the matching
// time.Duration for inputs expressed in seconds, minutes, hours and days.
func TestParseRefreshDelay(t *testing.T) {
	expectations := []struct {
		input string
		want  time.Duration
	}{
		{"", -1},
		{"50s", time.Second * 50},
		{"50m", time.Minute * 50},
		{"50h", time.Hour * 50},
		{"50d", time.Hour * 24 * 50},
	}

	for _, e := range expectations {
		if parseRefreshDelay(e.input) != e.want {
			t.Fail()
		}
	}
}
func TestHandleMessageNotOnion(t *testing.T) {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()

Loading…
Cancel
Save