Move url extraction logic to extractor

pull/24/head
Aloïs Micard 4 years ago
parent f2b8984356
commit 6081a6a7c2
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -17,6 +17,10 @@ import (
"time" "time"
) )
// resourcesIndex is the name of the elasticsearch index that the handlers in
// this file pass to Index(...) both when searching for resources and when
// indexing a new one, so the two operations always target the same index.
var (
resourcesIndex = "resources"
)
// Represent a resource in elasticsearch // Represent a resource in elasticsearch
type resourceIndex struct { type resourceIndex struct {
URL string `json:"url"` URL string `json:"url"`
@ -100,7 +104,7 @@ func searchResources(es *elastic.Client) echo.HandlerFunc {
// Perform the search request. // Perform the search request.
query := elastic.NewMatchQuery("url", string(b)) query := elastic.NewMatchQuery("url", string(b))
res, err := es.Search(). res, err := es.Search().
Index("resource"). Index(resourcesIndex).
Query(query). Query(query).
Do(context.Background()) Do(context.Background())
if err != nil { if err != nil {
@ -141,7 +145,7 @@ func addResource(es *elastic.Client) echo.HandlerFunc {
} }
_, err := es.Index(). _, err := es.Index().
Index("resources"). Index(resourcesIndex).
BodyJson(doc). BodyJson(doc).
Do(context.Background()) Do(context.Background())
if err != nil { if err != nil {

@ -1,6 +1,8 @@
package extractor package extractor
import ( import (
"fmt"
"github.com/PuerkitoBio/purell"
"github.com/creekorful/trandoshan/api" "github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/messaging" "github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util/logging" "github.com/creekorful/trandoshan/internal/util/logging"
@ -121,7 +123,21 @@ func extractResource(msg messaging.NewResourceMsg) (api.ResourceDto, []string, e
// Extract URLs // Extract URLs
xu := xurls.Strict() xu := xurls.Strict()
return resDto, xu.FindAllString(msg.Body, -1), nil
// Sanitize URLs
urls := xu.FindAllString(msg.Body, -1)
var normalizedURLS []string
for _, url := range urls {
normalizedURL, err := normalizeURL(url)
if err != nil {
continue
}
normalizedURLS = append(normalizedURLS, normalizedURL)
}
return resDto, normalizedURLS, nil
} }
// extract title from html body // extract title from html body
@ -138,3 +154,13 @@ func extractTitle(body string) string {
return body[startPos:endPos] return body[startPos:endPos]
} }
// normalizeURL returns the normalized form of the raw URL string u.
//
// Normalization is delegated to purell using the FlagsUsuallySafeGreedy set,
// extended with directory-index removal, fragment removal and duplicate-slash
// removal. On failure an empty string is returned together with an error that
// mentions the offending URL.
func normalizeURL(u string) (string, error) {
	flags := purell.FlagsUsuallySafeGreedy |
		purell.FlagRemoveDirectoryIndex |
		purell.FlagRemoveFragment |
		purell.FlagRemoveDuplicateSlashes

	cleaned, err := purell.NormalizeURLString(u, flags)
	if err == nil {
		return cleaned, nil
	}

	return "", fmt.Errorf("error while normalizing URL %s: %s", u, err)
}

@ -8,10 +8,10 @@ import (
func TestExtractResource(t *testing.T) { func TestExtractResource(t *testing.T) {
msg := messaging.NewResourceMsg{ msg := messaging.NewResourceMsg{
URL: "https://example.org/300", URL: "https://example.org/300",
Body: "<title>Creekorful Inc</title>This is sparta", Body: "<title>Creekorful Inc</title>This is sparta<a href\"https://google.com/test?test=test#12\"",
} }
resDto, _, err := extractResource(msg) resDto, urls, err := extractResource(msg)
if err != nil { if err != nil {
t.FailNow() t.FailNow()
} }
@ -25,6 +25,13 @@ func TestExtractResource(t *testing.T) {
if resDto.Body != msg.Body { if resDto.Body != msg.Body {
t.Fail() t.Fail()
} }
if len(urls) == 0 {
t.FailNow()
}
if urls[0] != "https://google.com/test?test=test" {
t.Fail()
}
} }
func TestExtractTitle(t *testing.T) { func TestExtractTitle(t *testing.T) {
@ -38,3 +45,14 @@ func TestExtractTitle(t *testing.T) {
t.Errorf("No matches should have been returned") t.Errorf("No matches should have been returned")
} }
} }
// TestNormalizeURL verifies that normalizeURL keeps the query string of a URL
// while dropping its fragment.
func TestNormalizeURL(t *testing.T) {
	const (
		input = "https://this-is-sparta.de?url=url-query-param#fragment-23"
		want  = "https://this-is-sparta.de?url=url-query-param"
	)

	got, err := normalizeURL(input)
	if err != nil {
		// Abort immediately: without a result there is nothing left to compare.
		t.Fatalf("normalizeURL(%q) returned unexpected error: %s", input, err)
	}

	if got != want {
		t.Errorf("normalizeURL(%q) = %q, want %q", input, got, want)
	}
}

@ -3,7 +3,6 @@ package scheduler
import ( import (
"encoding/base64" "encoding/base64"
"fmt" "fmt"
"github.com/PuerkitoBio/purell"
"github.com/creekorful/trandoshan/api" "github.com/creekorful/trandoshan/api"
"github.com/creekorful/trandoshan/internal/messaging" "github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util/logging" "github.com/creekorful/trandoshan/internal/util/logging"
@ -74,19 +73,20 @@ func handleMessage(apiClient api.Client) natsutil.MsgHandler {
} }
log.Debug().Str("url", urlMsg.URL).Msg("Processing URL: %s") log.Debug().Str("url", urlMsg.URL).Msg("Processing URL: %s")
normalizedURL, err := normalizeURL(urlMsg.URL)
u, err := url.Parse(urlMsg.URL)
if err != nil { if err != nil {
log.Err(err).Msg("Error while normalizing URL") log.Err(err).Msg("Error while parsing URL")
return err return err
} }
// Make sure URL is valid .onion // Make sure URL is valid .onion
if !strings.Contains(normalizedURL.Host, ".onion") { if !strings.Contains(u.Host, ".onion") {
log.Debug().Stringer("url", normalizedURL).Msg("URL is not a valid hidden service") log.Debug().Stringer("url", u).Msg("URL is not a valid hidden service")
return err return err
} }
b64URI := base64.URLEncoding.EncodeToString([]byte(normalizedURL.String())) b64URI := base64.URLEncoding.EncodeToString([]byte(u.String()))
urls, err := apiClient.SearchResources(b64URI) urls, err := apiClient.SearchResources(b64URI)
if err != nil { if err != nil {
log.Err(err).Msg("Error while searching URL") log.Err(err).Msg("Error while searching URL")
@ -95,29 +95,14 @@ func handleMessage(apiClient api.Client) natsutil.MsgHandler {
// No matches: schedule! // No matches: schedule!
if len(urls) == 0 { if len(urls) == 0 {
log.Debug().Stringer("url", normalizedURL).Msg("URL should be scheduled") log.Debug().Stringer("url", u).Msg("URL should be scheduled")
if err := natsutil.PublishMsg(nc, &messaging.URLTodoMsg{URL: urlMsg.URL}); err != nil { if err := natsutil.PublishMsg(nc, &messaging.URLTodoMsg{URL: urlMsg.URL}); err != nil {
return fmt.Errorf("error while publishing URL: %s", err) return fmt.Errorf("error while publishing URL: %s", err)
} }
} else { } else {
log.Trace().Stringer("url", normalizedURL).Msg("URL should not be scheduled") log.Trace().Stringer("url", u).Msg("URL should not be scheduled")
} }
return nil return nil
} }
} }
// normalizeURL normalizes the raw URL string u with purell and parses the
// normalized string into a *url.URL.
//
// The purell flags used are FlagsUsuallySafeGreedy plus directory-index,
// fragment and duplicate-slash removal. A nil URL and a descriptive error are
// returned when either the normalization or the subsequent parsing fails.
func normalizeURL(u string) (*url.URL, error) {
	flags := purell.FlagsUsuallySafeGreedy |
		purell.FlagRemoveDirectoryIndex |
		purell.FlagRemoveFragment |
		purell.FlagRemoveDuplicateSlashes

	cleaned, err := purell.NormalizeURLString(u, flags)
	if err != nil {
		return nil, fmt.Errorf("error while normalizing URL %s: %s", u, err)
	}

	parsed, err := url.Parse(cleaned)
	if err != nil {
		return nil, fmt.Errorf("error while parsing URL: %s", err)
	}

	return parsed, nil
}

@ -1,14 +1 @@
package scheduler package scheduler
import "testing"
// TestNormalizeURL checks that normalizeURL preserves the query string of a
// URL and strips its fragment, comparing against the string form of the
// returned URL.
func TestNormalizeURL(t *testing.T) {
	normalized, err := normalizeURL("https://this-is-sparta.de?url=url-query-param#fragment-23")
	if err != nil {
		t.FailNow()
	}

	if got := normalized.String(); got != "https://this-is-sparta.de?url=url-query-param" {
		t.Fail()
	}
}

Loading…
Cancel
Save