mirror of
https://github.com/creekorful/bathyscaphe
synced 2024-11-16 00:12:56 +00:00
Merge remote-tracking branch 'origin/develop' into rabbitmq-refactoring
This commit is contained in:
commit
db983c584b
@ -31,6 +31,7 @@ type ResourceDto struct {
|
|||||||
Title string `json:"title"`
|
Title string `json:"title"`
|
||||||
Meta map[string]string `json:"meta"`
|
Meta map[string]string `json:"meta"`
|
||||||
Description string `json:"description"`
|
Description string `json:"description"`
|
||||||
|
Headers map[string]string `json:"headers"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// CredentialsDto represent the credential when logging in the API
|
// CredentialsDto represent the credential when logging in the API
|
||||||
|
1
go.sum
1
go.sum
@ -29,6 +29,7 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb
|
|||||||
github.com/golang/mock v1.4.4 h1:l75CXGRSwbaYNpl/Z2X1XIIAMSCquvXgpVZDhwEIJsc=
|
github.com/golang/mock v1.4.4 h1:l75CXGRSwbaYNpl/Z2X1XIIAMSCquvXgpVZDhwEIJsc=
|
||||||
github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4=
|
github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4=
|
||||||
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||||
|
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
|
||||||
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
|
||||||
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
|
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
|
||||||
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=
|
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=
|
||||||
|
@ -20,6 +20,7 @@ type ResourceIdx struct {
|
|||||||
Title string `json:"title"`
|
Title string `json:"title"`
|
||||||
Meta map[string]string `json:"meta"`
|
Meta map[string]string `json:"meta"`
|
||||||
Description string `json:"description"`
|
Description string `json:"description"`
|
||||||
|
Headers map[string]string `json:"headers"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// ResSearchParams is the search params used
|
// ResSearchParams is the search params used
|
||||||
@ -32,6 +33,7 @@ type ResSearchParams struct {
|
|||||||
PageSize int
|
PageSize int
|
||||||
PageNumber int
|
PageNumber int
|
||||||
// TODO allow searching by meta
|
// TODO allow searching by meta
|
||||||
|
// TODO allow searching by headers
|
||||||
}
|
}
|
||||||
|
|
||||||
// Database is the interface used to abstract communication
|
// Database is the interface used to abstract communication
|
||||||
|
@ -78,6 +78,7 @@ func (s *svc) addResource(res api.ResourceDto) (api.ResourceDto, error) {
|
|||||||
Title: res.Title,
|
Title: res.Title,
|
||||||
Meta: res.Meta,
|
Meta: res.Meta,
|
||||||
Description: res.Description,
|
Description: res.Description,
|
||||||
|
Headers: res.Headers,
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := s.db.AddResource(doc); err != nil {
|
if err := s.db.AddResource(doc); err != nil {
|
||||||
|
@ -62,6 +62,7 @@ func TestAddResource(t *testing.T) {
|
|||||||
Time: time.Time{},
|
Time: time.Time{},
|
||||||
Meta: map[string]string{"content": "content-meta"},
|
Meta: map[string]string{"content": "content-meta"},
|
||||||
Description: "the description",
|
Description: "the description",
|
||||||
|
Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
|
||||||
})
|
})
|
||||||
|
|
||||||
s := svc{db: dbMock}
|
s := svc{db: dbMock}
|
||||||
@ -73,6 +74,7 @@ func TestAddResource(t *testing.T) {
|
|||||||
Time: time.Time{},
|
Time: time.Time{},
|
||||||
Meta: map[string]string{"content": "content-meta"},
|
Meta: map[string]string{"content": "content-meta"},
|
||||||
Description: "the description",
|
Description: "the description",
|
||||||
|
Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"},
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.FailNow()
|
t.FailNow()
|
||||||
@ -96,6 +98,12 @@ func TestAddResource(t *testing.T) {
|
|||||||
if res.Description != "the description" {
|
if res.Description != "the description" {
|
||||||
t.FailNow()
|
t.FailNow()
|
||||||
}
|
}
|
||||||
|
if res.Headers["Content-Type"] != "application/html" {
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
if res.Headers["Server"] != "Traefik" {
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestScheduleURL(t *testing.T) {
|
func TestScheduleURL(t *testing.T) {
|
||||||
|
@ -3,6 +3,7 @@ package crawler
|
|||||||
import (
|
import (
|
||||||
"crypto/tls"
|
"crypto/tls"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/creekorful/trandoshan/internal/http"
|
||||||
"github.com/creekorful/trandoshan/internal/logging"
|
"github.com/creekorful/trandoshan/internal/logging"
|
||||||
"github.com/creekorful/trandoshan/internal/messaging"
|
"github.com/creekorful/trandoshan/internal/messaging"
|
||||||
"github.com/creekorful/trandoshan/internal/util"
|
"github.com/creekorful/trandoshan/internal/util"
|
||||||
@ -11,6 +12,7 @@ import (
|
|||||||
"github.com/valyala/fasthttp"
|
"github.com/valyala/fasthttp"
|
||||||
"github.com/valyala/fasthttp/fasthttpproxy"
|
"github.com/valyala/fasthttp/fasthttpproxy"
|
||||||
"io"
|
"io"
|
||||||
|
"io/ioutil"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@ -57,7 +59,7 @@ func execute(ctx *cli.Context) error {
|
|||||||
Msg("Starting tdsh-crawler")
|
Msg("Starting tdsh-crawler")
|
||||||
|
|
||||||
// Create the HTTP client
|
// Create the HTTP client
|
||||||
httpClient := &fasthttp.Client{
|
httpClient := http.NewFastHTTPClient(&fasthttp.Client{
|
||||||
// Use given TOR proxy to reach the hidden services
|
// Use given TOR proxy to reach the hidden services
|
||||||
Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")),
|
Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")),
|
||||||
// Disable SSL verification since we do not really care about this
|
// Disable SSL verification since we do not really care about this
|
||||||
@ -65,7 +67,7 @@ func execute(ctx *cli.Context) error {
|
|||||||
ReadTimeout: time.Second * 5,
|
ReadTimeout: time.Second * 5,
|
||||||
WriteTimeout: time.Second * 5,
|
WriteTimeout: time.Second * 5,
|
||||||
Name: ctx.String("user-agent"),
|
Name: ctx.String("user-agent"),
|
||||||
}
|
})
|
||||||
|
|
||||||
// Create the subscriber
|
// Create the subscriber
|
||||||
sub, err := messaging.NewSubscriber(ctx.String("event-srv-uri"))
|
sub, err := messaging.NewSubscriber(ctx.String("event-srv-uri"))
|
||||||
@ -84,14 +86,14 @@ func execute(ctx *cli.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) messaging.MsgHandler {
|
func handleMessage(httpClient http.Client, allowedContentTypes []string) messaging.MsgHandler {
|
||||||
return func(sub messaging.Subscriber, msg io.Reader) error {
|
return func(sub messaging.Subscriber, msg io.Reader) error {
|
||||||
var urlMsg messaging.URLTodoMsg
|
var urlMsg messaging.URLTodoMsg
|
||||||
if err := sub.ReadMsg(msg, &urlMsg); err != nil {
|
if err := sub.ReadMsg(msg, &urlMsg); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
body, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes)
|
body, headers, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error while crawling URL: %s", err)
|
return fmt.Errorf("error while crawling URL: %s", err)
|
||||||
}
|
}
|
||||||
@ -100,6 +102,7 @@ func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) me
|
|||||||
res := messaging.NewResourceMsg{
|
res := messaging.NewResourceMsg{
|
||||||
URL: urlMsg.URL,
|
URL: urlMsg.URL,
|
||||||
Body: body,
|
Body: body,
|
||||||
|
Headers: headers,
|
||||||
}
|
}
|
||||||
if err := sub.PublishMsg(&res); err != nil {
|
if err := sub.PublishMsg(&res); err != nil {
|
||||||
return fmt.Errorf("error while publishing resource: %s", err)
|
return fmt.Errorf("error while publishing resource: %s", err)
|
||||||
@ -109,34 +112,17 @@ func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) me
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []string) (string, error) {
|
func crawURL(httpClient http.Client, url string, allowedContentTypes []string) (string, map[string]string, error) {
|
||||||
log.Debug().Str("url", url).Msg("Processing URL")
|
log.Debug().Str("url", url).Msg("Processing URL")
|
||||||
|
|
||||||
// Query the website
|
r, err := httpClient.Get(url)
|
||||||
req := fasthttp.AcquireRequest()
|
if err != nil {
|
||||||
resp := fasthttp.AcquireResponse()
|
return "", nil, err
|
||||||
defer fasthttp.ReleaseRequest(req)
|
|
||||||
defer fasthttp.ReleaseResponse(resp)
|
|
||||||
|
|
||||||
req.SetRequestURI(url)
|
|
||||||
|
|
||||||
if err := httpClient.Do(req, resp); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
switch code := resp.StatusCode(); {
|
|
||||||
case code > 302:
|
|
||||||
return "", fmt.Errorf("non-managed error code %d", code)
|
|
||||||
// follow redirect
|
|
||||||
case code == 301 || code == 302:
|
|
||||||
if location := string(resp.Header.Peek("Location")); location != "" {
|
|
||||||
return crawURL(httpClient, location, allowedContentTypes)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determinate if content type is allowed
|
// Determinate if content type is allowed
|
||||||
allowed := false
|
allowed := false
|
||||||
contentType := string(resp.Header.Peek("Content-Type"))
|
contentType := r.Headers()["Content-Type"]
|
||||||
for _, allowedContentType := range allowedContentTypes {
|
for _, allowedContentType := range allowedContentTypes {
|
||||||
if strings.Contains(contentType, allowedContentType) {
|
if strings.Contains(contentType, allowedContentType) {
|
||||||
allowed = true
|
allowed = true
|
||||||
@ -146,8 +132,13 @@ func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []stri
|
|||||||
|
|
||||||
if !allowed {
|
if !allowed {
|
||||||
err := fmt.Errorf("forbidden content type : %s", contentType)
|
err := fmt.Errorf("forbidden content type : %s", contentType)
|
||||||
return "", err
|
return "", nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return string(resp.Body()), nil
|
// Read body
|
||||||
|
b, err := ioutil.ReadAll(r.Body())
|
||||||
|
if err != nil {
|
||||||
|
return "", nil, err
|
||||||
|
}
|
||||||
|
return string(b), r.Headers(), nil
|
||||||
}
|
}
|
||||||
|
@ -1 +1,118 @@
|
|||||||
package crawler
|
package crawler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"github.com/creekorful/trandoshan/internal/http_mock"
|
||||||
|
"github.com/creekorful/trandoshan/internal/messaging"
|
||||||
|
"github.com/creekorful/trandoshan/internal/messaging_mock"
|
||||||
|
"github.com/golang/mock/gomock"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCrawlURLForbiddenContentType(t *testing.T) {
|
||||||
|
mockCtrl := gomock.NewController(t)
|
||||||
|
defer mockCtrl.Finish()
|
||||||
|
|
||||||
|
httpClientMock := http_mock.NewMockClient(mockCtrl)
|
||||||
|
url := "https://example.onion"
|
||||||
|
allowedContentTypes := []string{"text/plain"}
|
||||||
|
|
||||||
|
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
|
||||||
|
httpResponseMock.EXPECT().Headers().Return(map[string]string{"Content-Type": "image/png"})
|
||||||
|
|
||||||
|
httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)
|
||||||
|
|
||||||
|
body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
|
||||||
|
if body != "" || headers != nil || err == nil {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCrawlURLSameContentType(t *testing.T) {
|
||||||
|
mockCtrl := gomock.NewController(t)
|
||||||
|
defer mockCtrl.Finish()
|
||||||
|
|
||||||
|
httpClientMock := http_mock.NewMockClient(mockCtrl)
|
||||||
|
url := "https://example.onion"
|
||||||
|
allowedContentTypes := []string{"text/plain"}
|
||||||
|
|
||||||
|
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
|
||||||
|
httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"})
|
||||||
|
httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))
|
||||||
|
|
||||||
|
httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)
|
||||||
|
|
||||||
|
body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
|
||||||
|
if err != nil {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if body != "Hello" {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if len(headers) != 1 {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if headers["Content-Type"] != "text/plain" {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCrawlURLNoContentTypeFiltering(t *testing.T) {
|
||||||
|
mockCtrl := gomock.NewController(t)
|
||||||
|
defer mockCtrl.Finish()
|
||||||
|
|
||||||
|
httpClientMock := http_mock.NewMockClient(mockCtrl)
|
||||||
|
url := "https://example.onion"
|
||||||
|
allowedContentTypes := []string{""}
|
||||||
|
|
||||||
|
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
|
||||||
|
httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"})
|
||||||
|
httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))
|
||||||
|
|
||||||
|
httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil)
|
||||||
|
|
||||||
|
body, headers, err := crawURL(httpClientMock, url, allowedContentTypes)
|
||||||
|
if err != nil {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if body != "Hello" {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if len(headers) != 1 {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
if headers["Content-Type"] != "text/plain" {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleMessage(t *testing.T) {
|
||||||
|
mockCtrl := gomock.NewController(t)
|
||||||
|
defer mockCtrl.Finish()
|
||||||
|
|
||||||
|
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
|
||||||
|
httpClientMock := http_mock.NewMockClient(mockCtrl)
|
||||||
|
httpResponseMock := http_mock.NewMockResponse(mockCtrl)
|
||||||
|
|
||||||
|
msg := bytes.NewReader(nil)
|
||||||
|
subscriberMock.EXPECT().
|
||||||
|
ReadMsg(msg, &messaging.URLTodoMsg{}).
|
||||||
|
SetArg(1, messaging.URLTodoMsg{URL: "https://example.onion/image.png?id=12&test=2"}).
|
||||||
|
Return(nil)
|
||||||
|
|
||||||
|
httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain", "Server": "Debian"})
|
||||||
|
httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello"))
|
||||||
|
|
||||||
|
httpClientMock.EXPECT().Get("https://example.onion/image.png?id=12&test=2").Return(httpResponseMock, nil)
|
||||||
|
|
||||||
|
subscriberMock.EXPECT().PublishMsg(&messaging.NewResourceMsg{
|
||||||
|
URL: "https://example.onion/image.png?id=12&test=2",
|
||||||
|
Body: "Hello",
|
||||||
|
Headers: map[string]string{"Content-Type": "text/plain", "Server": "Debian"},
|
||||||
|
}).Return(nil)
|
||||||
|
|
||||||
|
if err := handleMessage(httpClientMock, []string{"text/plain", "text/css"})(subscriberMock, msg); err != nil {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -79,6 +79,7 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error while extracting resource: %s", err)
|
return fmt.Errorf("error while extracting resource: %s", err)
|
||||||
}
|
}
|
||||||
|
resDto.Headers = resMsg.Headers
|
||||||
|
|
||||||
// Submit to the API
|
// Submit to the API
|
||||||
_, err = apiClient.AddResource(resDto)
|
_, err = apiClient.AddResource(resDto)
|
||||||
@ -87,7 +88,15 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Finally push found URLs
|
// Finally push found URLs
|
||||||
|
publishedURLS := map[string]string{}
|
||||||
for _, url := range urls {
|
for _, url := range urls {
|
||||||
|
if _, exist := publishedURLS[url]; exist {
|
||||||
|
log.Trace().
|
||||||
|
Str("url", url).
|
||||||
|
Msg("Skipping duplicate URL")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
log.Trace().
|
log.Trace().
|
||||||
Str("url", url).
|
Str("url", url).
|
||||||
Msg("Publishing found URL")
|
Msg("Publishing found URL")
|
||||||
@ -98,6 +107,8 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler {
|
|||||||
Str("err", err.Error()).
|
Str("err", err.Error()).
|
||||||
Msg("Error while publishing URL")
|
Msg("Error while publishing URL")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
publishedURLS[url] = url
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -42,12 +42,15 @@ This is sparta
|
|||||||
t.Fail()
|
t.Fail()
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(urls) == 0 {
|
if len(urls) != 2 {
|
||||||
t.FailNow()
|
t.FailNow()
|
||||||
}
|
}
|
||||||
if urls[0] != "https://google.com/test?test=test" {
|
if urls[0] != "https://google.com/test?test=test" {
|
||||||
t.Fail()
|
t.Fail()
|
||||||
}
|
}
|
||||||
|
if urls[1] != "https://example.org" {
|
||||||
|
t.Fail()
|
||||||
|
}
|
||||||
|
|
||||||
if resDto.Description != "Zhello world" {
|
if resDto.Description != "Zhello world" {
|
||||||
t.Fail()
|
t.Fail()
|
||||||
@ -77,7 +80,7 @@ func TestHandleMessage(t *testing.T) {
|
|||||||
body := `
|
body := `
|
||||||
<title>Creekorful Inc</title>
|
<title>Creekorful Inc</title>
|
||||||
|
|
||||||
This is sparta
|
This is sparta (hosted on https://example.org)
|
||||||
|
|
||||||
<a href="https://google.com/test?test=test#12">
|
<a href="https://google.com/test?test=test#12">
|
||||||
|
|
||||||
@ -93,8 +96,11 @@ This is sparta
|
|||||||
msg := bytes.NewReader(nil)
|
msg := bytes.NewReader(nil)
|
||||||
subscriberMock.EXPECT().
|
subscriberMock.EXPECT().
|
||||||
ReadMsg(msg, &messaging.NewResourceMsg{}).
|
ReadMsg(msg, &messaging.NewResourceMsg{}).
|
||||||
SetArg(1, messaging.NewResourceMsg{URL: "https://example.onion", Body: body}).
|
SetArg(1, messaging.NewResourceMsg{
|
||||||
Return(nil)
|
URL: "https://example.onion",
|
||||||
|
Body: body,
|
||||||
|
Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"},
|
||||||
|
}).Return(nil)
|
||||||
|
|
||||||
// make sure we are creating the resource
|
// make sure we are creating the resource
|
||||||
apiClientMock.EXPECT().AddResource(&resMatcher{target: api.ResourceDto{
|
apiClientMock.EXPECT().AddResource(&resMatcher{target: api.ResourceDto{
|
||||||
@ -103,9 +109,12 @@ This is sparta
|
|||||||
Title: "Creekorful Inc",
|
Title: "Creekorful Inc",
|
||||||
Meta: map[string]string{"description": "Zhello world", "og:url": "https://example.org"},
|
Meta: map[string]string{"description": "Zhello world", "og:url": "https://example.org"},
|
||||||
Description: "Zhello world",
|
Description: "Zhello world",
|
||||||
|
Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"},
|
||||||
}}).Return(api.ResourceDto{}, nil)
|
}}).Return(api.ResourceDto{}, nil)
|
||||||
|
|
||||||
// make sure we are pushing found URLs
|
// make sure we are pushing found URLs
|
||||||
|
|
||||||
|
// should be called only one time
|
||||||
subscriberMock.EXPECT().
|
subscriberMock.EXPECT().
|
||||||
PublishMsg(&messaging.URLFoundMsg{URL: "https://example.org"}).
|
PublishMsg(&messaging.URLFoundMsg{URL: "https://example.org"}).
|
||||||
Return(nil)
|
Return(nil)
|
||||||
@ -118,7 +127,7 @@ This is sparta
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// custom matcher to ignore time field when doing comparison
|
// custom matcher to ignore time field when doing comparison ;(
|
||||||
// todo: do less crappy?
|
// todo: do less crappy?
|
||||||
type resMatcher struct {
|
type resMatcher struct {
|
||||||
target api.ResourceDto
|
target api.ResourceDto
|
||||||
@ -131,7 +140,9 @@ func (rm *resMatcher) Matches(x interface{}) bool {
|
|||||||
arg.URL == rm.target.URL &&
|
arg.URL == rm.target.URL &&
|
||||||
arg.Body == rm.target.Body &&
|
arg.Body == rm.target.Body &&
|
||||||
arg.Description == rm.target.Description &&
|
arg.Description == rm.target.Description &&
|
||||||
exactMatch(arg.Meta, rm.target.Meta)
|
exactMatch(arg.Meta, rm.target.Meta) &&
|
||||||
|
arg.Headers["Server"][0] == rm.target.Headers["Server"][0] &&
|
||||||
|
arg.Headers["Content-Type"] == rm.target.Headers["Content-Type"] // TODO allow other headers comparison
|
||||||
}
|
}
|
||||||
|
|
||||||
func (rm *resMatcher) String() string {
|
func (rm *resMatcher) String() string {
|
||||||
|
52
internal/http/client.go
Normal file
52
internal/http/client.go
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
package http
|
||||||
|
|
||||||
|
//go:generate mockgen -destination=../http_mock/client_mock.go -package=http_mock . Client
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"github.com/valyala/fasthttp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Client is an HTTP client
|
||||||
|
type Client interface {
|
||||||
|
// Get the corresponding URL
|
||||||
|
// this methods follows redirections
|
||||||
|
Get(URL string) (Response, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
type client struct {
|
||||||
|
c *fasthttp.Client
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewFastHTTPClient create a new Client using fasthttp.Client as backend
|
||||||
|
func NewFastHTTPClient(c *fasthttp.Client) Client {
|
||||||
|
return &client{c: c}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *client) Get(URL string) (Response, error) {
|
||||||
|
req := fasthttp.AcquireRequest()
|
||||||
|
resp := fasthttp.AcquireResponse()
|
||||||
|
defer fasthttp.ReleaseRequest(req)
|
||||||
|
defer fasthttp.ReleaseResponse(resp)
|
||||||
|
|
||||||
|
req.SetRequestURI(URL)
|
||||||
|
|
||||||
|
if err := c.c.Do(req, resp); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
switch code := resp.StatusCode(); {
|
||||||
|
case code > 302:
|
||||||
|
return nil, fmt.Errorf("non-managed error code %d", code)
|
||||||
|
// follow redirect
|
||||||
|
case code == 301 || code == 302:
|
||||||
|
if location := string(resp.Header.Peek("Location")); location != "" {
|
||||||
|
return c.Get(location)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
r := &response{}
|
||||||
|
resp.CopyTo(&r.raw)
|
||||||
|
|
||||||
|
return r, nil
|
||||||
|
}
|
33
internal/http/response.go
Normal file
33
internal/http/response.go
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
package http
|
||||||
|
|
||||||
|
//go:generate mockgen -destination=../http_mock/response_mock.go -package=http_mock . Response
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"github.com/valyala/fasthttp"
|
||||||
|
"io"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Response is an HTTP response
|
||||||
|
type Response interface {
|
||||||
|
// Headers returns the response headers
|
||||||
|
Headers() map[string]string
|
||||||
|
// Body return the response body
|
||||||
|
Body() io.Reader
|
||||||
|
}
|
||||||
|
|
||||||
|
type response struct {
|
||||||
|
raw fasthttp.Response
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *response) Headers() map[string]string {
|
||||||
|
headers := map[string]string{}
|
||||||
|
r.raw.Header.VisitAll(func(key, value []byte) {
|
||||||
|
headers[string(key)] = string(value) // TODO manage multiple values?
|
||||||
|
})
|
||||||
|
return headers
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *response) Body() io.Reader {
|
||||||
|
return bytes.NewReader(r.raw.Body())
|
||||||
|
}
|
@ -41,6 +41,7 @@ func (msg *URLFoundMsg) Subject() string {
|
|||||||
type NewResourceMsg struct {
|
type NewResourceMsg struct {
|
||||||
URL string `json:"url"`
|
URL string `json:"url"`
|
||||||
Body string `json:"body"`
|
Body string `json:"body"`
|
||||||
|
Headers map[string]string `json:"headers"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Subject returns the subject where message should be push
|
// Subject returns the subject where message should be push
|
||||||
|
@ -92,6 +92,12 @@ func handleMessage(apiClient api.Client, refreshDelay time.Duration, forbiddenEx
|
|||||||
return nil // Technically not an error
|
return nil // Technically not an error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Make sure protocol is allowed
|
||||||
|
if !strings.HasPrefix(u.Scheme, "http") {
|
||||||
|
log.Trace().Stringer("url", u).Msg("URL has invalid scheme")
|
||||||
|
return nil // Technically not an error
|
||||||
|
}
|
||||||
|
|
||||||
// Make sure extension is not forbidden
|
// Make sure extension is not forbidden
|
||||||
for _, ext := range forbiddenExtensions {
|
for _, ext := range forbiddenExtensions {
|
||||||
if strings.HasSuffix(u.Path, "."+ext) {
|
if strings.HasSuffix(u.Path, "."+ext) {
|
||||||
|
@ -2,6 +2,7 @@ package scheduler
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"fmt"
|
||||||
"github.com/creekorful/trandoshan/api"
|
"github.com/creekorful/trandoshan/api"
|
||||||
"github.com/creekorful/trandoshan/api_mock"
|
"github.com/creekorful/trandoshan/api_mock"
|
||||||
"github.com/creekorful/trandoshan/internal/messaging"
|
"github.com/creekorful/trandoshan/internal/messaging"
|
||||||
@ -47,6 +48,27 @@ func TestHandleMessageNotOnion(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleMessageWrongProtocol(t *testing.T) {
|
||||||
|
mockCtrl := gomock.NewController(t)
|
||||||
|
defer mockCtrl.Finish()
|
||||||
|
|
||||||
|
apiClientMock := api_mock.NewMockClient(mockCtrl)
|
||||||
|
subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl)
|
||||||
|
|
||||||
|
msg := bytes.NewReader(nil)
|
||||||
|
|
||||||
|
for _, protocol := range []string{"irc", "ftp"} {
|
||||||
|
subscriberMock.EXPECT().
|
||||||
|
ReadMsg(msg, &messaging.URLFoundMsg{}).
|
||||||
|
SetArg(1, messaging.URLFoundMsg{URL: fmt.Sprintf("%s://example.onion", protocol)}).
|
||||||
|
Return(nil)
|
||||||
|
|
||||||
|
if err := handleMessage(apiClientMock, -1, []string{})(subscriberMock, msg); err != nil {
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleMessageAlreadyCrawled(t *testing.T) {
|
func TestHandleMessageAlreadyCrawled(t *testing.T) {
|
||||||
mockCtrl := gomock.NewController(t)
|
mockCtrl := gomock.NewController(t)
|
||||||
defer mockCtrl.Finish()
|
defer mockCtrl.Finish()
|
||||||
|
Loading…
Reference in New Issue
Block a user