diff --git a/api/api.go b/api/api.go index a73a7ab..80eb1ec 100644 --- a/api/api.go +++ b/api/api.go @@ -31,6 +31,7 @@ type ResourceDto struct { Title string `json:"title"` Meta map[string]string `json:"meta"` Description string `json:"description"` + Headers map[string]string `json:"headers"` } // CredentialsDto represent the credential when logging in the API diff --git a/go.sum b/go.sum index d923ee5..3120c49 100644 --- a/go.sum +++ b/go.sum @@ -29,6 +29,7 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb github.com/golang/mock v1.4.4 h1:l75CXGRSwbaYNpl/Z2X1XIIAMSCquvXgpVZDhwEIJsc= github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= diff --git a/internal/api/database/database.go b/internal/api/database/database.go index dbd89d4..98cf013 100644 --- a/internal/api/database/database.go +++ b/internal/api/database/database.go @@ -20,6 +20,7 @@ type ResourceIdx struct { Title string `json:"title"` Meta map[string]string `json:"meta"` Description string `json:"description"` + Headers map[string]string `json:"headers"` } // ResSearchParams is the search params used @@ -32,6 +33,7 @@ type ResSearchParams struct { PageSize int PageNumber int // TODO allow searching by meta + // TODO allow searching by headers } // Database is the interface used to abstract communication diff --git a/internal/api/service.go b/internal/api/service.go index a900d01..ae1de56 100644 --- a/internal/api/service.go +++ b/internal/api/service.go @@ -78,6 +78,7 @@ func (s *svc) addResource(res api.ResourceDto) (api.ResourceDto, error) { Title: res.Title, Meta: res.Meta, Description: res.Description, + Headers: res.Headers, } if err := s.db.AddResource(doc); err != nil { diff --git a/internal/api/service_test.go b/internal/api/service_test.go index 79fd53f..4714bbb 100644 --- a/internal/api/service_test.go +++ b/internal/api/service_test.go @@ -62,6 +62,7 @@ func TestAddResource(t *testing.T) { Time: time.Time{}, Meta: map[string]string{"content": "content-meta"}, Description: "the description", + Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"}, }) s := svc{db: dbMock} @@ -73,6 +74,7 @@ func TestAddResource(t *testing.T) { Time: time.Time{}, Meta: map[string]string{"content": "content-meta"}, Description: "the description", + Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"}, }) if err != nil { t.FailNow() @@ -96,6 +98,12 @@ func TestAddResource(t *testing.T) { if res.Description != "the description" { t.FailNow() } + if res.Headers["Content-Type"] != "application/html" { + t.FailNow() + } + if res.Headers["Server"] != "Traefik" { + t.FailNow() + } } func TestScheduleURL(t *testing.T) { diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go index 5581130..28bdb10 100644 --- a/internal/crawler/crawler.go +++ b/internal/crawler/crawler.go @@ -3,6 +3,7 @@ package crawler import ( "crypto/tls" "fmt" + "github.com/creekorful/trandoshan/internal/http" "github.com/creekorful/trandoshan/internal/logging" "github.com/creekorful/trandoshan/internal/messaging" "github.com/creekorful/trandoshan/internal/util" @@ -11,6 +12,7 @@ import ( "github.com/valyala/fasthttp" "github.com/valyala/fasthttp/fasthttpproxy" "io" + "io/ioutil" "strings" "time" ) @@ -57,7 +59,7 @@ func execute(ctx *cli.Context) error { Msg("Starting tdsh-crawler") // Create the HTTP client - httpClient := &fasthttp.Client{ + httpClient := http.NewFastHTTPClient(&fasthttp.Client{ // Use given TOR proxy to reach the hidden services Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")), // Disable SSL verification since we do not really care about this @@ -65,7 +67,7 @@ func execute(ctx *cli.Context) error { ReadTimeout: time.Second * 5, WriteTimeout: time.Second * 5, Name: ctx.String("user-agent"), - } + }) // Create the subscriber sub, err := messaging.NewSubscriber(ctx.String("event-srv-uri")) @@ -84,22 +86,23 @@ func execute(ctx *cli.Context) error { return nil } -func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) messaging.MsgHandler { +func handleMessage(httpClient http.Client, allowedContentTypes []string) messaging.MsgHandler { return func(sub messaging.Subscriber, msg io.Reader) error { var urlMsg messaging.URLTodoMsg if err := sub.ReadMsg(msg, &urlMsg); err != nil { return err } - body, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes) + body, headers, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes) if err != nil { return fmt.Errorf("error while crawling URL: %s", err) } // Publish resource body res := messaging.NewResourceMsg{ - URL: urlMsg.URL, - Body: body, + URL: urlMsg.URL, + Body: body, + Headers: headers, } if err := sub.PublishMsg(&res); err != nil { return fmt.Errorf("error while publishing resource: %s", err) @@ -109,34 +112,17 @@ func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) me } } -func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []string) (string, error) { +func crawURL(httpClient http.Client, url string, allowedContentTypes []string) (string, map[string]string, error) { log.Debug().Str("url", url).Msg("Processing URL") - // Query the website - req := fasthttp.AcquireRequest() - resp := fasthttp.AcquireResponse() - defer fasthttp.ReleaseRequest(req) - defer fasthttp.ReleaseResponse(resp) - - req.SetRequestURI(url) - - if err := httpClient.Do(req, resp); err != nil { - return "", err - } - - switch code := resp.StatusCode(); { - case code > 302: - return "", fmt.Errorf("non-managed error code %d", code) - // follow redirect - case code == 301 || code == 302: - if location := string(resp.Header.Peek("Location")); location != "" { - return crawURL(httpClient, location, allowedContentTypes) - } + r, err := httpClient.Get(url) + if err != nil { + return "", nil, err } // Determinate if content type is allowed allowed := false - contentType := string(resp.Header.Peek("Content-Type")) + contentType := r.Headers()["Content-Type"] for _, allowedContentType := range allowedContentTypes { if strings.Contains(contentType, allowedContentType) { allowed = true @@ -146,8 +132,13 @@ func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []stri if !allowed { err := fmt.Errorf("forbidden content type : %s", contentType) - return "", err + return "", nil, err } - return string(resp.Body()), nil + // Ready body + b, err := ioutil.ReadAll(r.Body()) + if err != nil { + return "", nil, err + } + return string(b), r.Headers(), nil } diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go index c59d0de..863cdbf 100644 --- a/internal/crawler/crawler_test.go +++ b/internal/crawler/crawler_test.go @@ -1 +1,118 @@ package crawler + +import ( + "bytes" + "github.com/creekorful/trandoshan/internal/http_mock" + "github.com/creekorful/trandoshan/internal/messaging" + "github.com/creekorful/trandoshan/internal/messaging_mock" + "github.com/golang/mock/gomock" + "strings" + "testing" +) + +func TestCrawlURLForbiddenContentType(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + httpClientMock := http_mock.NewMockClient(mockCtrl) + url := "https://example.onion" + allowedContentTypes := []string{"text/plain"} + + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + httpResponseMock.EXPECT().Headers().Return(map[string]string{"Content-Type": "image/png"}) + + httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil) + + body, headers, err := crawURL(httpClientMock, url, allowedContentTypes) + if body != "" || headers != nil || err == nil { + t.Fail() + } +} + +func TestCrawlURLSameContentType(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + httpClientMock := http_mock.NewMockClient(mockCtrl) + url := "https://example.onion" + allowedContentTypes := []string{"text/plain"} + + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"}) + httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello")) + + httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil) + + body, headers, err := crawURL(httpClientMock, url, allowedContentTypes) + if err != nil { + t.Fail() + } + if body != "Hello" { + t.Fail() + } + if len(headers) != 1 { + t.Fail() + } + if headers["Content-Type"] != "text/plain" { + t.Fail() + } +} + +func TestCrawlURLNoContentTypeFiltering(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + httpClientMock := http_mock.NewMockClient(mockCtrl) + url := "https://example.onion" + allowedContentTypes := []string{""} + + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"}) + httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello")) + + httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil) + + body, headers, err := crawURL(httpClientMock, url, allowedContentTypes) + if err != nil { + t.Fail() + } + if body != "Hello" { + t.Fail() + } + if len(headers) != 1 { + t.Fail() + } + if headers["Content-Type"] != "text/plain" { + t.Fail() + } +} + +func TestHandleMessage(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl) + httpClientMock := http_mock.NewMockClient(mockCtrl) + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + + msg := bytes.NewReader(nil) + subscriberMock.EXPECT(). + ReadMsg(msg, &messaging.URLTodoMsg{}). + SetArg(1, messaging.URLTodoMsg{URL: "https://example.onion/image.png?id=12&test=2"}). + Return(nil) + + httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain", "Server": "Debian"}) + httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello")) + + httpClientMock.EXPECT().Get("https://example.onion/image.png?id=12&test=2").Return(httpResponseMock, nil) + + subscriberMock.EXPECT().PublishMsg(&messaging.NewResourceMsg{ + URL: "https://example.onion/image.png?id=12&test=2", + Body: "Hello", + Headers: map[string]string{"Content-Type": "text/plain", "Server": "Debian"}, + }).Return(nil) + + if err := handleMessage(httpClientMock, []string{"text/plain", "text/css"})(subscriberMock, msg); err != nil { + t.Fail() + } +} diff --git a/internal/extractor/extractor.go b/internal/extractor/extractor.go index 8acb8d4..cb5ba4c 100644 --- a/internal/extractor/extractor.go +++ b/internal/extractor/extractor.go @@ -79,6 +79,7 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler { if err != nil { return fmt.Errorf("error while extracting resource: %s", err) } + resDto.Headers = resMsg.Headers // Submit to the API _, err = apiClient.AddResource(resDto) @@ -87,7 +88,15 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler { } // Finally push found URLs + publishedURLS := map[string]string{} for _, url := range urls { + if _, exist := publishedURLS[url]; exist { + log.Trace(). + Str("url", url). + Msg("Skipping duplicate URL") + continue + } + log.Trace(). Str("url", url). Msg("Publishing found URL") @@ -98,6 +107,8 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler { Str("err", err.Error()). Msg("Error while publishing URL") } + + publishedURLS[url] = url } return nil diff --git a/internal/extractor/extractor_test.go b/internal/extractor/extractor_test.go index 185d95b..4754fc1 100644 --- a/internal/extractor/extractor_test.go +++ b/internal/extractor/extractor_test.go @@ -42,12 +42,15 @@ This is sparta t.Fail() } - if len(urls) == 0 { + if len(urls) != 2 { t.FailNow() } if urls[0] != "https://google.com/test?test=test" { t.Fail() } + if urls[1] != "https://example.org" { + t.Fail() + } if resDto.Description != "Zhello world" { t.Fail() @@ -77,7 +80,7 @@ func TestHandleMessage(t *testing.T) { body := ` Creekorful Inc -This is sparta +This is sparta (hosted on https://example.org) @@ -93,8 +96,11 @@ This is sparta msg := bytes.NewReader(nil) subscriberMock.EXPECT(). ReadMsg(msg, &messaging.NewResourceMsg{}). - SetArg(1, messaging.NewResourceMsg{URL: "https://example.onion", Body: body}). - Return(nil) + SetArg(1, messaging.NewResourceMsg{ + URL: "https://example.onion", + Body: body, + Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"}, + }).Return(nil) // make sure we are creating the resource apiClientMock.EXPECT().AddResource(&resMatcher{target: api.ResourceDto{ @@ -103,9 +109,12 @@ This is sparta Title: "Creekorful Inc", Meta: map[string]string{"description": "Zhello world", "og:url": "https://example.org"}, Description: "Zhello world", + Headers: map[string]string{"Server": "Traefik", "Content-Type": "application/html"}, }}).Return(api.ResourceDto{}, nil) // make sure we are pushing found URLs + + // should be called only one time subscriberMock.EXPECT(). PublishMsg(&messaging.URLFoundMsg{URL: "https://example.org"}). Return(nil) @@ -118,7 +127,7 @@ This is sparta } } -// custom matcher to ignore time field when doing comparison +// custom matcher to ignore time field when doing comparison ;( // todo: do less crappy? type resMatcher struct { target api.ResourceDto @@ -131,7 +140,9 @@ func (rm *resMatcher) Matches(x interface{}) bool { arg.URL == rm.target.URL && arg.Body == rm.target.Body && arg.Description == rm.target.Description && - exactMatch(arg.Meta, rm.target.Meta) + exactMatch(arg.Meta, rm.target.Meta) && + arg.Headers["Server"][0] == rm.target.Headers["Server"][0] && + arg.Headers["Content-Type"] == rm.target.Headers["Content-Type"] // TODO allow other headers comparison } func (rm *resMatcher) String() string { diff --git a/internal/http/client.go b/internal/http/client.go new file mode 100644 index 0000000..6dcff01 --- /dev/null +++ b/internal/http/client.go @@ -0,0 +1,52 @@ +package http + +//go:generate mockgen -destination=../http_mock/client_mock.go -package=http_mock . Client + +import ( + "fmt" + "github.com/valyala/fasthttp" +) + +// Client is an HTTP client +type Client interface { + // Get the corresponding URL + // this methods follows redirections + Get(URL string) (Response, error) +} + +type client struct { + c *fasthttp.Client +} + +// NewFastHTTPClient create a new Client using fasthttp.Client as backend +func NewFastHTTPClient(c *fasthttp.Client) Client { + return &client{c: c} +} + +func (c *client) Get(URL string) (Response, error) { + req := fasthttp.AcquireRequest() + resp := fasthttp.AcquireResponse() + defer fasthttp.ReleaseRequest(req) + defer fasthttp.ReleaseResponse(resp) + + req.SetRequestURI(URL) + + if err := c.c.Do(req, resp); err != nil { + return nil, err + } + + switch code := resp.StatusCode(); { + case code > 302: + return nil, fmt.Errorf("non-managed error code %d", code) + // follow redirect + case code == 301 || code == 302: + if location := string(resp.Header.Peek("Location")); location != "" { + return c.Get(location) + } + } + + r := &response{} + resp.CopyTo(&r.raw) + + return r, nil +} diff --git a/internal/http/response.go b/internal/http/response.go new file mode 100644 index 0000000..03e743c --- /dev/null +++ b/internal/http/response.go @@ -0,0 +1,33 @@ +package http + +//go:generate mockgen -destination=../http_mock/response_mock.go -package=http_mock . Response + +import ( + "bytes" + "github.com/valyala/fasthttp" + "io" +) + +// Response is an HTTP response +type Response interface { + // Headers returns the response headers + Headers() map[string]string + // Body return the response body + Body() io.Reader +} + +type response struct { + raw fasthttp.Response +} + +func (r *response) Headers() map[string]string { + headers := map[string]string{} + r.raw.Header.VisitAll(func(key, value []byte) { + headers[string(key)] = string(value) // TODO manage multiple values? + }) + return headers +} + +func (r *response) Body() io.Reader { + return bytes.NewReader(r.raw.Body()) +} diff --git a/internal/messaging/messaging.go b/internal/messaging/messaging.go index 7501846..8edd3c3 100644 --- a/internal/messaging/messaging.go +++ b/internal/messaging/messaging.go @@ -39,8 +39,9 @@ func (msg *URLFoundMsg) Subject() string { // NewResourceMsg represent a crawled resource type NewResourceMsg struct { - URL string `json:"url"` - Body string `json:"body"` + URL string `json:"url"` + Body string `json:"body"` + Headers map[string]string `json:"headers"` } // Subject returns the subject where message should be push diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go index d0503ef..3e6c92b 100644 --- a/internal/scheduler/scheduler.go +++ b/internal/scheduler/scheduler.go @@ -92,6 +92,12 @@ func handleMessage(apiClient api.Client, refreshDelay time.Duration, forbiddenEx return nil // Technically not an error } + // Make sure protocol is allowed + if !strings.HasPrefix(u.Scheme, "http") { + log.Trace().Stringer("url", u).Msg("URL has invalid scheme") + return nil // Technically not an error + } + // Make sure extension is not forbidden for _, ext := range forbiddenExtensions { if strings.HasSuffix(u.Path, "."+ext) { diff --git a/internal/scheduler/scheduler_test.go b/internal/scheduler/scheduler_test.go index 333741a..85ca522 100644 --- a/internal/scheduler/scheduler_test.go +++ b/internal/scheduler/scheduler_test.go @@ -2,6 +2,7 @@ package scheduler import ( "bytes" + "fmt" "github.com/creekorful/trandoshan/api" "github.com/creekorful/trandoshan/api_mock" "github.com/creekorful/trandoshan/internal/messaging" @@ -47,6 +48,27 @@ func TestHandleMessageNotOnion(t *testing.T) { } } +func TestHandleMessageWrongProtocol(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + apiClientMock := api_mock.NewMockClient(mockCtrl) + subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl) + + msg := bytes.NewReader(nil) + + for _, protocol := range []string{"irc", "ftp"} { + subscriberMock.EXPECT(). + ReadMsg(msg, &messaging.URLFoundMsg{}). + SetArg(1, messaging.URLFoundMsg{URL: fmt.Sprintf("%s://example.onion", protocol)}). + Return(nil) + + if err := handleMessage(apiClientMock, -1, []string{})(subscriberMock, msg); err != nil { + t.FailNow() + } + } +} + func TestHandleMessageAlreadyCrawled(t *testing.T) { mockCtrl := gomock.NewController(t) defer mockCtrl.Finish()