diff --git a/api/api.go b/api/api.go index a73a7ab..80eb1ec 100644 --- a/api/api.go +++ b/api/api.go @@ -31,6 +31,7 @@ type ResourceDto struct { Title string `json:"title"` Meta map[string]string `json:"meta"` Description string `json:"description"` + Headers map[string]string `json:"headers"` } // CredentialsDto represent the credential when logging in the API diff --git a/go.sum b/go.sum index d923ee5..3120c49 100644 --- a/go.sum +++ b/go.sum @@ -29,6 +29,7 @@ github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb github.com/golang/mock v1.4.4 h1:l75CXGRSwbaYNpl/Z2X1XIIAMSCquvXgpVZDhwEIJsc= github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= diff --git a/internal/api/database/database.go b/internal/api/database/database.go index dbd89d4..98cf013 100644 --- a/internal/api/database/database.go +++ b/internal/api/database/database.go @@ -20,6 +20,7 @@ type ResourceIdx struct { Title string `json:"title"` Meta map[string]string `json:"meta"` Description string `json:"description"` + Headers map[string]string `json:"headers"` } // ResSearchParams is the search params used @@ -32,6 +33,7 @@ type ResSearchParams struct { PageSize int PageNumber int // TODO allow searching by meta + // TODO allow searching by headers } // Database is the interface used to abstract communication diff --git a/internal/api/service.go b/internal/api/service.go index a900d01..ae1de56 100644 --- a/internal/api/service.go +++ b/internal/api/service.go @@ -78,6 +78,7 @@ func (s *svc) addResource(res api.ResourceDto) 
(api.ResourceDto, error) { Title: res.Title, Meta: res.Meta, Description: res.Description, + Headers: res.Headers, } if err := s.db.AddResource(doc); err != nil { diff --git a/internal/api/service_test.go b/internal/api/service_test.go index 79fd53f..4714bbb 100644 --- a/internal/api/service_test.go +++ b/internal/api/service_test.go @@ -62,6 +62,7 @@ func TestAddResource(t *testing.T) { Time: time.Time{}, Meta: map[string]string{"content": "content-meta"}, Description: "the description", + Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"}, }) s := svc{db: dbMock} @@ -73,6 +74,7 @@ func TestAddResource(t *testing.T) { Time: time.Time{}, Meta: map[string]string{"content": "content-meta"}, Description: "the description", + Headers: map[string]string{"Content-Type": "application/html", "Server": "Traefik"}, }) if err != nil { t.FailNow() @@ -96,6 +98,12 @@ func TestAddResource(t *testing.T) { if res.Description != "the description" { t.FailNow() } + if res.Headers["Content-Type"] != "application/html" { + t.FailNow() + } + if res.Headers["Server"] != "Traefik" { + t.FailNow() + } } func TestScheduleURL(t *testing.T) { diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go index 5581130..28bdb10 100644 --- a/internal/crawler/crawler.go +++ b/internal/crawler/crawler.go @@ -3,6 +3,7 @@ package crawler import ( "crypto/tls" "fmt" + "github.com/creekorful/trandoshan/internal/http" "github.com/creekorful/trandoshan/internal/logging" "github.com/creekorful/trandoshan/internal/messaging" "github.com/creekorful/trandoshan/internal/util" @@ -11,6 +12,7 @@ import ( "github.com/valyala/fasthttp" "github.com/valyala/fasthttp/fasthttpproxy" "io" + "io/ioutil" "strings" "time" ) @@ -57,7 +59,7 @@ func execute(ctx *cli.Context) error { Msg("Starting tdsh-crawler") // Create the HTTP client - httpClient := &fasthttp.Client{ + httpClient := http.NewFastHTTPClient(&fasthttp.Client{ // Use given TOR proxy to reach the hidden services 
Dial: fasthttpproxy.FasthttpSocksDialer(ctx.String("tor-uri")), // Disable SSL verification since we do not really care about this @@ -65,7 +67,7 @@ func execute(ctx *cli.Context) error { ReadTimeout: time.Second * 5, WriteTimeout: time.Second * 5, Name: ctx.String("user-agent"), - } + }) // Create the subscriber sub, err := messaging.NewSubscriber(ctx.String("event-srv-uri")) @@ -84,22 +86,23 @@ func execute(ctx *cli.Context) error { return nil } -func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) messaging.MsgHandler { +func handleMessage(httpClient http.Client, allowedContentTypes []string) messaging.MsgHandler { return func(sub messaging.Subscriber, msg io.Reader) error { var urlMsg messaging.URLTodoMsg if err := sub.ReadMsg(msg, &urlMsg); err != nil { return err } - body, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes) + body, headers, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes) if err != nil { return fmt.Errorf("error while crawling URL: %s", err) } // Publish resource body res := messaging.NewResourceMsg{ - URL: urlMsg.URL, - Body: body, + URL: urlMsg.URL, + Body: body, + Headers: headers, } if err := sub.PublishMsg(&res); err != nil { return fmt.Errorf("error while publishing resource: %s", err) @@ -109,34 +112,17 @@ func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) me } } -func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []string) (string, error) { +func crawURL(httpClient http.Client, url string, allowedContentTypes []string) (string, map[string]string, error) { log.Debug().Str("url", url).Msg("Processing URL") - // Query the website - req := fasthttp.AcquireRequest() - resp := fasthttp.AcquireResponse() - defer fasthttp.ReleaseRequest(req) - defer fasthttp.ReleaseResponse(resp) - - req.SetRequestURI(url) - - if err := httpClient.Do(req, resp); err != nil { - return "", err - } - - switch code := resp.StatusCode(); { - case code > 302: - return "", 
fmt.Errorf("non-managed error code %d", code) - // follow redirect - case code == 301 || code == 302: - if location := string(resp.Header.Peek("Location")); location != "" { - return crawURL(httpClient, location, allowedContentTypes) - } + r, err := httpClient.Get(url) + if err != nil { + return "", nil, err } // Determinate if content type is allowed allowed := false - contentType := string(resp.Header.Peek("Content-Type")) + contentType := r.Headers()["Content-Type"] for _, allowedContentType := range allowedContentTypes { if strings.Contains(contentType, allowedContentType) { allowed = true @@ -146,8 +132,13 @@ func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []stri if !allowed { err := fmt.Errorf("forbidden content type : %s", contentType) - return "", err + return "", nil, err } - return string(resp.Body()), nil + // Read body + b, err := ioutil.ReadAll(r.Body()) + if err != nil { + return "", nil, err + } + return string(b), r.Headers(), nil } diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go index c59d0de..863cdbf 100644 --- a/internal/crawler/crawler_test.go +++ b/internal/crawler/crawler_test.go @@ -1 +1,118 @@ package crawler + +import ( + "bytes" + "github.com/creekorful/trandoshan/internal/http_mock" + "github.com/creekorful/trandoshan/internal/messaging" + "github.com/creekorful/trandoshan/internal/messaging_mock" + "github.com/golang/mock/gomock" + "strings" + "testing" +) + +func TestCrawlURLForbiddenContentType(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + httpClientMock := http_mock.NewMockClient(mockCtrl) + url := "https://example.onion" + allowedContentTypes := []string{"text/plain"} + + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + httpResponseMock.EXPECT().Headers().Return(map[string]string{"Content-Type": "image/png"}) + + httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil) + + body, headers, err := crawURL(httpClientMock, url, 
allowedContentTypes) + if body != "" || headers != nil || err == nil { + t.Fail() + } +} + +func TestCrawlURLSameContentType(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + httpClientMock := http_mock.NewMockClient(mockCtrl) + url := "https://example.onion" + allowedContentTypes := []string{"text/plain"} + + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"}) + httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello")) + + httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil) + + body, headers, err := crawURL(httpClientMock, url, allowedContentTypes) + if err != nil { + t.Fail() + } + if body != "Hello" { + t.Fail() + } + if len(headers) != 1 { + t.Fail() + } + if headers["Content-Type"] != "text/plain" { + t.Fail() + } +} + +func TestCrawlURLNoContentTypeFiltering(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + httpClientMock := http_mock.NewMockClient(mockCtrl) + url := "https://example.onion" + allowedContentTypes := []string{""} + + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain"}) + httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello")) + + httpClientMock.EXPECT().Get(url).Return(httpResponseMock, nil) + + body, headers, err := crawURL(httpClientMock, url, allowedContentTypes) + if err != nil { + t.Fail() + } + if body != "Hello" { + t.Fail() + } + if len(headers) != 1 { + t.Fail() + } + if headers["Content-Type"] != "text/plain" { + t.Fail() + } +} + +func TestHandleMessage(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + subscriberMock := messaging_mock.NewMockSubscriber(mockCtrl) + httpClientMock := http_mock.NewMockClient(mockCtrl) + httpResponseMock := http_mock.NewMockResponse(mockCtrl) + + msg := 
bytes.NewReader(nil) + subscriberMock.EXPECT(). + ReadMsg(msg, &messaging.URLTodoMsg{}). + SetArg(1, messaging.URLTodoMsg{URL: "https://example.onion/image.png?id=12&test=2"}). + Return(nil) + + httpResponseMock.EXPECT().Headers().Times(2).Return(map[string]string{"Content-Type": "text/plain", "Server": "Debian"}) + httpResponseMock.EXPECT().Body().Return(strings.NewReader("Hello")) + + httpClientMock.EXPECT().Get("https://example.onion/image.png?id=12&test=2").Return(httpResponseMock, nil) + + subscriberMock.EXPECT().PublishMsg(&messaging.NewResourceMsg{ + URL: "https://example.onion/image.png?id=12&test=2", + Body: "Hello", + Headers: map[string]string{"Content-Type": "text/plain", "Server": "Debian"}, + }).Return(nil) + + if err := handleMessage(httpClientMock, []string{"text/plain", "text/css"})(subscriberMock, msg); err != nil { + t.Fail() + } +} diff --git a/internal/extractor/extractor.go b/internal/extractor/extractor.go index 8acb8d4..cb5ba4c 100644 --- a/internal/extractor/extractor.go +++ b/internal/extractor/extractor.go @@ -79,6 +79,7 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler { if err != nil { return fmt.Errorf("error while extracting resource: %s", err) } + resDto.Headers = resMsg.Headers // Submit to the API _, err = apiClient.AddResource(resDto) @@ -87,7 +88,15 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler { } // Finally push found URLs + publishedURLS := map[string]string{} for _, url := range urls { + if _, exist := publishedURLS[url]; exist { + log.Trace(). + Str("url", url). + Msg("Skipping duplicate URL") + continue + } + log.Trace(). Str("url", url). Msg("Publishing found URL") @@ -98,6 +107,8 @@ func handleMessage(apiClient api.Client) messaging.MsgHandler { Str("err", err.Error()). 
Msg("Error while publishing URL") } + + publishedURLS[url] = url } return nil diff --git a/internal/extractor/extractor_test.go b/internal/extractor/extractor_test.go index 185d95b..4754fc1 100644 --- a/internal/extractor/extractor_test.go +++ b/internal/extractor/extractor_test.go @@ -42,12 +42,15 @@ This is sparta t.Fail() } - if len(urls) == 0 { + if len(urls) != 2 { t.FailNow() } if urls[0] != "https://google.com/test?test=test" { t.Fail() } + if urls[1] != "https://example.org" { + t.Fail() + } if resDto.Description != "Zhello world" { t.Fail() @@ -77,7 +80,7 @@ func TestHandleMessage(t *testing.T) { body := `