Now follow redirect

pull/24/head
Aloïs Micard 4 years ago
parent a74003a818
commit 0e6477dd0a
No known key found for this signature in database
GPG Key ID: 1A0EB82F071F5EFE

@ -2,6 +2,7 @@ package crawler
import (
"crypto/tls"
"fmt"
"github.com/creekorful/trandoshan/internal/messaging"
"github.com/creekorful/trandoshan/internal/util/logging"
natsutil "github.com/creekorful/trandoshan/internal/util/nats"
@ -93,38 +94,12 @@ func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) na
return err
}
log.Debug().Str("url", urlMsg.URL).Msg("Processing URL")
// Query the website
req := fasthttp.AcquireRequest()
resp := fasthttp.AcquireResponse()
defer fasthttp.ReleaseRequest(req)
defer fasthttp.ReleaseResponse(resp)
req.SetRequestURI(urlMsg.URL)
if err := httpClient.Do(req, resp); err != nil {
log.Err(err).Msg("Error while crawling website")
body, err := crawURL(httpClient, urlMsg.URL, allowedContentTypes)
if err != nil {
log.Err(err).Str("url", urlMsg.URL).Msg("Error while crawling url")
return err
}
// Determinate if content type is allowed
allowed := false
contentType := string(resp.Header.Peek("Content-Type"))
for _, allowedContentType := range allowedContentTypes {
if strings.Contains(contentType, allowedContentType) {
allowed = true
break
}
}
if !allowed {
log.Debug().Str("content-type", contentType).Msg("Discarding forbidden content type")
return nil
}
body := string(resp.Body())
// Publish resource body
res := messaging.NewResourceMsg{
URL: urlMsg.URL,
@ -137,3 +112,44 @@ func handleMessage(httpClient *fasthttp.Client, allowedContentTypes []string) na
return nil
}
}
func crawURL(httpClient *fasthttp.Client, url string, allowedContentTypes []string) (string, error) {
log.Debug().Str("url", url).Msg("Processing URL")
// Query the website
req := fasthttp.AcquireRequest()
resp := fasthttp.AcquireResponse()
defer fasthttp.ReleaseRequest(req)
defer fasthttp.ReleaseResponse(resp)
req.SetRequestURI(url)
if err := httpClient.Do(req, resp); err != nil {
return "", err
}
switch code := resp.StatusCode(); {
// follow redirect
case code == 301 || code == 302:
if location := string(resp.Header.Peek("Location")); location != "" {
return crawURL(httpClient, location, allowedContentTypes)
}
}
// Determinate if content type is allowed
allowed := false
contentType := string(resp.Header.Peek("Content-Type"))
for _, allowedContentType := range allowedContentTypes {
if strings.Contains(contentType, allowedContentType) {
allowed = true
break
}
}
if !allowed {
err := fmt.Errorf("forbidden content type : %s", contentType)
return "", err
}
return string(resp.Body()), nil
}

Loading…
Cancel
Save