Merge pull request #140 from creekorful/develop

Release 1.0.0-rc1
pull/143/head
Aloïs Micard 3 years ago committed by GitHub
commit 0cdf4811b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,26 @@
# Continuous Delivery: runs `goreleaser release` whenever a tag is pushed.
name: Continuous Delivery
on:
  push:
    tags:
      - '*'
jobs:
  goreleaser:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v2
        with:
          # GoReleaser reads the full history and all tags to build the
          # changelog; the default shallow clone (depth 1) hides them.
          fetch-depth: 0
      - name: Set up Go
        uses: actions/setup-go@v2
        with:
          # Quoted so the version is always read as a string, never a float.
          go-version: '1.14'
      # NOTE(review): .goreleaser.yml declares Docker images with
      # `skip_push: false`, but no registry login step is visible in this
      # workflow - confirm that credentials are provided before releasing.
      - name: Run GoReleaser
        uses: goreleaser/goreleaser-action@v2
        with:
          version: latest
          args: release --rm-dist
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

@ -1,4 +1,4 @@
name: CI
name: Continuous Integration
on:
push:

3
.gitignore vendored

@ -1,2 +1,3 @@
.idea/
**/**_mock.go
**/**_mock.go
dist/

@ -0,0 +1,96 @@
# GoReleaser configuration: builds the five bs-* binaries for linux/amd64
# and packages each one into its own Docker image.
before:
  hooks:
    - go mod download

# One build per component; every build id matches the produced binary name.
builds:
  - id: bs-blacklister
    main: ./cmd/bs-blacklister/bs-blacklister.go
    binary: bs-blacklister
    goos:
      - linux
    goarch:
      - amd64
  - id: bs-configapi
    main: ./cmd/bs-configapi/bs-configapi.go
    binary: bs-configapi
    goos:
      - linux
    goarch:
      - amd64
  - id: bs-crawler
    main: ./cmd/bs-crawler/bs-crawler.go
    binary: bs-crawler
    goos:
      - linux
    goarch:
      - amd64
  - id: bs-indexer
    main: ./cmd/bs-indexer/bs-indexer.go
    binary: bs-indexer
    goos:
      - linux
    goarch:
      - amd64
  - id: bs-scheduler
    main: ./cmd/bs-scheduler/bs-scheduler.go
    binary: bs-scheduler
    goos:
      - linux
    goarch:
      - amd64

# Each image is tagged `latest`, with the tag minus its leading "v", and with
# the major version. `trimprefix` is used instead of `replace` because
# `replace .Tag "v" ""` would remove every "v" in the tag, not only the
# leading one.
dockers:
  - goos: linux
    goarch: amd64
    binaries:
      - bs-blacklister
    image_templates:
      - 'creekorful/bs-blacklister:latest'
      - 'creekorful/bs-blacklister:{{ trimprefix .Tag "v" }}'
      - 'creekorful/bs-blacklister:{{ .Major }}'
    skip_push: false
    dockerfile: build/docker/Dockerfile.blacklister
  - goos: linux
    goarch: amd64
    binaries:
      - bs-configapi
    image_templates:
      - 'creekorful/bs-configapi:latest'
      - 'creekorful/bs-configapi:{{ trimprefix .Tag "v" }}'
      - 'creekorful/bs-configapi:{{ .Major }}'
    skip_push: false
    dockerfile: build/docker/Dockerfile.configapi
  - goos: linux
    goarch: amd64
    binaries:
      - bs-crawler
    image_templates:
      - 'creekorful/bs-crawler:latest'
      - 'creekorful/bs-crawler:{{ trimprefix .Tag "v" }}'
      - 'creekorful/bs-crawler:{{ .Major }}'
    skip_push: false
    dockerfile: build/docker/Dockerfile.crawler
  - goos: linux
    goarch: amd64
    binaries:
      - bs-indexer
    image_templates:
      - 'creekorful/bs-indexer:latest'
      - 'creekorful/bs-indexer:{{ trimprefix .Tag "v" }}'
      - 'creekorful/bs-indexer:{{ .Major }}'
    skip_push: false
    dockerfile: build/docker/Dockerfile.indexer
  - goos: linux
    goarch: amd64
    binaries:
      - bs-scheduler
    image_templates:
      - 'creekorful/bs-scheduler:latest'
      - 'creekorful/bs-scheduler:{{ trimprefix .Tag "v" }}'
      - 'creekorful/bs-scheduler:{{ .Major }}'
    skip_push: false
    dockerfile: build/docker/Dockerfile.scheduler

checksum:
  name_template: 'checksums.txt'

snapshot:
  name_template: '{{ .Tag }}-{{ .ShortCommit }}'

release:
  prerelease: true

@ -0,0 +1,16 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres
to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.0.0-rc1] - 2021-01-12
Initial stable release candidate.
[unreleased]: https://github.com/creekorful/bathyscaphe/compare/v1.0.0-rc1...HEAD
[1.0.0-rc1]: https://github.com/creekorful/bathyscaphe/releases/tag/v1.0.0-rc1

@ -1,17 +1,8 @@
# Trandoshan dark web crawler
# Bathyscaphe dark web crawler
![CI](https://github.com/creekorful/trandoshan/workflows/CI/badge.svg)
![CI](https://github.com/creekorful/bathyscaphe/workflows/Continuous%20Integration/badge.svg)
This repository is a complete rewrite of the Trandoshan dark web crawler. Everything has been written inside a single
Git repository to ease maintenance.
## Why a rewrite?
The first version of Trandoshan [(available here)](https://github.com/trandoshan-io) is working great but not really
professional, the code start to be a mess, hard to manage since split in multiple repositories, etc.
I have therefore decided to create & maintain the project in this specific repository, where all components code will be
available (as a Go module).
Bathyscaphe is a fast, highly configurable, cloud-native dark web crawler written in Go.
# How to start the crawler
@ -30,7 +21,8 @@ and wait for all containers to start.
# How to initiate crawling
One can use the RabbitMQ dashhboard available at localhost:15003, and publish a new JSON object in the **crawlingQueue**.
One can use the RabbitMQ dashboard available at localhost:15003, and publish a new JSON object in the **crawlingQueue**.
The object should look like this:
@ -62,10 +54,10 @@ If you've made a change to one of the crawler component and wish to use the upda
just need to issue the following command:
```sh
$ ./script/build.sh
$ goreleaser --snapshot --skip-publish --rm-dist
```
this will rebuild all crawler images using local changes. After that just run start.sh again to have the updated version
This will rebuild all images using local changes. After that, just run start.sh again to have the updated version
running.
# Architecture

@ -0,0 +1,5 @@
# Runtime image for bs-blacklister; expects the prebuilt binary in the build context.
FROM alpine:latest
# COPY rather than ADD: a plain local file needs neither archive extraction nor URL fetching.
COPY bs-blacklister /usr/bin/bs-blacklister
ENTRYPOINT ["/usr/bin/bs-blacklister"]

@ -0,0 +1,5 @@
# Runtime image for bs-configapi; expects the prebuilt binary in the build context.
FROM alpine:latest
# COPY rather than ADD: a plain local file needs neither archive extraction nor URL fetching.
COPY bs-configapi /usr/bin/bs-configapi
ENTRYPOINT ["/usr/bin/bs-configapi"]

@ -0,0 +1,5 @@
# Runtime image for bs-crawler; expects the prebuilt binary in the build context.
FROM alpine:latest
# COPY rather than ADD: a plain local file needs neither archive extraction nor URL fetching.
COPY bs-crawler /usr/bin/bs-crawler
ENTRYPOINT ["/usr/bin/bs-crawler"]

@ -0,0 +1,5 @@
# Runtime image for bs-indexer; expects the prebuilt binary in the build context.
FROM alpine:latest
# COPY rather than ADD: a plain local file needs neither archive extraction nor URL fetching.
COPY bs-indexer /usr/bin/bs-indexer
ENTRYPOINT ["/usr/bin/bs-indexer"]

@ -0,0 +1,5 @@
# Runtime image for bs-scheduler; expects the prebuilt binary in the build context.
FROM alpine:latest
# COPY rather than ADD: a plain local file needs neither archive extraction nor URL fetching.
COPY bs-scheduler /usr/bin/bs-scheduler
ENTRYPOINT ["/usr/bin/bs-scheduler"]

@ -1,24 +0,0 @@
# build image
FROM golang:1.15.0-alpine as builder
RUN apk update && apk upgrade && \
apk add --no-cache bash git openssh
WORKDIR /app
# Copy and download dependencies to cache them and faster build time
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Test then build app
RUN go build -v github.com/creekorful/trandoshan/cmd/tdsh-blacklister
# runtime image
FROM alpine:latest
COPY --from=builder /app/tdsh-blacklister /app/
WORKDIR /app/
ENTRYPOINT ["./tdsh-blacklister"]

@ -1,24 +0,0 @@
# build image
FROM golang:1.15.0-alpine as builder
RUN apk update && apk upgrade && \
apk add --no-cache bash git openssh
WORKDIR /app
# Copy and download dependencies to cache them and faster build time
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Test then build app
RUN go build -v github.com/creekorful/trandoshan/cmd/tdsh-configapi
# runtime image
FROM alpine:latest
COPY --from=builder /app/tdsh-configapi /app/
WORKDIR /app/
ENTRYPOINT ["./tdsh-configapi"]

@ -1,24 +0,0 @@
# build image
FROM golang:1.15.0-alpine as builder
RUN apk update && apk upgrade && \
apk add --no-cache bash git openssh
WORKDIR /app
# Copy and download dependencies to cache them and faster build time
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Test then build app
RUN go build -v github.com/creekorful/trandoshan/cmd/tdsh-crawler
# runtime image
FROM alpine:latest
COPY --from=builder /app/tdsh-crawler /app/
WORKDIR /app/
ENTRYPOINT ["./tdsh-crawler"]

@ -1,24 +0,0 @@
# build image
FROM golang:1.15.0-alpine as builder
RUN apk update && apk upgrade && \
apk add --no-cache bash git openssh
WORKDIR /app
# Copy and download dependencies to cache them and faster build time
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Test then build app
RUN go build -v github.com/creekorful/trandoshan/cmd/tdsh-indexer
# runtime image
FROM alpine:latest
COPY --from=builder /app/tdsh-indexer /app/
WORKDIR /app/
ENTRYPOINT ["./tdsh-indexer"]

@ -1,24 +0,0 @@
# build image
FROM golang:1.15.0-alpine as builder
RUN apk update && apk upgrade && \
apk add --no-cache bash git openssh
WORKDIR /app
# Copy and download dependencies to cache them and faster build time
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Test then build app
RUN go build -v github.com/creekorful/trandoshan/cmd/tdsh-scheduler
# runtime image
FROM alpine:latest
COPY --from=builder /app/tdsh-scheduler /app/
WORKDIR /app/
ENTRYPOINT ["./tdsh-scheduler"]

@ -1,8 +1,8 @@
package main
import (
"github.com/creekorful/trandoshan/internal/blacklister"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/blacklister"
"github.com/creekorful/bathyscaphe/internal/process"
"os"
)

@ -1,8 +1,8 @@
package main
import (
"github.com/creekorful/trandoshan/internal/configapi"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/configapi"
"github.com/creekorful/bathyscaphe/internal/process"
"os"
)

@ -1,8 +1,8 @@
package main
import (
"github.com/creekorful/trandoshan/internal/crawler"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/crawler"
"github.com/creekorful/bathyscaphe/internal/process"
"os"
)

@ -1,8 +1,8 @@
package main
import (
"github.com/creekorful/trandoshan/internal/indexer"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/indexer"
"github.com/creekorful/bathyscaphe/internal/process"
"os"
)

@ -1,8 +1,8 @@
package main
import (
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/trandoshan/internal/scheduler"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/creekorful/bathyscaphe/internal/scheduler"
"os"
)

@ -29,7 +29,7 @@ services:
volumes:
- redisdata:/data
crawler:
image: creekorful/tdsh-crawler:latest
image: creekorful/bs-crawler:latest
command: >
--log-level debug
--event-srv amqp://guest:guest@rabbitmq:5672
@ -41,20 +41,20 @@ services:
- torproxy
- configapi
scheduler:
image: creekorful/tdsh-scheduler:latest
image: creekorful/bs-scheduler:latest
command: >
--log-level debug
--event-srv amqp://guest:guest@rabbitmq:5672
--event-prefetch 20
--config-api http://configapi:8080
--redis redis:6379
--cache-srv redis://redis:6379
restart: always
depends_on:
- rabbitmq
- configapi
- redis
indexer-local:
image: creekorful/tdsh-indexer:latest
image: creekorful/bs-indexer:latest
command: >
--log-level debug
--event-srv amqp://guest:guest@rabbitmq:5672
@ -68,7 +68,7 @@ services:
- rabbitmq
- configapi
indexer-es:
image: creekorful/tdsh-indexer:latest
image: creekorful/bs-indexer:latest
command: >
--log-level debug
--event-srv amqp://guest:guest@rabbitmq:5672
@ -82,11 +82,11 @@ services:
- elasticsearch
- configapi
configapi:
image: creekorful/tdsh-configapi:latest
image: creekorful/bs-configapi:latest
command: >
--log-level debug
--event-srv amqp://guest:guest@rabbitmq:5672
--redis redis:6379
--cache-srv redis://redis:6379
--default-value forbidden-hostnames="[]"
--default-value allowed-mime-types="[{\"content-type\":\"text/\",\"extensions\":[\"html\",\"php\",\"aspx\", \"htm\"]}]"
--default-value refresh-delay="{\"delay\": 0}"
@ -98,12 +98,12 @@ services:
ports:
- 15006:8080
blacklister:
image: creekorful/tdsh-blacklister:latest
image: creekorful/bs-blacklister:latest
command: >
--log-level debug
--event-srv amqp://guest:guest@rabbitmq:5672
--config-api http://configapi:8080
--redis redis:6379
--cache-srv redis://redis:6379
--tor-proxy torproxy:9050
restart: always
depends_on:

@ -1,4 +1,4 @@
module github.com/creekorful/trandoshan
module github.com/creekorful/bathyscaphe
go 1.14
@ -6,12 +6,9 @@ require (
github.com/PuerkitoBio/goquery v1.6.0
github.com/PuerkitoBio/purell v1.1.1
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/go-redis/redis/v8 v8.4.4
github.com/go-resty/resty/v2 v2.3.0
github.com/golang/mock v1.4.4
github.com/gorilla/mux v1.8.0
github.com/olekukonko/tablewriter v0.0.4
github.com/olivere/elastic/v7 v7.0.20
github.com/rs/zerolog v1.20.0
github.com/streadway/amqp v1.0.0

@ -17,8 +17,6 @@ github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSY
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw=
@ -28,8 +26,6 @@ github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWo
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/go-redis/redis/v8 v8.4.4 h1:fGqgxCTR1sydaKI00oQf3OmkU/DIe/I/fYXvGklCIuc=
github.com/go-redis/redis/v8 v8.4.4/go.mod h1:nA0bQuF0i5JFx4Ta9RZxGKXFrQ8cRWntra97f0196iY=
github.com/go-resty/resty/v2 v2.3.0 h1:JOOeAvjSlapTT92p8xiS19Zxev1neGikoHsXJeOq8So=
github.com/go-resty/resty/v2 v2.3.0/go.mod h1:UpN9CgLZNsv4e9XG50UU8xdI0F43UQ4HmxLBDwaroHU=
github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
@ -66,12 +62,8 @@ github.com/klauspost/cpuid v1.2.1 h1:vJi+O/nMdFt0vqm8NZBI6wzALWdA2X+egi0ogNyrC/w
github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA=
github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-runewidth v0.0.7 h1:Ei8KR0497xHyKJPAv59M1dkC+rOZCMBJ+t3fZ+twI54=
github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78=
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8=
github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA=
github.com/olivere/elastic/v7 v7.0.20 h1:5FFpGPVJlBSlWBOdict406Y3yNTIpVpAiUvdFZeSbAo=
github.com/olivere/elastic/v7 v7.0.20/go.mod h1:Kh7iIsXIBl5qRQOBFoylCsXVTtye3keQU2Y/YbR7HD8=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
@ -138,8 +130,6 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297 h1:k7pJ2yAPLPgbskkFdhRCsA77k2fySZ1zf2zCjvQCiIM=
golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200513185701-a91f0712d120 h1:EZ3cVSzKOlJxAd8e8YAJ7no8nNypTxexh/YE/xW3ZEY=
golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb h1:eBmm0M9fYhWpKZLjQUUKka/LtIxf46G4fxeEz5KJr9U=
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=

@ -2,11 +2,11 @@ package blacklister
import (
"fmt"
"github.com/creekorful/trandoshan/internal/cache"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/event"
chttp "github.com/creekorful/trandoshan/internal/http"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/cache"
configapi "github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/event"
chttp "github.com/creekorful/bathyscaphe/internal/http"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"net/http"
@ -27,6 +27,19 @@ func (state *State) Name() string {
return "blacklister"
}
// Description returns a human-readable description of the blacklister
// process: what it does and which event it consumes.
func (state *State) Description() string {
	return `
The blacklisting component. It consumes timeout URL events and will try to
crawl the hostname index page to determine if the whole hostname does not
respond. If the hostname does not respond after a retry policy, it will
be blacklisted by the process and further crawling events involving the hostname
will be discarded by the crawling process. This allows us to not waste time
crawling for nothing.
This process consumes the 'url.timeout' event.`
}
// Features return the process features
func (state *State) Features() []process.Feature {
return []process.Feature{process.EventFeature, process.ConfigFeature, process.CacheFeature, process.CrawlingFeature}

@ -2,16 +2,16 @@ package blacklister
import (
"errors"
"github.com/creekorful/trandoshan/internal/cache_mock"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/creekorful/trandoshan/internal/http"
"github.com/creekorful/trandoshan/internal/http_mock"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/trandoshan/internal/process_mock"
"github.com/creekorful/trandoshan/internal/test"
"github.com/creekorful/bathyscaphe/internal/cache_mock"
configapi "github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/configapi/client_mock"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/event_mock"
"github.com/creekorful/bathyscaphe/internal/http"
"github.com/creekorful/bathyscaphe/internal/http_mock"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/creekorful/bathyscaphe/internal/process_mock"
"github.com/creekorful/bathyscaphe/internal/test"
"github.com/golang/mock/gomock"
"testing"
"time"

@ -4,6 +4,9 @@ import (
"context"
"fmt"
"github.com/go-redis/redis/v8"
"net/url"
"strconv"
"strings"
"time"
)
@ -14,11 +17,13 @@ type redisCache struct {
// NewRedisCache return a new Cache using redis as backend
func NewRedisCache(URI string, keyPrefix string) (Cache, error) {
opts, err := parseRedisOpts(URI)
if err != nil {
return nil, err
}
return &redisCache{
client: redis.NewClient(&redis.Options{
Addr: URI,
DB: 0,
}),
client: redis.NewClient(opts),
keyPrefix: keyPrefix,
}, nil
}
@ -103,3 +108,35 @@ func (rc *redisCache) getKey(key string) string {
return fmt.Sprintf("%s:%s", rc.keyPrefix, key)
}
// parseRedisOpts converts a Redis connection URL of the form
// redis://[username[:password]@]host:port[/db] into go-redis client options.
//
// The username defaults to "default" and the password to the empty string
// when the URL carries no user info; the database index defaults to 0 when
// the URL has no path. An error is returned when the URL cannot be parsed,
// has no host, or names a database that is not a non-negative integer.
func parseRedisOpts(URL string) (*redis.Options, error) {
	parsed, err := url.Parse(URL)
	if err != nil {
		return nil, err
	}

	// A bare "host:port" value parses as scheme "host" with an empty Host.
	// Reject it here rather than handing the client an empty address.
	// The URL itself is not echoed because it may embed a password.
	if parsed.Host == "" {
		return nil, fmt.Errorf("invalid redis URL: missing host (expected redis://host:port)")
	}

	username, password := "default", ""
	if info := parsed.User; info != nil {
		if name := info.Username(); name != "" {
			username = name
		}
		if pass, ok := info.Password(); ok {
			password = pass
		}
	}

	// The path, when present, selects the database index. A malformed index
	// is reported instead of silently falling back to database 0.
	db := 0
	if index := strings.TrimPrefix(parsed.Path, "/"); index != "" {
		db, err = strconv.Atoi(index)
		if err != nil {
			return nil, fmt.Errorf("invalid redis database %q: %w", index, err)
		}
		if db < 0 {
			return nil, fmt.Errorf("invalid redis database %q: must not be negative", index)
		}
	}

	return &redis.Options{
		Addr:     parsed.Host,
		Username: username,
		Password: password,
		DB:       db,
	}, nil
}

@ -13,3 +13,41 @@ func TestRedisCache_GetKey(t *testing.T) {
t.Errorf("got %s want %s", got, "config:user")
}
}
// TestParseRedisOpts checks that parseRedisOpts extracts the address,
// credentials and database index from a Redis URL, and applies the defaults
// when the optional parts are absent.
func TestParseRedisOpts(t *testing.T) {
	testCases := []struct {
		name     string
		url      string
		username string
		password string
		addr     string
		db       int
	}{
		{
			name:     "defaults",
			url:      "redis://redis:6379",
			username: "default",
			password: "",
			addr:     "redis:6379",
			db:       0,
		},
		{
			name:     "credentials and database",
			url:      "redis://default:password@redis:6379/42",
			username: "default",
			password: "password",
			addr:     "redis:6379",
			db:       42,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			opts, err := parseRedisOpts(tc.url)
			if err != nil {
				// Report the error instead of failing silently.
				t.Fatalf("parseRedisOpts(%q) returned an unexpected error: %v", tc.url, err)
			}

			if opts.Username != tc.username {
				t.Errorf("wrong username: (got: %q, want: %q)", opts.Username, tc.username)
			}
			if opts.Password != tc.password {
				t.Errorf("wrong password: (got: %q, want: %q)", opts.Password, tc.password)
			}
			if opts.Addr != tc.addr {
				t.Errorf("wrong addr: (got: %q, want: %q)", opts.Addr, tc.addr)
			}
			if opts.DB != tc.db {
				t.Errorf("wrong DB: (got: %d, want: %d)", opts.DB, tc.db)
			}
		})
	}
}

@ -6,7 +6,7 @@ import (
"bytes"
"encoding/json"
"fmt"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/rs/zerolog/log"
"io/ioutil"
"net/http"

@ -1,8 +1,8 @@
package client
import (
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/event_mock"
"github.com/golang/mock/gomock"
"sync"
"testing"

@ -2,9 +2,9 @@ package configapi
import (
"fmt"
"github.com/creekorful/trandoshan/internal/cache"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/cache"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/gorilla/mux"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
@ -24,6 +24,19 @@ func (state *State) Name() string {
return "configapi"
}
// Description returns a human-readable description of the ConfigAPI
// process: what it does and which event it produces.
func (state *State) Description() string {
	return `
The ConfigAPI component. It serves as a centralized K/V database
with notification support.
This component exposes a REST API to allow other processes to retrieve
configuration at startup time, and to allow value updates at runtime.
Each time a configuration is updated through the API, an event will
be dispatched so that running processes can update their local values.
This component produces the 'config' event.`
}
// Features return the process features
func (state *State) Features() []process.Feature {
return []process.Feature{process.EventFeature, process.CacheFeature}

@ -1,13 +1,13 @@
package configapi
import (
"github.com/creekorful/trandoshan/internal/cache"
"github.com/creekorful/trandoshan/internal/cache_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/trandoshan/internal/process_mock"
"github.com/creekorful/trandoshan/internal/test"
"github.com/creekorful/bathyscaphe/internal/cache"
"github.com/creekorful/bathyscaphe/internal/cache_mock"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/event_mock"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/creekorful/bathyscaphe/internal/process_mock"
"github.com/creekorful/bathyscaphe/internal/test"
"github.com/golang/mock/gomock"
"github.com/gorilla/mux"
"io/ioutil"

@ -1,7 +1,7 @@
package constraint
import (
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
configapi "github.com/creekorful/bathyscaphe/internal/configapi/client"
"net/url"
"strings"
)

@ -1,8 +1,8 @@
package constraint
import (
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/configapi/client_mock"
"github.com/golang/mock/gomock"
"testing"
)

@ -2,12 +2,12 @@ package crawler
import (
"fmt"
"github.com/creekorful/trandoshan/internal/clock"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
"github.com/creekorful/trandoshan/internal/event"
chttp "github.com/creekorful/trandoshan/internal/http"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/clock"
configapi "github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/constraint"
"github.com/creekorful/bathyscaphe/internal/event"
chttp "github.com/creekorful/bathyscaphe/internal/http"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"io/ioutil"
@ -32,6 +32,17 @@ func (state *State) Name() string {
return "crawler"
}
// Description returns a human-readable description of the crawler
// process: what it does, which event it consumes and which it produces.
func (state *State) Description() string {
	return `
The crawling component. It consumes URLs, crawls the resource, and
publishes the result (page content + headers).
The crawler consumes the 'url.new' event and produces either:
- 'url.timeout' event if the crawling has failed because of a timeout issue
- 'resource.new' event if the crawling has succeeded.`
}
// Features return the process features
func (state *State) Features() []process.Feature {
return []process.Feature{process.EventFeature, process.ConfigFeature, process.CrawlingFeature}

@ -2,16 +2,16 @@ package crawler
import (
"errors"
"github.com/creekorful/trandoshan/internal/clock_mock"
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/creekorful/trandoshan/internal/http"
"github.com/creekorful/trandoshan/internal/http_mock"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/trandoshan/internal/process_mock"
"github.com/creekorful/trandoshan/internal/test"
"github.com/creekorful/bathyscaphe/internal/clock_mock"
"github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/configapi/client_mock"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/event_mock"
"github.com/creekorful/bathyscaphe/internal/http"
"github.com/creekorful/bathyscaphe/internal/http_mock"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/creekorful/bathyscaphe/internal/process_mock"
"github.com/creekorful/bathyscaphe/internal/test"
"github.com/golang/mock/gomock"
"strings"
"testing"

@ -1,7 +1,7 @@
package index
import (
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/bathyscaphe/internal/event"
"testing"
"time"
)

@ -2,11 +2,11 @@ package indexer
import (
"fmt"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/indexer/index"
"github.com/creekorful/trandoshan/internal/process"
configapi "github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/constraint"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/indexer/index"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"net/http"
@ -29,6 +29,15 @@ func (state *State) Name() string {
return "indexer"
}
// Description returns a human-readable description of the indexer
// process: what it does and which event it consumes.
func (state *State) Description() string {
	return `
The indexing component. It consumes crawled resources, formats
them and finally indexes them using the configured driver.
This component consumes the 'resource.new' event.`
}
// Features return the process features
func (state *State) Features() []process.Feature {
return []process.Feature{process.EventFeature, process.ConfigFeature}

@ -2,15 +2,15 @@ package indexer
import (
"errors"
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/creekorful/trandoshan/internal/indexer/index"
"github.com/creekorful/trandoshan/internal/indexer/index_mock"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/trandoshan/internal/process_mock"
"github.com/creekorful/trandoshan/internal/test"
"github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/configapi/client_mock"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/event_mock"
"github.com/creekorful/bathyscaphe/internal/indexer/index"
"github.com/creekorful/bathyscaphe/internal/indexer/index_mock"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/creekorful/bathyscaphe/internal/process_mock"
"github.com/creekorful/bathyscaphe/internal/test"
"github.com/golang/mock/gomock"
"reflect"
"testing"

@ -6,11 +6,11 @@ import (
"context"
"crypto/tls"
"fmt"
"github.com/creekorful/trandoshan/internal/cache"
"github.com/creekorful/trandoshan/internal/clock"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/event"
chttp "github.com/creekorful/trandoshan/internal/http"
"github.com/creekorful/bathyscaphe/internal/cache"
"github.com/creekorful/bathyscaphe/internal/clock"
configapi "github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/event"
chttp "github.com/creekorful/bathyscaphe/internal/http"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
@ -27,7 +27,7 @@ import (
type Feature int
const (
version = "0.11.0"
version = "1.0.0-rc1"
// EventFeature is the feature to plug the process to the event server
EventFeature Feature = iota
@ -43,7 +43,7 @@ const (
eventURIFlag = "event-srv"
configAPIURIFlag = "config-api"
redisURIFlag = "redis"
cacheSRVFlag = "cache-srv"
torURIFlag = "tor-proxy"
userAgentFlag = "user-agent"
)
@ -101,7 +101,7 @@ func (p *defaultProvider) Publisher() (event.Publisher, error) {
}
func (p *defaultProvider) Cache(keyPrefix string) (cache.Cache, error) {
return cache.NewRedisCache(p.ctx.String(redisURIFlag), keyPrefix)
return cache.NewRedisCache(p.ctx.String(cacheSRVFlag), keyPrefix)
}
func (p *defaultProvider) HTTPClient() (chttp.Client, error) {
@ -135,9 +135,10 @@ type SubscriberDef struct {
Handler event.Handler
}
// Process is a component of Trandoshan
// Process is a component of Bathyscaphe
type Process interface {
Name() string
Description() string
Features() []Feature
CustomFlags() []cli.Flag
Initialize(provider Provider) error
@ -148,9 +149,10 @@ type Process interface {
// MakeApp return cli.App corresponding for given Process
func MakeApp(process Process) *cli.App {
app := &cli.App{
Name: fmt.Sprintf("tdsh-%s", process.Name()),
Version: version,
Usage: fmt.Sprintf("Trandoshan %s component", process.Name()),
Name: fmt.Sprintf("bs-%s", process.Name()),
Version: version,
Usage: fmt.Sprintf("Bathyscaphe %s component", process.Name()),
Description: process.Description(),
Flags: []cli.Flag{
&cli.StringFlag{
Name: "log-level",
@ -158,6 +160,12 @@ func MakeApp(process Process) *cli.App {
Value: "info",
},
},
Authors: []*cli.Author{
{
Name: "Aloïs Micard",
Email: "alois@micard.lu",
},
},
Action: execute(process),
}
@ -275,8 +283,8 @@ func getFeaturesFlags() map[Feature][]cli.Flag {
flags[CacheFeature] = []cli.Flag{
&cli.StringFlag{
Name: redisURIFlag,
Usage: "URI to the Redis server",
Name: cacheSRVFlag,
Usage: "URI to the cache server",
Required: true,
},
}

@ -4,11 +4,11 @@ import (
"errors"
"fmt"
"github.com/PuerkitoBio/purell"
"github.com/creekorful/trandoshan/internal/cache"
configapi "github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/constraint"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/bathyscaphe/internal/cache"
configapi "github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/constraint"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/rs/zerolog/log"
"github.com/urfave/cli/v2"
"hash/fnv"
@ -38,6 +38,18 @@ func (state *State) Name() string {
return "scheduler"
}
// Description returns a human-readable description of the scheduler
// process: what it does, which event it consumes and which it produces.
func (state *State) Description() string {
	return `
The scheduling component. It extracts URLs from crawled resources
and applies a predicate to determine if the URL is eligible
for crawling. If it is, it will publish an event and update the
scheduling cache.
This component consumes the 'resource.new' event and produces
the 'url.new' event.`
}
// Features return the process features
func (state *State) Features() []process.Feature {
return []process.Feature{process.EventFeature, process.ConfigFeature, process.CacheFeature}
@ -92,7 +104,7 @@ func (state *State) handleNewResourceEvent(subscriber event.Subscriber, msg even
}
// We are working using URL hash to reduce memory consumption.
// See: https://github.com/creekorful/trandoshan/issues/130
// See: https://github.com/creekorful/bathyscaphe/issues/130
var urlHashes []string
for _, u := range urls {
c := fnv.New64()

@ -2,15 +2,15 @@ package scheduler
import (
"errors"
"github.com/creekorful/trandoshan/internal/cache"
"github.com/creekorful/trandoshan/internal/cache_mock"
"github.com/creekorful/trandoshan/internal/configapi/client"
"github.com/creekorful/trandoshan/internal/configapi/client_mock"
"github.com/creekorful/trandoshan/internal/event"
"github.com/creekorful/trandoshan/internal/event_mock"
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/trandoshan/internal/process_mock"
"github.com/creekorful/trandoshan/internal/test"
"github.com/creekorful/bathyscaphe/internal/cache"
"github.com/creekorful/bathyscaphe/internal/cache_mock"
"github.com/creekorful/bathyscaphe/internal/configapi/client"
"github.com/creekorful/bathyscaphe/internal/configapi/client_mock"
"github.com/creekorful/bathyscaphe/internal/event"
"github.com/creekorful/bathyscaphe/internal/event_mock"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/creekorful/bathyscaphe/internal/process_mock"
"github.com/creekorful/bathyscaphe/internal/test"
"github.com/golang/mock/gomock"
"hash/fnv"
"strconv"

@ -1,8 +1,8 @@
package test
import (
"github.com/creekorful/trandoshan/internal/process"
"github.com/creekorful/trandoshan/internal/process_mock"
"github.com/creekorful/bathyscaphe/internal/process"
"github.com/creekorful/bathyscaphe/internal/process_mock"
"github.com/golang/mock/gomock"
"reflect"
"testing"

@ -1,13 +0,0 @@
#!/bin/bash
# set image tag if provided
tag="latest"
if [ "$1" ]; then
tag="$1"
fi
# build docker images
for path in build/docker/Dockerfile.*; do
name=$(echo "$path" | cut -d'.' -f2)
docker build . -f "$path" -t "creekorful/$name:$tag"
done

@ -1,13 +0,0 @@
#!/bin/bash
# set image tag if provided
tag="latest"
if [ "$1" ]; then
tag="$1"
fi
# push docker images
for path in build/docker/Dockerfile.*; do
name=$(echo "$path" | cut -d'.' -f2)
docker push "creekorful/$name:$tag"
done

@ -1,33 +0,0 @@
#!/bin/bash
# make sure we have passed a tag as version
if [ "$1" ]; then
tag="$1"
else
echo "correct usage ./release.sh <tag>"
exit 1
fi
# create signed tag
git tag -s "v$tag" -m "Release $tag"
# build the docker images
./scripts/build.sh "$tag" # create version tag
./scripts/build.sh # create latest tag
echo ""
echo ""
echo "Release $tag is ready!"
echo "Please validate the changes, and once everything is confirmed, run the following:"
echo ""
echo "Update the git repository:"
echo ""
echo "$ git push && git push --tags"
echo ""
echo "Update the docker images:"
echo ""
echo "$ ./scripts/push.sh $tag"
echo "$ ./scripts/push.sh"
echo ""
echo ""
echo "Happy hacking ;D"
Loading…
Cancel
Save