Done major code changes and reviews

pull/6/head
danieleperera 4 years ago
parent b76af6d57e
commit e2b7c8346d

.gitignore

@@ -1,3 +1,4 @@
monitoring.*
onion_master_list.*
webui
templates

@@ -1,86 +0,0 @@
# This is an example ThreatIngestor config file with some preconfigured RSS
# sources, feeding extracted artifacts into a CSV file.
general:
# Run forever, check feeds once an hour.
daemon: True
sleep: 10
onion_validation: ([a-z2-7]{16,56}\.onion)
blacklist: pedo,xxx,infant,loli,porn,child,abuse,sex,drug,cocaine,dope,zoo,daddy,daughter,boy,girl,young,murder,cocks,year,old
interestingKeywords: t.me,rss,xml,atom,dataleak,breach,blog,ransomware,source code,data breach,bank,logs,hack
elasticsearch:
index: darkweb
port : 9200
host : 127.0.0.1
sources:
# A few threat intel blogs to get you started!
- name: simple-text-file
module: simplefile
filename: onion_master_list.txt
# - name: source-gist
# module: gist
# url: https://gist.github.com/search?l=Text&q=.onion
# - name: source-reddit
# module: reddit
# url: https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000
# feed_type: messy
#
# - name: pastebin
# module: pastebin-account
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: hunchly-report
# module: gmail-hunchly
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: onionland-search
# module: collect-onions
# url: http://3bbaaaccczcbdddz.onion/discover
# feed_type: messy
#
# - name: torch
# module: collect-onions
# url: http://xmh57jrzrnw6insl.onion
# feed_type: messy
operators:
- name: simple-html
module: html
socks5:
http: 'socks5h://127.0.0.1:9050'
https: 'socks5h://127.0.0.1:9050'
TorController:
port: 9051
password: your-torcontroller-password-here
- name: simple-screenshot
module: screenshot
screenshots_path: null
- name: onionscan-go
module: onionscan
binpath: /home/tony/go/bin/onionscan
# - name: yara-rule
# module: yara
# filename: categories.yar
# base_score: 50
#
# - name: regex-match
# module: regex
# keywords: test,test2
# base_score: 20
notifiers:
# Simple telegram notifier
- name: telegram-notifier
module: telegram
chat_id:
token:

@@ -0,0 +1,106 @@
# This is an example OnionIngestor config file with some preconfigured settings.
# The elasticsearch and telegram storage engines are enabled below.
general:
# Run forever, sleeping for the configured number of seconds between iterations.
daemon: True
sleep: 10
onion_validation: ([a-z2-7]{16,56}\.onion)
blacklist: pedo,porn,child
interestingKeywords: your,keywords,here
save-thread: no # Use a separate thread to save onions
monitor:
filename: monitoring.txt
sources:
# A few threat intel blogs to get you started!
- name: simple-text-file
module: simplefile
filename: onion_master_list.txt
# - name: source-gist
# module: gist
# url: https://gist.github.com/search?l=Text&q=.onion
# - name: source-reddit
# module: reddit
# url: https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000
# feed_type: messy
#
# - name: pastebin
# module: pastebin-account
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: hunchly-report
# module: gmail-hunchly
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: onionland-search
# module: collect-onions
# url: http://3bbaaaccczcbdddz.onion/discover
# feed_type: messy
#
# - name: torch
# module: collect-onions
# url: http://xmh57jrzrnw6insl.onion
# feed_type: messy
operators:
- name: simple-html
module: html
timeout: 300
retries: 2
interestingKeywords: your,keywords,here
socks5:
http: 'socks5h://127.0.0.1:9050'
https: 'socks5h://127.0.0.1:9050'
TorController:
port: 9051
password: your-tor-controller-password
- name: onionscan-go
module: onionscan
binpath: your-onionscan-binary-path
- name: simple-screenshot
module: screenshot
screenshots_path: null
# - name: yara-rule
# module: yara
# filename: categories.yar
# base_score: 50
#
# - name: regex-match
# module: regex
# keywords: test,test2
# base_score: 20
database_Engines:
- name: telegram-notifier # Simple Telegram notifier
module: telegram
chat_id: your-telegram-chat-id
token: your-telegram-token
- name: elasticsearch
module: elasticsearch
index: your-index-name
port : 9200
host : 127.0.0.1
# - name: email
# module: send_email
# alert: no # Enable/disable email alerts
# from: alert@example.com
# to: alert@example.com
# server: 127.0.0.1 # Address of the server (hostname or IP)
# port: 25 # Outgoing SMTP port: 25, 587, ...
# tls: no # Enable/disable tls support
# username: '' # (optional) Username for authentication. Leave blank for no authentication.
# password: '' # (optional) Password for authentication. Leave blank for no authentication.
# subject: '[onioningestor] - {subject}'
# size-limit: 1048576 # Size limit for a pastie; above it, the pastie is sent as an attachment
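
For reference, loading this config is just a PyYAML call; a minimal sketch, assuming the file above is saved as config.yml next to the script (the filename is illustrative):

import re
import yaml

# Load the example config and compile the onion-matching regex.
with open('config.yml') as f:
    config = yaml.safe_load(f)

onion_re = re.compile(config['general']['onion_validation'])
blacklist = config['general']['blacklist'].split(',')

print(onion_re.findall('found 3g2upl4pq6kufc4m.onion in a paste'))
# -> ['3g2upl4pq6kufc4m.onion']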

@@ -1,12 +1,15 @@
import sys
import time
import queue
import traceback
import threading
import collections
from queue import Queue
from . import config
from . import dbhandler
from . import loghandler
from onioningestor.databases import StorageDispatcher, StorageThread, StorageSync
class Ingestor:
"""ThreatIngestor main work logic.
@@ -18,37 +21,51 @@ class Ingestor:
# Load logger
log = loghandler.LoggerHandler(args.logLevel)
self.logger = log.start_logging()
# Load config
self.config = config.Config(args.configFile, self.logger)
self.blacklist = self.config.blacklist()
# Load Elasticsearch.
try:
self.es = dbhandler.DbHandlerElasticSearch(
self.config.elasticsearch(),
self.logger)
except Exception as e:
# Error loading elasticsearch.
self.logger.error(e)
self.logger.debug(traceback.print_exc())
sys.exit(1)
# Create Queues
self.queue = self.config.monitorQueue()
# Decide whether to save onions asynchronously or synchronously
self.save_thread = self.config.save_thread()
# Instantiate plugins.
try:
self.logger.info("Initializing sources")
self.sources = {name: source(self.logger, **kwargs)
for name, source, kwargs in self.config.sources()}
# Track some statistics about artifacts in a summary object.
self.summary = collections.Counter()
self.logger.info("initializing operators")
self.operators = {name: operator(self.logger, self.es, self.blacklist, **kwargs)
# Threads
self.threads = []
try:
# Load Storage Engines - ElasticSearch, Telegram, Twitter etc
self.storage = StorageDispatcher(self.logger)
for name, db, kwargs in self.config.database_engines():
# start the threads handling database storage if needed
if self.save_thread:
self.logger.debug(f"Starting daemon thread for {str(db)}")
t = StorageThread(self.logger, db(self.logger, **kwargs))
self.threads.append(t)
# register the thread so the dispatcher can hand onions to it
self.storage.add_storage(t)
t.daemon = True
t.start()
# save onions synchronously
else:
s = StorageSync(db(self.logger, **kwargs))
self.storage.add_storage(s)
if self.save_thread:
self.logger.info("Onions will be saved asynchronously")
else:
self.logger.info("Onions will be saved synchronously")
# Instantiate operator plugins.
self.logger.debug("initializing operators")
self.operators = {name: operator(self.logger, **kwargs)
for name, operator, kwargs in self.config.operators()}
self.logger.info("initializing notifiers")
#self.notifiers = {name: operator(**kwargs)
# for name, operator, kwargs in self.config.notifiers()}
except Exception as e:
# Error loading elasticsearch.
# Error loading or starting plugins.
self.logger.error(e)
self.logger.debug(traceback.print_exc())
sys.exit(1)
@@ -56,51 +73,54 @@ class Ingestor:
def run(self):
"""Run once, or forever, depending on config."""
if self.config.daemon():
self.logger.info("Running forever, in a loop")
self.run_forever()
else:
self.logger.info("Running once, to completion")
self.run_once()
self.run_once()
#if self.config.daemon():
# self.logger.info("Running forever, in a loop")
# self.run_forever()
#else:
# self.logger.info("Running once, to completion")
# self.run_once()
def run_once(self):
"""Run each source once, passing artifacts to each operator."""
# Track some statistics about artifacts in a summary object.
summary = collections.Counter()
for source in self.sources:
# Run the source to collect artifacts.
self.logger.info(f"Running source '{source}'")
try:
# get the generator of onions
onions = self.sources[source].run()
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
# Process onions with each operator.
for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'")
# Start collecting sources
self.collect_sources()
# Sources fill several queues, consumed in priority order:
# - monitor queue: onions being monitored (high priority)
# - onion queue: onions found on the clearnet (medium priority)
# - crawl queue: onions found while crawling onion links (low priority)
onions = list(self.queue.queue)
done = False
if onions:
while not done:
try:
self.operators[operator].process(onions)
# Save the source onion with collected data
## Process onions with each operator.
for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'")
# Process list of onions
self.operators[operator].process(onions)
done = True
## Save Onions for each storage
for onion in onions:
self.storage.save_pastie(onion[1], 30)
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
# # Record stats and update the summary.
# types = artifact_types(doc.get('interestingKeywords'))
# summary.update(types)
# for artifact_type in types:
# self.logger.info(f'types[artifact_type]')
# Log the summary.
self.logger.info(f"New artifacts: {dict(summary)}")
break
except KeyboardInterrupt:
print('')
self.logger.info("Ctrl-c received! Sending kill to threads...")
for t in self.threads:
t.kill_received = True
self.logger.info('Exiting')
sys.exit(0)
else:
for t in self.threads:
t.kill_received = True
self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
time.sleep(self.config.sleep())
def run_forever(self):
@@ -108,20 +128,22 @@ class Ingestor:
while True:
self.run_once()
self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
time.sleep(self.config.sleep())
def artifact_types(artifact_list):
"""Return a dictionary with counts of each artifact type."""
types = {}
for artifact in artifact_list:
artifact_type = artifact.__class__.__name__.lower()
if artifact_type in types:
types[artifact_type] += 1
else:
types[artifact_type] = 1
return types
def collect_sources(self):
self.logger.debug("Initializing sources")
for name, collect, kwargs in self.config.sources():
# Run the source to collect onion links from clear net.
self.logger.info(f"Running source '{name}'")
try:
# get the generator of onions
source = collect(self.logger, **kwargs)
source.set_onionQueue(self.queue) #priority 2
t = source.run()
self.threads.append(t)
#self.logger.info(f'Starting of thread: {t.currentThread().name}')
#t.start()
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
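
The save-thread switch in the constructor above reduces to a standard worker-queue pattern; a sketch under stated assumptions (DummyStorage is a stand-in for a real engine and is not part of the commit):

import threading
from queue import Queue, Empty

class DummyStorage:
    """Stand-in for a real storage engine (illustrative only)."""
    name = 'dummy'
    def save_pastie(self, pastie):
        print(f'{self.name}: saved {pastie}')

# save-thread: yes -> each engine gets a daemon thread fed by a queue;
# save-thread: no  -> onions are saved inline, in the ingestor's thread.
storage, q = DummyStorage(), Queue()

def worker():
    while True:
        try:
            pastie = q.get(True, 5)
        except Empty:
            continue
        storage.save_pastie(pastie)
        q.task_done()

threading.Thread(target=worker, daemon=True).start()
q.put('example.onion')
q.join()  # block until the worker has drained the queue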

@@ -38,7 +38,7 @@ from onioningestor import Ingestor
# Load arguments from user
parser = argparse.ArgumentParser(
prog='onionscraper',
prog='onioningestor',
description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-c', '--config',dest="configFile", required = True, help='Path to config file')
parser.add_argument("--log", dest="logLevel",default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the logging level, default is INFO")

@@ -1,22 +1,18 @@
import io
import importlib
import traceback
from queue import PriorityQueue
from collections import namedtuple
import yaml
from pathlib import Path
from onioningestor.onion import Onion
SOURCE = "onioningestor.sources"
OPERATOR = "onioningestor.operators"
INTERNAL_OPTIONS = [
"saved_state",
"module",
"credentials",
]
ARTIFACT_TYPES = "artifact_types"
FILTER_STRING = "filter"
ALLOWED_SOURCES = "allowed_sources"
DATABASE_ENGINE = "onioningestor.databases"
NAME = "name"
@@ -43,16 +39,15 @@ class Config:
module = importlib.import_module(".".join([plugin_type, plugin]))
return module.Plugin
except Exception as e:
print(e)
print(traceback.print_exc())
self.logger.error(e)
self.logger.debug(traceback.print_exc())
def daemon(self):
"""Returns boolean, are we daemonizing?"""
return self.config["general"]["daemon"]
def elasticsearch(self):
"""Returns the elasticsearch config"""
return self.config["general"]["elasticsearch"]
def save_thread(self):
return self.config["general"].get("save-thread", False)
def sleep(self):
"""Returns number of seconds to sleep between iterations, if daemonizing."""
@@ -61,36 +56,53 @@ class Config:
def blacklist(self):
return self.config["general"]["blacklist"].split(",")
# def onionscanner(self):
# """Returns onionscanner config dict"""
# screenshots = self.config['onionscanner'].pop('screenshots_path', None)
# if screenshots:
# self.config['onionscanner']['screenshots_path'] = Path(screenshots)
# else:
# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
# blacklist = self.config['onionscanner'].pop('blacklist', None)
# if blacklist:
# self.config['onionscanner']['blacklist'] = blacklist.split(',')
# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
# if interestingKeywords:
# self.config['onionscanner']['interestingKeywords'] = interestingKeywords.split(',')
# return self.config['onionscanner']
def notifiers(self):
"""Returns notifiers config dictionary."""
return self.config.get("notifiers", {})
def monitorQueue(self):
fp = self.config["monitor"].get("filename", False)
q = PriorityQueue(maxsize=0)
if fp:
with open(fp, 'r') as f:
monitorOnions = f.read().splitlines()
for monitor in monitorOnions:
q.put((
1,
Onion(
url=monitor,
source='monitor',
type='domain',
status='offline',
monitor=True,
denylist=False)))
return q
else:
return None
def logging(self):
"""Returns logging config dictionary."""
return self.config.get("logging", {})
def credentials(self, credential_name):
"""Return a dictionary with the specified credentials."""
for credential in self.config["credentials"]:
for key, value in credential.items():
if key == NAME and value == credential_name:
return credential
return {}
def database_engines(self):
"""Return a list of (name, Engine class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError
"""
engines = []
for engine in self.config["database_Engines"]:
kwargs = {}
for key, value in engine.items():
kwargs[key] = value
# load and initialize the plugin
self.logger.debug(f"Found database engine '{engine[NAME]}'")
kwargs.pop('module',None)
engines.append(
(
engine[NAME],
self._load_plugin(DATABASE_ENGINE, engine["module"]),
kwargs
)
)
self.logger.debug(f"Found {len(engines)} total database engines")
return engines
def sources(self):
"""Return a list of (name, Source class, {kwargs}) tuples.
@@ -101,25 +113,19 @@ class Config:
for source in self.config["sources"]:
kwargs = {}
for key, value in source.items():
if key not in INTERNAL_OPTIONS:
kwargs[key] = value
elif key == "credentials":
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
kwargs[key] = value
# load and initialize the plugin
self.logger.info(f"Found source '{source[NAME]}'")
self.logger.debug(f"Found source '{source[NAME]}'")
kwargs.pop('module',None)
sources.append(
(source[NAME], self._load_plugin(SOURCE, source["module"]), kwargs)
(
source[NAME],
self._load_plugin(SOURCE, source["module"]),
kwargs
)
)
self.logger.info(f"Found {len(sources)} total sources")
self.logger.debug(f"Found {len(sources)} total sources")
return sources
def operators(self):
@@ -130,44 +136,10 @@ class Config:
for operator in self.config["operators"]:
kwargs = {}
for key, value in operator.items():
if key not in INTERNAL_OPTIONS:
if key == ARTIFACT_TYPES:
# parse out special artifact_types option
artifact_types = []
for artifact in value:
try:
artifact_types.append(
threatingestor.artifacts.STRING_MAP[
artifact.lower().strip()
]
)
except KeyError:
# ignore invalid artifact types
pass
kwargs[key] = artifact_types
elif key == FILTER_STRING:
# pass in special filter_string option
kwargs["filter_string"] = value
elif key == NAME:
# exclude name key from operator kwargs, since it's not used
pass
else:
kwargs[key] = value
elif key == "credentials":
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
kwargs[key] = value
# load and initialize the plugin
self.logger.info(f"Found operator '{operator[NAME]}'")
self.logger.debug(f"Found operator '{operator[NAME]}'")
kwargs.pop('module',None)
operators.append(
(
operator[NAME],
@@ -176,5 +148,5 @@ class Config:
)
)
self.logger.info(f"Found {len(operators)} total operators")
self.logger.debug(f"Found {len(operators)} total operators")
return operators
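
All three loaders above (database_engines, sources, operators) resolve their module key the same way, through _load_plugin's dynamic import; roughly:

import importlib

def load_plugin(plugin_type, plugin):
    # e.g. load_plugin('onioningestor.databases', 'telegram')
    # imports onioningestor.databases.telegram and returns its Plugin class
    module = importlib.import_module('.'.join([plugin_type, plugin]))
    return module.Plugin

# Engine = load_plugin('onioningestor.databases', 'elasticsearch')
# engine = Engine(logger, index='darkweb', port=9200, host='127.0.0.1')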

@@ -0,0 +1,149 @@
import os
import sys
import time
import logging
import traceback
import threading
from queue import Queue, Empty, Full
from datetime import datetime

import schedule
class StorageScheduler():
def __init__(self, storage, **kwargs):
self.storage = storage
self.name = self.storage.name
def save_pastie(self, pastie, timeout):
raise NotImplementedError
class StorageSync(StorageScheduler):
### synchronously save onions ###
def save_pastie(self, pastie, timeout):
self.storage.save_pastie(pastie)
# LATER: implement an async class
class StorageThread(threading.Thread, StorageScheduler):
def __init__(self, logger, storage, **kwargs):
threading.Thread.__init__(self)
StorageScheduler.__init__(self, storage, **kwargs)
self.logger = logger
try:
size = int(kwargs['queue_size'])
except Exception:
size = 0
self.queue = Queue(size)
self.kill_received = False
def run(self):
self.logger.info('{0}: Thread for saving pasties started'.format(self.name))
# loop over the queue
while not self.kill_received:
pastie = None
try:
# grab a pastie from the queue, waiting up to 5 seconds
pastie = self.queue.get(True, 5)
# save the pastie in this storage backend
self.storage.save_pastie(pastie)
except Empty:
pass
# catch unknown errors
except Exception as e:
self.logger.error("{0}: Thread for saving pasties crashed unexpectedly, recovering...: {1}".format(self.name, e))
self.logger.debug(traceback.format_exc())
finally:
if pastie is not None:
# release the reference and signal the queue that the job is done
del pastie
self.queue.task_done()
self.logger.info('{0}: Thread for saving pasties terminated'.format(self.name))
def save_pastie(self, pastie, timeout):
try:
self.logger.debug('{0}: queueing pastie {1} for saving'.format(self.name, pastie.url))
self.queue.put(pastie, True, timeout)
except Full:
self.logger.error('{0}: unable to save pastie[{1}]: queue is full'.format(self.name, pastie.url))
class StorageDispatcher():
"""The dispatcher routes onions to the right database backends.
Each database thread reads tasks off its queue and handles them."""
def __init__(self, logger):
self.logger = logger
self.__storage = []
self.lock = threading.Lock()
def add_storage(self, thread_storage):
self.__storage.append(thread_storage)
def save_pastie(self, pastie, timeout=5):
self.logger.debug('Saving to database')
for t in self.__storage:
t.save_pastie(pastie, timeout)
class PastieStorage():
def __init__(self, **kwargs):
self.lookup = kwargs.get('lookup', False)
self.name = kwargs.get('name', self.__class__.__name__)
# fall back to a module-level logger when none is supplied
self.logger = kwargs.get('logger', logging.getLogger(__name__))
try:
self.logger.debug('{0}: initializing storage backend'.format(self.name))
self.__init_storage__(**kwargs)
except Exception as e:
self.logger.error('{0}: unable to initialize storage backend: {1}'.format(self.name, e))
raise
def format_directory(self, directory):
d = datetime.now()
year = str(d.year)
month = str(d.month)
# prefix month and day with "0" if it is only one digit
if len(month) < 2:
month = "0" + month
day = str(d.day)
if len(day) < 2:
day = "0" + day
return directory + os.sep + year + os.sep + month + os.sep + day
def __init_storage__(self, **kwargs):
raise NotImplementedError
def __save_pastie__(self, pastie):
raise NotImplementedError
def save_pastie(self, pastie):
try:
start = time.time()
self.logger.debug('{0}: saving pastie[{1}]'.format(self.name, pastie.url))
self.__save_pastie__(pastie)
delta = time.time() - start
self.logger.debug('{0}: pastie[{1}] saved in {2}s'.format(self.name, pastie.url, delta))
except Exception as e:
self.logger.error('{0}: unable to save pastie[{1}]: {2}'.format(self.name, pastie.url, e))
raise
def __seen_pastie__(self, pastie_id, **kwargs):
raise NotImplementedError
def seen_pastie(self, pastie_id, **kwargs):
if not self.lookup:
return False
try:
start = time.time()
self.logger.debug('{0}: looking up pastie[{1}]'.format(self.name, pastie_id))
res = self.__seen_pastie__(pastie_id, **kwargs)
delta = time.time() - start
self.logger.debug('{0}: pastie[{1}] looked-up in {2}s'.format(self.name, pastie_id, delta))
return res
except Exception as e:
self.logger.error('{0}: unable to lookup pastie[{1}]: {2}'.format(self.name, pastie_id, e))
raise
class Notifier(object):
def __init__(self, logger, **kwargs):
self.logger = logger
def send(self):
raise NotImplementedError()
def scheduledEvery(self, time="10:30"):
self.logger.info(f'Scheduled task every day at {time}')
schedule.every().day.at(time).do(self.send)
schedule.run_pending()
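
format_directory above zero-pads months and days so date-based paths sort lexicographically; the same logic as a self-contained sketch:

import os
from datetime import datetime

def format_directory(directory, d=None):
    # zero-pad month and day so paths sort lexicographically
    d = d or datetime.now()
    return os.path.join(directory, str(d.year), f'{d.month:02d}', f'{d.day:02d}')

print(format_directory('screenshots', datetime(2020, 7, 8)))
# -> screenshots/2020/07/08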

@@ -3,11 +3,14 @@ import traceback
from elasticsearch import Elasticsearch, helpers
class DbHandlerElasticSearch:
def __init__(self, config, logger):
from onioningestor.databases import PastieStorage
class Plugin(PastieStorage):
def __init__(self, logger, **kwargs):
self.name = kwargs.get('name')
self.logger = logger
self.logger.info('Creating Elasticsearch mapping')
self.config = config
self.config = kwargs
self.mapping = """
{
"mappings": {
@@ -74,16 +77,7 @@
else:
self.logger.error(status)
def update(self, _id, data):
if _id and data:
self.es.update(
index=self.index,
id=_id,
body={"doc":data})
self.count()
def save(self, data):
if data:
status = self.es.index(index=self.index,body=data)
def __save_pastie__(self, onion):
if onion:
status = self.es.index(index=self.index,body=onion.asdict())
self.count()
return status

@@ -0,0 +1,33 @@
import sys
import requests
from onioningestor.databases import PastieStorage
class Plugin(PastieStorage):
def __init__(self, logger, **kwargs):
# kwargs = {'name': 'telegram-notifier', 'chat_id': 111111, 'token': 'XXXX'}
self.name = kwargs.get('name')
self.logger = logger
self.token = kwargs.get('token')
self.chat_id = kwargs.get('chat_id')
def __save_pastie__(self, pastie):
message = '''
HiddenSite: {site}
Source : {url}
Monitor : {content}
Status : {status}
'''.format(
site=pastie.url,
url=pastie.source,
content=pastie.monitor,
status=pastie.status)
url = 'https://api.telegram.org/bot{0}/sendMessage'.format(self.token)
try:
self.logger.debug('Sending message to telegram {} for pastie {}'.format(url, pastie))
requests.post(url, data={'chat_id': self.chat_id, 'text':message})
except Exception as e:
self.logger.warning("Failed to alert through telegram: {0}".format(e))

@@ -1,8 +0,0 @@
class notifier(object):
def __init__(self):
pass
def scheduledEvery(self, time, summary):
pass

@@ -0,0 +1,42 @@
from datetime import datetime as dt
class Onion(object):
"""Onion class"""
def __init__(self, url, source, type, status, monitor, denylist):
self.url = url
self.source = source
self.type = type
self.status = status
self.monitor = monitor
self.denylist = denylist
self.datetime = dt.utcnow()
def simpleHTML(self, response):
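# note: the assignment below replaces this bound method with the raw response value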
self.simpleHTML = response
# if any blacklist keyword matches, operators update self.denylist
def onionscan(self, response):
self.onionscan = response
def asdict(self):
d = {
'hiddenService':self.url,
'source':self.source,
'type':self.type,
'status':self.status,
'monitor': self.monitor,
'denylist': self.denylist,
'dateFound': self.datetime.strftime("%Y-%m-%dT%H:%M:%S.%f")+"Z",
'simpleHTML': self.simpleHTML,
'onionscan':self.onionscan
}
return d
def __lt__(self, other):
return self.datetime < other.datetime
def __str__(self):
return self.url
def __repr__(self):
return self.url
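
The __lt__ method is what lets Onion objects ride inside (priority, onion) tuples on a PriorityQueue: when two entries share a priority, Python compares the onions themselves, oldest first. A quick sketch:

from queue import PriorityQueue
from datetime import datetime as dt

class Onion:
    def __init__(self, url):
        self.url = url
        self.datetime = dt.now()
    def __lt__(self, other):
        # tie-breaker for PriorityQueue entries with equal priority
        return self.datetime < other.datetime

q = PriorityQueue()
q.put((2, Onion('aaa.onion')))  # found on the clearnet: medium priority
q.put((1, Onion('bbb.onion')))  # monitored onion: high priority
q.put((1, Onion('ccc.onion')))  # equal priority: __lt__ breaks the tie
print(q.get()[1].url)  # -> bbb.onion (the older of the two priority-1 entries)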

@@ -1,10 +1,13 @@
import re
import sys
import json
from queue import Queue
from itertools import islice
from datetime import datetime as dt
from concurrent.futures import ThreadPoolExecutor
from collections import namedtuple
class Operator:
"""Base class for all Operator plugins.
@@ -20,7 +23,7 @@
override other existing methods from this class.
"""
def __init__(self, logger, elasticsearch, allowed_sources=None):
def __init__(self, logger, allowed_sources=None):
"""Override this constructor in child classes.
The arguments above (artifact_types, filter_string, allowed_sources)
@@ -44,10 +47,21 @@
classes. Remember to do so *before* setting any default artifact_types.
"""
self.logger = logger
self.blacklist = re.compile('|'.join([re.escape(word) for word in allowed_sources]), re.IGNORECASE)
self.es = elasticsearch
self.processQueue = Queue()
self.onions = {}
self.onion = namedtuple('onion',['url','source','type','index','monitor','denylist'])
deny = allowed_sources or []
self.blacklist = re.compile('|'.join([re.escape(word) for word in deny]), re.IGNORECASE)
def handle_onion(self, url):
"""Override with the same signature.
:param artifact: A single ``Artifact`` object.
:returns: None (always ignored)
"""
raise NotImplementedError()
def response(self, content, onion, operator_name):
def response(self, status, content):
"""
status: success/failure
content: dict
@@ -55,50 +69,24 @@
return: dict
"""
try:
return {operator_name: content, 'hiddenService': onion}
#except TypeError:
# return {operator_name: None, 'hiddenService': onion}
return {'status':status, 'content': content}
except Exception as e:
self.logger.error(e)
def handle_onion(self, db, url):
"""Override with the same signature.
:param artifact: A single ``Artifact`` object.
:returns: None (always ignored)
"""
raise NotImplementedError()
def _onion_is_allowed(self, response, db, type='URL'):
def _onion_is_allowed(self, content, onion):
"""Returns True if this is allowed by this plugin's filters."""
# Must be in allowed_sources, if set.
if type == 'URL':
blacklist = self.blacklist.findall(response['hiddenService'])
elif type == 'HTML':
blacklist = self.blacklist.findall(response['simple-html']['HTML'])
blacklist = self.blacklist.findall(content)
if blacklist:
response['simple-html'].pop('status')
response['simple-html']['status'] = 'blocked'
response['blacklist'] = list(set(blacklist))
self.es.update(db['_id'], response)
onion.denylist = list(set(blacklist))
return False
return True
def collect(self, onions):
for onion in onions:
self.logger.info(f'thread function processing {onion}')
# Add link to database
db = self.es.save({
'hiddenService':onion.url,
'monitor':'false',
'dateAdded':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'})
if self._onion_is_allowed(
self.response({'status':'blocked'},onion.url,'regex-blacklist'),
db,
type='URL'):
# Get data for current link
self.handle_onion(db, onion.url)
if onion.monitor:
self.handle_onion(onion)
else:
if self._onion_is_allowed(onion.url, onion):
self.handle_onion(onion)
def iter_batches(self, data, batch_size):
data = iter(data)
@@ -110,8 +98,11 @@
def process(self, onions):
"""Process all applicable onions."""
#print(onions)
with ThreadPoolExecutor(max_workers=10) as executor:
collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
for tasks in collect_tasks:
self.logger.info(tasks.result())
for onion in onions:
self.handle_onion(onion[1])
#self.save_pastie()
#with ThreadPoolExecutor(max_workers=1) as executor:
# collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
# for tasks in collect_tasks:
# self.logger.info(tasks.result())
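
iter_batches, whose body is cut off by the hunk above, is presumably the usual islice batching idiom (the file imports islice for it); a sketch:

from itertools import islice

def iter_batches(data, batch_size):
    # yield successive batch_size-sized lists from any iterable
    data = iter(data)
    while True:
        batch = list(islice(data, batch_size))
        if not batch:
            break
        yield batch

print(list(iter_batches(range(7), 3)))
# -> [[0, 1, 2], [3, 4, 5], [6]]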

@@ -4,6 +4,7 @@ import json
import traceback
from datetime import datetime as dt
from json.decoder import JSONDecodeError
from collections import Counter
import requests
@@ -22,10 +23,10 @@ class Plugin(Operator):
This plugin collects the HTML code from an onion link
"""
def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
self.plugin_name = "simple-html"
self.logger.info(f"Initializing {self.plugin_name}")
def __init__(self, logger, **kwargs):
super(Plugin, self).__init__(logger)
self.name = kwargs['name']
self.logger.info(f"Initializing {self.name}")
self.timeout = int(kwargs["timeout"])
self.retries = int(kwargs["retries"])
@@ -83,42 +84,24 @@ class Plugin(Operator):
result = content.text
if result:
html = BeautifulSoup(result, features="lxml")
# testing hardcoded filepath
with open(
"/home/tony/Projects/OnionScraper_v2/onion_master_list.txt",
"w",
) as fp:
for onion in re.findall("([a-z2-7]{16,56}\.onion)", result):
fp.write("%s\n" % onion)
if html:
index = {
"HTML": result,
"title": html.title.text,
"language": detect(html.text),
"date-crawled": dt.utcnow().strftime(
"%Y-%m-%dT%H:%M:%S.%f"
)
+ "Z",
"status": "success",
"interestingKeywords": list(
set(self.interesting.findall(result))
),
"interestingKeywords": Counter(self.interesting.findall(result)),
}
else:
index = {
"HTML": result,
"title": None,
"language": None,
"date-crawled": dt.utcnow().strftime(
"%Y-%m-%dT%H:%M:%S.%f"
)
+ "Z",
"status": "success",
"interestingKeywords": list(
set(self.interesting.findall(result))
),
"interestingKeywords": Counter(self.interesting.findall(result)),
}
return self.response(index, onion, self.plugin_name)
return self.response("success", index)
except requests.exceptions.ConnectionError as connection_error:
self.logger.error(f"Failed connecting to http://{url}")
self.logger.debug(connection_error)
@@ -131,10 +114,10 @@
self.renew_connection()
if retry > self.retries:
self.logger.error("[x] Max retries exceeded")
return self.response({"status": "failure"}, onion, self.plugin_name)
return self.response("failed", None)
def handle_onion(self, db, onion):
content = self.run_sessions(onion)
if content[self.plugin_name]["status"] == "success":
if self._onion_is_allowed(content, db, "HTML"):
self.es.update(db["_id"], content)
def handle_onion(self, onion):
html = self.run_sessions(onion.url)
if html['status'] == 'success':
self._onion_is_allowed(html['content']['HTML'], onion)
onion.simpleHTML(html)
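
The socks5 block from the config turns into a requests session like the one below. The socks5h scheme matters: it pushes DNS resolution through Tor, which is the only way .onion names resolve (PySocks, pinned in requirements.txt, provides the SOCKS support):

import requests

# Route both schemes through the local Tor SOCKS proxy.
session = requests.session()
session.proxies = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050',
}
# Example (requires a Tor daemon listening on port 9050):
# r = session.get('http://example.onion', timeout=300)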

@@ -1,28 +1,14 @@
import re
import os
import sys
import json
import time
import random
import traceback
import subprocess
from uuid import uuid4
from pathlib import Path
from datetime import datetime as dt
from threading import Timer
from json.decoder import JSONDecodeError
from concurrent.futures import ProcessPoolExecutor
from threading import Timer
import requests
from stem.control import Controller
from stem import Signal
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from onionscraper.operators import Operator
from onioningestor.operators import Operator
class Plugin(Operator):
@@ -32,41 +18,13 @@ class Plugin(Operator):
sending artifacts to operators.
"""
def __init__(self, logger, **kwargs):
self.name = kwargs['name']
self.logger = logger
self.logger.info('Initializing OnionScanner')
screenshots = kwargs.pop('screenshots_path', None)
if screenshots:
self.screenshots = Path(screenshots)
else:
self.screenshots = Path(__file__).parents[1]/'screenshots'
self.logger.info(f'Initializing {self.name}')
self.onionscan = kwargs['binpath']
self.timeout = int(kwargs['timeout'])
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.retries = int(kwargs['retries'])
self.headers ={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection':
'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
blacklist = kwargs['blacklist'].split(',')
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
keywords = kwargs['interestingKeywords'].split(',')
self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
self.session = self.get_tor_session()
def response(self, status, content, onion):
"""
status: success/failure
content: dict
onion: str
return: dict
"""
return {'status': status, 'data': content, 'onion': onion}
self.timeout = int(kwargs.get('timeout', 300))
# renew_connection expects a dict holding the Tor control port and password
self.torControl = {'port': 9051, 'password': 'Zue5a29v4xE6FciWpPF93rR2M2T'}
def parseDoc(self, data):
data['onionscan'].pop('simpleReport', None)
@@ -75,74 +33,17 @@ class Plugin(Operator):
data['onionscan']['crawls'] = [*crawls]
data['hiddenService'] = hiddenService
for onion in crawls.keys():
print(onion)
with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp:
fp.write("%s\n" % onion)
#q.enqueue(self.crawl, onion)
#with open('test.json', 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=4)
self.queueCrawl((
3,
self.onion(
url=onion,
source='crawled',
type='domain',
status='offline',
monitor=False,
denylist=False)))
return data
def format_directory(self, directory):
d = dt.now()
year = str(d.year)
month = str(d.month)
# prefix month and day with "0" if it is only one digit
if len(month) < 2:
month = "0" + month
day = str(d.day)
if len(day) < 2:
day = "0" + day
save_path = directory/year/month/day
if not os.path.isdir(save_path):
self.logger.info("[*] Creating directory to save screenshots")
os.makedirs(save_path)
return save_path
def take_screenshot(self, save_path, onion):
binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
fp = webdriver.FirefoxProfile()
fp.set_preference('network.proxy.type', 1)
fp.set_preference('network.proxy.socks', '127.0.0.1')
fp.set_preference('network.proxy.socks_port', 9050)
fp.set_preference('network.proxy.socks_remote_dns', True)
options = Options()
options.headless = True
driver = webdriver.Firefox(
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options,
firefox_profile=fp
)
url = 'http://' + onion
driver.get(url)
uid = str(uuid4()).split('-')[0]
filename = f"{onion}_screenshot_{uid}.png"
f_name = f"{save_path}/{filename}"
driver.save_screenshot(f_name)
driver.quit()
if os.path.isfile(f_name):
self.logger.info(f'[*] Screenshot was taken. {f_name}')
dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'
result = {'dateScreenshoted':dateScreenshoted,'filename':filename}
return self.response("success",result,onion)
else:
self.logger.error('[x] Unable to take screenshot')
return self.response("failure",None,onion)
def get_tor_session(self):
try:
s = requests.session()
s.proxies = self.proxy
s.headers.update(self.headers)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
return s
# signal TOR for a new connection
def renew_connection(self):
with Controller.from_port(port = self.torControl['port']) as controller:
@@ -154,8 +55,7 @@ class Plugin(Operator):
controller.signal(Signal.NEWNYM)
# wait for the new identity to be initialized
time.sleep(controller.get_newnym_wait())
session = self.get_tor_session()
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
self.logger.info(f"IP is {requests.get('http://httpbin.org/ip').json()['origin']}")
def handle_timeout(self, process, onion):
#
@@ -171,33 +71,6 @@ class Plugin(Operator):
self.renew_connection()
return
def run_sessions(self, onion):
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.session.get(url)
if content.status_code == 200:
result = content.json()
except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}')
result = content.text
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
finally:
if result:
return self.response("success",result,onion)
else:
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion)
def run_onionscan(self, onion):
self.logger.info("[*] Running onionscan on %s", onion)
@@ -205,7 +78,7 @@ class Plugin(Operator):
process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
# start the timer and let it run till timeout minutes
process_timer = Timer(300,self.handle_timeout,args=[process,onion])
process_timer = Timer(self.timeout,self.handle_timeout,args=[process,onion])
process_timer.start()
# wait for the onion scan results
@@ -215,47 +88,23 @@
if process_timer.is_alive():
process_timer.cancel()
try:
return self.response("success",json.loads(stdout),onion)
return self.response(
"success",
self.parseDoc(json.loads(stdout)))
except json.decoder.JSONDecodeError:
pass
return self.response(
"success",
self.parseDoc(stdout))
self.logger.info("[!!!] Process timed out for %s", onion)
return self.response("failed",stdout)
return self.response("failure",None, onion)
def handle_onion(self, onion_tuple):
onion = onion_tuple.url
self.logger.info(f'Processing {onion} with onionscan')
def handle_onion(self, onion):
try:
blacklist_URL = self.blacklist.search(onion)
if blacklist_URL:
self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
else:
self.logger.debug("[*] URL blacklist test: PASSED")
results = self.run_onionscan(onion)
if results['status'] == 'success':# and results['data']['webDetected'] == 'true':
content = self.run_sessions(onion)
if content['status'] == 'success':
blacklist_CONTENT = self.blacklist.search(content['data'])
if blacklist_CONTENT:
self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
else:
self.logger.debug("[*] CONTENT blacklist test: PASSED")
screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
self.logger.info("Indexing!")
doc = {
'onionscan':json.loads(results['data']),
'html':content['data'],
'screenshots':screenshot['data'],
'interestingKeywords':self.interestingKeywords.findall(content['data'])
}
return self.parseDoc(doc)
else:
self.logger.info(f"[x] hidden service {onion} is not active")
results = self.run_onionscan(onion.url)
onion.onionscan(results)
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
finally:
pass
#sys.exit(0)
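
run_onionscan's timeout handling is the classic Popen-plus-Timer watchdog; reduced to essentials (sleep stands in for the onionscan binary, so this sketch assumes a POSIX system):

import subprocess
from threading import Timer

def handle_timeout(process):
    # kill the child if it outlives the timeout
    process.kill()

process = subprocess.Popen(['sleep', '10'], stdout=subprocess.PIPE)
watchdog = Timer(2, handle_timeout, args=[process])
watchdog.start()
stdout = process.communicate()[0]  # returns as soon as the child exits
if watchdog.is_alive():
    # the child finished in time: disarm the watchdog
    watchdog.cancel()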

@@ -0,0 +1,261 @@
import re
import os
import sys
import json
import time
import random
import traceback
import subprocess
from uuid import uuid4
from pathlib import Path
from datetime import datetime as dt
from json.decoder import JSONDecodeError
from concurrent.futures import ProcessPoolExecutor
from threading import Timer
import requests
from stem.control import Controller
from stem import Signal
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from onioningestor.operators import Operator
class Plugin(Operator):
"""Onionscan operator.
Runs onionscan against a hidden service, fetches its HTML over Tor,
and screenshots the rendered page.
"""
def __init__(self, logger, **kwargs):
self.logger = logger
self.logger.info('Initializing OnionScanner')
screenshots = kwargs.pop('screenshots_path', None)
if screenshots:
self.screenshots = Path(screenshots)
else:
self.screenshots = Path(__file__).parents[1]/'screenshots'
self.onionscan = kwargs['binpath']
self.timeout = int(kwargs['timeout'])
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.retries = int(kwargs['retries'])
self.headers ={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection':
'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
blacklist = kwargs['blacklist'].split(',')
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
keywords = kwargs['interestingKeywords'].split(',')
self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
self.session = self.get_tor_session()
def response(self, status, content, onion):
"""
status: success/failure
content: dict
onion: str
return: dict
"""
return {'status': status, 'data': content, 'onion': onion}
def parseDoc(self, data):
data['onionscan'].pop('simpleReport', None)
crawls = data['onionscan'].pop('crawls', None)
hiddenService = data['onionscan'].pop('hiddenService', None)
data['onionscan']['crawls'] = [*crawls]
data['hiddenService'] = hiddenService
for onion in crawls.keys():
print(onion)
with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp:
fp.write("%s\n" % onion)
#q.enqueue(self.crawl, onion)
#with open('test.json', 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=4)
return data
def format_directory(self, directory):
d = dt.now()
year = str(d.year)
month = str(d.month)
# prefix month and day with "0" if it is only one digit
if len(month) < 2:
month = "0" + month
day = str(d.day)
if len(day) < 2:
day = "0" + day
save_path = directory/year/month/day
if not os.path.isdir(save_path):
self.logger.info("[*] Creating directory to save screenshots")
os.makedirs(save_path)
return save_path
def take_screenshot(self, save_path, onion):
binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
fp = webdriver.FirefoxProfile()
fp.set_preference('network.proxy.type', 1)
fp.set_preference('network.proxy.socks', '127.0.0.1')
fp.set_preference('network.proxy.socks_port', 9050)
fp.set_preference('network.proxy.socks_remote_dns', True)
options = Options()
options.headless = True
driver = webdriver.Firefox(
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options,
firefox_profile=fp
)
url = 'http://' + onion
driver.get(url)
uid = str(uuid4()).split('-')[0]
filename = f"{onion}_screenshot_{uid}.png"
f_name = f"{save_path}/{filename}"
driver.save_screenshot(f_name)
driver.quit()
if os.path.isfile(f_name):
self.logger.info(f'[*] Screenshot was taken. {f_name}')
dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'
result = {'dateScreenshoted':dateScreenshoted,'filename':filename}
return self.response("success",result,onion)
else:
self.logger.error('[x] Unable to take screenshot')
return self.response("failure",None,onion)
def get_tor_session(self):
try:
s = requests.session()
s.proxies = self.proxy
s.headers.update(self.headers)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
return s
# signal TOR for a new connection
def renew_connection(self):
with Controller.from_port(port = self.torControl['port']) as controller:
# Now we switch TOR identities to make sure we have a good connection
self.logger.info('Getting new Tor IP')
# authenticate to our local TOR controller
controller.authenticate(self.torControl['password'])
# send the signal for a new identity
controller.signal(Signal.NEWNYM)
# wait for the new identity to be initialized
time.sleep(controller.get_newnym_wait())
session = self.get_tor_session()
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
def handle_timeout(self, process, onion):
#
# Handle a timeout from the onionscan process.
#
try:
# kill the onionscan process
process.kill()
self.logger.info("[!!!] Killed the onionscan process.")
except Exception:
pass
self.renew_connection()
return
def run_sessions(self, onion):
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.session.get(url)
if content.status_code == 200:
result = content.json()
except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}')
result = content.text
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
finally:
if result:
return self.response("success",result,onion)
else:
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion)
def run_onionscan(self, onion):
self.logger.info("[*] Running onionscan on %s", onion)
# fire up onionscan
process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
# start the timer and let it run till timeout minutes
process_timer = Timer(self.timeout,self.handle_timeout,args=[process,onion])
process_timer.start()
# wait for the onion scan results
stdout = process.communicate()[0]
# we have received valid results so we can kill the timer
if process_timer.is_alive():
process_timer.cancel()
try:
return self.response("success",json.loads(stdout),onion)
except json.decoder.JSONDecodeError:
pass
self.logger.info("[!!!] Process timed out for %s", onion)
return self.response("failure",None, onion)
def handle_onion(self, onion_tuple):
onion = onion_tuple.url
self.logger.info(f'Processing {onion} with onionscan')
try:
blacklist_URL = self.blacklist.search(onion)
if blacklist_URL:
self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
else:
self.logger.debug("[*] URL blacklist test: PASSED")
results = self.run_onionscan(onion)
if results['status'] == 'success':# and results['data']['webDetected'] == 'true':
content = self.run_sessions(onion)
if content['status'] == 'success':
blacklist_CONTENT = self.blacklist.search(content['data'])
if blacklist_CONTENT:
self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
else:
self.logger.debug("[*] CONTENT blacklist test: PASSED")
screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
self.logger.info("Indexing!")
doc = {
'onionscan':json.loads(results['data']),
'html':content['data'],
'screenshots':screenshot['data'],
'interestingKeywords':self.keywords.findall(content['data'])
}
return self.parseDoc(doc)
else:
self.logger.info(f"[x] hidden service {onion} is not active")
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
finally:
pass
#sys.exit(0)
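
renew_connection in both operator files drives Tor through stem's control protocol; the minimal exchange, assuming a ControlPort on 9051 with password authentication as in the example config:

import time
from stem import Signal
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate(password='your-tor-controller-password')
    controller.signal(Signal.NEWNYM)          # ask Tor for a fresh circuit
    time.sleep(controller.get_newnym_wait())  # honor Tor's NEWNYM rate limit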

@@ -1,5 +1,5 @@
from onionscraper.operators import Operator
from onioningestor.operators import Operator
class Plugin(Operator):

@@ -1,4 +1,5 @@
from collections import namedtuple
from onioningestor.onion import Onion
class Source(object):
"""Base class for all Source plugins.
@@ -7,14 +8,16 @@ class Source(object):
additional methods to child classes, consider prefixing the method name
with an underscore to denote a ``_private_method``.
"""
def __init__(self, name, *args, **kwargs):
def __init__(self, *args, **kwargs):
"""Override this constructor in child classes.
The first argument must always be ``name``.
Other arguments should be url, auth, etc., whatever is needed to set
up the object.
"""
self.onion = namedtuple('onion', ['url','source','type'])
self.onion = Onion
def set_onionQueue(self, queue):
self.onionQueue = queue
def run(self):
"""Run and return ``(saved_state, list(Artifact))``.
@@ -28,14 +31,3 @@
"""
raise NotImplementedError()
def process_element(self, content, reference_link, include_nonobfuscated=False):
"""Take a single source content/url and return a list of Artifacts.
This is the main work block of Source plugins, which handles
IOC extraction and artifact creation.
:param content: String content to extract from.
:param reference_link: Reference link to attach to all artifacts.
:param include_nonobfuscated: Include non-defanged URLs in output?
"""
logger.debug(f"Processing in source '{self.name}'")
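
Under the new queue-based contract, a minimal Source plugin only has to construct Onion objects and put them on the queue handed over by set_onionQueue; a hypothetical example (not part of this commit):

from onioningestor.onion import Onion
from onioningestor.sources import Source

class Plugin(Source):
    """Hypothetical source that feeds a fixed list of onions."""
    def __init__(self, logger, name, **kwargs):
        self.logger = logger
        self.name = name

    def run(self):
        for url in ['exampleexample22.onion']:
            # priority 2: onions found on the clearnet
            self.onionQueue.put((2, Onion(
                url=url, source=self.name, type='domain',
                status='offline', monitor=False, denylist=False)))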

@@ -17,7 +17,7 @@ import time
from bs4 import BeautifulSoup
from onionscraper.sources import Source
from onioningestor.sources import Source
class Plugin(Source):

@@ -17,7 +17,7 @@ import time
from bs4 import BeautifulSoup
from onionscraper.sources import Source
from onioningestor.sources import Source
class Plugin(Source):

@@ -16,7 +16,7 @@ from onioningestor.sources import Source
class Plugin(Source):
def __init__(self, logger, name, filename):
def __init__(self, logger, name, filename, **kwargs):
self.logger = logger
self.name = name
self.filename = filename
@@ -28,10 +28,17 @@ class Plugin(Source):
filepath = Path(__file__).parents[2]/self.filename
with open(filepath, 'r') as fp:
lines = fp.read().splitlines()
# just testing: note this deletes the source file after reading it
os.remove(self.filename)
for onion in lines:
items.append(self.onion(url=onion,source='simple-file',type='domain'))
#yield self.onion(url=onion,source='simple-file',type='domain')
return items
self.onionQueue.put(
(
2,
self.onion(
url=onion,
source=self.name,
type='domain',
status='offline',
monitor=False,
denylist=False)
)
)

@@ -4,11 +4,14 @@ chardet==3.0.4
click==7.1.2
elasticsearch==7.8.0
idna==2.10
langdetect==1.0.8
lxml==4.5.1
PySocks==1.7.1
PyYAML==5.3.1
requests==2.24.0
schedule==0.6.0
selenium==3.141.0
six==1.15.0
soupsieve==2.0.1
stem==1.8.0
urllib3==1.25.9