diff --git a/.gitignore b/.gitignore index c252f26..930836a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +monitoring.* onion_master_list.* webui templates diff --git a/example.yml b/example.yml deleted file mode 100644 index f53a634..0000000 --- a/example.yml +++ /dev/null @@ -1,86 +0,0 @@ -# This is an example ThreatIngestor config file with some preconfigured RSS -# sources, feeding extracted artifacts into a CSV file. - -general: - # Run forever, check feeds once an hour. - daemon: True - sleep: 10 - onion_validation: ([a-z2-7]{16,56}\.onion) - blacklist: pedo,xxx,infant,loli,porn,child,abuse,sex,drug,cocaine,dope,zoo,daddy,daughter,boy,girl,young,muder,cocks,year,old - interestingKeywords: t.me,rss,xml,atom,dataleak,breach,blog,ransomware,source code,data breach,bank,logs,hack - elasticsearch: - index: darkweb - port : 9200 - host : 127.0.0.1 - -sources: - # A few threat intel blogs to get you started! - - name: simple-text-file - module: simplefile - filename: onion_master_list.txt - - # - name: source-gist - # module: gist - # url: https://gist.github.com/search?l=Text&q=.onion - - # - name: source-reddit - # module: reddit - # url: https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000 - # feed_type: messy - # - # - name: pastebin - # module: pastebin-account - # url: https://gist.github.com/search?l=Text&q=.onion - # feed_type: messy - # - # - name: hunchly-report - # module: gmail-hunchly - # url: https://gist.github.com/search?l=Text&q=.onion - # feed_type: messy - # - # - name: onionland-search - # module: collect-onions - # url: http://3bbaaaccczcbdddz.onion/discover - # feed_type: messy - # - # - name: torch - # module: collect-onions - # url: http://xmh57jrzrnw6insl.onion - # feed_type: messy - - -operators: - - name: simple-html - module: html - socks5: - http: 'socks5h://127.0.0.1:9050' - https: 'socks5h://127.0.0.1:9050' - TorController: - port: 9051 - password: your-torcontroller-password-here - - - name: 
simple-screenshot - module: screenshot - screenshots_path: null - - - name: onionscan-go - module: onionscan - binpath: /home/tony/go/bin/onionscan - - - # - name: yara-rule - # module: yara - # filename: categories.yar - # base_score: 50 - # - # - name: regex-match - # module: regex - # keywords: test,test2 - # base_score: 20 - -notifiers: - # Simple telegram notifier - - name: telegram-notifer - module: telegram - chat_id: - token: diff --git a/onioningestor.yml b/onioningestor.yml new file mode 100644 index 0000000..8ecdedc --- /dev/null +++ b/onioningestor.yml @@ -0,0 +1,106 @@ +# This is an example OnionIngestor config file with some preconfigured configurations +# Storage Engines elasticsearch and telegram are configured + +general: + # Run forever, check feeds once an hour. + daemon: True + sleep: 10 + onion_validation: ([a-z2-7]{16,56}\.onion) + blacklist: pedo,porn,child + interestingKeywords: your,keywords,here + save-thread: no # Use a separate thread to save onions + +monitor: + filename: monitoring.txt + +sources: + # A few threat intel blogs to get you started! 
+ - name: simple-text-file + module: simplefile + filename: onion_master_list.txt + + # - name: source-gist + # module: gist + # url: https://gist.github.com/search?l=Text&q=.onion + + # - name: source-reddit + # module: reddit + # url: https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000 + # feed_type: messy + # + # - name: pastebin + # module: pastebin-account + # url: https://gist.github.com/search?l=Text&q=.onion + # feed_type: messy + # + # - name: hunchly-report + # module: gmail-hunchly + # url: https://gist.github.com/search?l=Text&q=.onion + # feed_type: messy + # + # - name: onionland-search + # module: collect-onions + # url: http://3bbaaaccczcbdddz.onion/discover + # feed_type: messy + # + # - name: torch + # module: collect-onions + # url: http://xmh57jrzrnw6insl.onion + # feed_type: messy + + +operators: + - name: simple-html + module: html + timeout: 300 + retries: 2 + interestingKeywords: your,keywords,here + socks5: + http: 'socks5h://127.0.0.1:9050' + https: 'socks5h://127.0.0.1:9050' + TorController: + port: 9051 + password: your-tor-controller-password + + - name: onionscan-go + module: onionscan + binpath: your-onionscan-binary-path + + - name: simple-screenshot + module: screenshot + screenshots_path: null + + # - name: yara-rule + # module: yara + # filename: categories.yar + # base_score: 50 + # + # - name: regex-match + # module: regex + # keywords: test,test2 + # base_score: 20 + +database_Engines: + - name: telegram-notifer #Simple Telegram notifier + module: telegram + chat_id: your-telegram-chat-id + token: your-telegram-token + + - name: elasticsearch + module: elasticsearch + index: your-index-name + port : 9200 + host : 127.0.0.1 + + # - name: email + # module: send_email + # alert: no # Enable/disable email alerts + # from: alert@example.com + # to: alert@example.com + # server: 127.0.0.1 # Address of the server (hostname or IP) + # port: 25 # Outgoing SMTP port: 25, 587, ... 
+ # tls: no # Enable/disable tls support + # username: '' # (optional) Username for authentication. Leave blank for no authentication. + # password: '' # (optional) Password for authentication. Leave blank for no authentication. + # subject: '[onioningestor] - {subject}' + # size-limit: 1048576 # Size limit for pastie, above it's sent as attachement diff --git a/onioningestor/__init__.py b/onioningestor/__init__.py index fbee85b..1d9565b 100644 --- a/onioningestor/__init__.py +++ b/onioningestor/__init__.py @@ -1,12 +1,15 @@ import sys import time +import queue import traceback +import threading import collections +from queue import Queue from . import config -from . import dbhandler from . import loghandler +from onioningestor.databases import StorageDispatcher, StorageThread, StorageSync class Ingestor: """ThreatIngestor main work logic. @@ -18,37 +21,51 @@ class Ingestor: # Load logger log = loghandler.LoggerHandler(args.logLevel) self.logger = log.start_logging() + # Load config self.config = config.Config(args.configFile, self.logger) self.blacklist = self.config.blacklist() - # Load Elasticsearch. - try: - self.es = dbhandler.DbHandlerElasticSearch( - self.config.elasticsearch(), - self.logger) - except Exception as e: - # Error loading elasticsearch. - self.logger.error(e) - self.logger.debug(traceback.print_exc()) - sys.exit(1) + # Create Queues + self.queue = self.config.monitorQueue() + # Get asynchronously o synchronously save + self.save_thread = self.config.save_thread() - # Instantiate plugins. - try: - self.logger.info("Initializing sources") - self.sources = {name: source(self.logger, **kwargs) - for name, source, kwargs in self.config.sources()} + # Track some statistics about artifacts in a summary object. 
+ self.summary = collections.Counter() - self.logger.info("initializing operators") - self.operators = {name: operator(self.logger, self.es, self.blacklist, **kwargs) + # Threads + self.threads = [] + try: + # Load Storage Engines - ElasticSearch, Telegram, Twitter etc + self.storage = StorageDispatcher(self.logger) + + for name, db, kwargs in self.config.database_engines(): + # start the threads handling database storage if needed + if self.save_thread: + self.logger.debug(f"Starting daemon thread for {str(db)}") + t = StorageThread(db(self.logger, **kwargs)) + self.threads.append(t) + t.setDaemon(True) + t.start() + # save onions synchronously + else: + s = StorageSync(db(self.logger, **kwargs)) + self.storage.add_storage(s) + + if self.save_thread: + self.logger.info("Onions will be saved asynchronously") + else: + self.logger.info("Onions will be saved synchronously") + + # Instantiate operator plugins. + self.logger.debug("initializing operators") + self.operators = {name: operator(self.logger, **kwargs) for name, operator, kwargs in self.config.operators()} - self.logger.info("initializing notifiers") - #self.notifiers = {name: operator(**kwargs) - # for name, operator, kwargs in self.config.notifiers()} except Exception as e: - # Error loading elasticsearch. + # Error loading starting plugins. 
self.logger.error(e) self.logger.debug(traceback.print_exc()) sys.exit(1) @@ -56,51 +73,54 @@ class Ingestor: def run(self): """Run once, or forever, depending on config.""" - if self.config.daemon(): - self.logger.info("Running forever, in a loop") - self.run_forever() - else: - self.logger.info("Running once, to completion") - self.run_once() + self.run_once() + #if self.config.daemon(): + # self.logger.info("Running forever, in a loop") + # self.run_forever() + #else: + # self.logger.info("Running once, to completion") + # self.run_once() def run_once(self): """Run each source once, passing artifacts to each operator.""" - # Track some statistics about artifacts in a summary object. - summary = collections.Counter() - - for source in self.sources: - # Run the source to collect artifacts. - self.logger.info(f"Running source '{source}'") - try: - # get the generator of onions - onions = self.sources[source].run() - except Exception as e: - self.logger.error(e) - self.logger.error(traceback.print_exc()) - continue - - # Process onions with each operator. - for operator in self.operators: - self.logger.info(f"Processing found onions with operator '{operator}'") + # Start collecting sources + self.collect_sources() + + # Sources will fill various queues + # MonitorQueue has priority high + # OnionQueue are those found in clearnet medium + # crawlQueue are those found crawling onionlinks low + onions = list(self.queue.queue) + done = False + if onions: + while not done: try: - self.operators[operator].process(onions) - # Save the source onion with collected data + ## Process onions with each operator. 
+ for operator in self.operators: + self.logger.info(f"Processing found onions with operator '{operator}'") + # Process list of onions + self.operators[operator].process(onions) + done = True + ## Save Onions for each storage + for onion in onions: + self.storage.save_pastie(onion[1], 30) except Exception as e: self.logger.error(e) self.logger.error(traceback.print_exc()) - continue - - - -# # Record stats and update the summary. -# types = artifact_types(doc.get('interestingKeywords')) -# summary.update(types) -# for artifact_type in types: -# self.logger.info(f'types[artifact_type]') - - # Log the summary. - self.logger.info(f"New artifacts: {dict(summary)}") + break + except KeyboardInterrupt: + print('') + self.logger.info("Ctrl-c received! Sending kill to threads...") + for t in self.threads: + t.kill_received = True + self.logger.info('Exiting') + sys.exit(0) + else: + for t in self.threads: + t.kill_received = True + self.logger.info(f"Sleeping for {self.config.sleep()} seconds") + time.sleep(self.config.sleep()) def run_forever(self): @@ -108,20 +128,22 @@ class Ingestor: while True: self.run_once() - self.logger.info(f"Sleeping for {self.config.sleep()} seconds") - time.sleep(self.config.sleep()) - - -def artifact_types(artifact_list): - """Return a dictionary with counts of each artifact type.""" - types = {} - for artifact in artifact_list: - artifact_type = artifact.__class__.__name__.lower() - if artifact_type in types: - types[artifact_type] += 1 - else: - types[artifact_type] = 1 - - return types + def collect_sources(self): + self.logger.debug("Initializing sources") + for name, collect, kwargs in self.config.sources(): + # Run the source to collect onion links from clear net. 
+ self.logger.info(f"Running source '{name}'") + try: + # get the generator of onions + source = collect(self.logger, **kwargs) + source.set_onionQueue(self.queue) #priority 2 + t = source.run() + self.threads.append(t) + #self.logger.info(f'Starting of thread: {t.currentThread().name}') + #t.start() + except Exception as e: + self.logger.error(e) + self.logger.error(traceback.print_exc()) + continue diff --git a/onioningestor/__main__.py b/onioningestor/__main__.py index e00adb5..d2b4d6c 100644 --- a/onioningestor/__main__.py +++ b/onioningestor/__main__.py @@ -38,7 +38,7 @@ from onioningestor import Ingestor # Load arguments from user parser = argparse.ArgumentParser( - prog='onionscraper', + prog='onioningestor', description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-c', '--config',dest="configFile", required = True, help='Path to config file') parser.add_argument("--log", dest="logLevel",default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the logging level, default is INFO") diff --git a/onioningestor/config.py b/onioningestor/config.py index f7cd6cc..400257f 100644 --- a/onioningestor/config.py +++ b/onioningestor/config.py @@ -1,22 +1,18 @@ import io import importlib import traceback - +from queue import PriorityQueue +from collections import namedtuple import yaml from pathlib import Path +from onioningestor.onion import Onion SOURCE = "onioningestor.sources" OPERATOR = "onioningestor.operators" -INTERNAL_OPTIONS = [ - "saved_state", - "module", - "credentials", -] -ARTIFACT_TYPES = "artifact_types" -FILTER_STRING = "filter" -ALLOWED_SOURCES = "allowed_sources" +DATABASE_ENGINE = "onioningestor.databases" + NAME = "name" @@ -43,16 +39,15 @@ class Config: module = importlib.import_module(".".join([plugin_type, plugin])) return module.Plugin except Exception as e: - print(e) - print(traceback.print_exc()) + self.logger.error(e) + self.logger.debug(traceback.print_exc()) def daemon(self): 
"""Returns boolean, are we daemonizing?""" return self.config["general"]["daemon"] - def elasticsearch(self): - """Returns elasticsaerch config""" - return self.config["general"]["elasticsearch"] + def save_thread(self): + return self.config["general"].get("save-thread", False) def sleep(self): """Returns number of seconds to sleep between iterations, if daemonizing.""" @@ -61,36 +56,53 @@ class Config: def blacklist(self): return self.config["general"]["blacklist"].split(",") - # def onionscanner(self): - # """Returns onionscanner config dict""" - # screenshots = self.config['onionscanner'].pop('screenshots_path', None) - # if screenshots: - # self.config['onionscanner']['screenshots_path'] = Path(screenshots) - # else: - # self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots' - # blacklist = self.config['onionscanner'].pop('blacklist', None) - # if blacklist: - # self.config['onionscanner']['blacklist'] = blacklist.split(',') - # interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None) - # if interestingKeywords: - # self.config['onionscanner']['interestingKeywords'] = blacklist.split(',') - # return self.config['onionscanner'] - - def notifiers(self): - """Returns notifiers config dictionary.""" - return self.config.get("notifiers", {}) + def monitorQueue(self): + fp = self.config["monitor"].get("filename", False) + q = PriorityQueue(maxsize=0) + if fp: + with open(fp, 'r') as f: + monitorOnions = f.read().splitlines() + for monitor in monitorOnions: + q.put(( + 1, + Onion( + url=monitor, + source='monitor', + type='domain', + status='offline', + monitor=True, + denylist=False))) + return q + else: + return None def logging(self): """Returns logging config dictionary.""" return self.config.get("logging", {}) - def credentials(self, credential_name): - """Return a dictionary with the specified credentials.""" - for credential in self.config["credentials"]: - for key, value in credential.items(): - if 
key == NAME and value == credential_name: - return credential - return {} + def database_engines(self): + """Return a list of (name, Source class, {kwargs}) tuples. + :raises: threatingestor.exceptions.PluginError + """ + engines = [] + + for engine in self.config["database_Engines"]: + kwargs = {} + for key, value in engine.items(): + kwargs[key] = value + # load and initialize the plugin + self.logger.debug(f"Found database engine '{engine[NAME]}'") + kwargs.pop('module',None) + engines.append( + ( + engine[NAME], + self._load_plugin(DATABASE_ENGINE, engine["module"]), + kwargs + ) + ) + + self.logger.debug(f"Found {len(engines)} total database engines") + return engines def sources(self): """Return a list of (name, Source class, {kwargs}) tuples. @@ -101,25 +113,19 @@ class Config: for source in self.config["sources"]: kwargs = {} for key, value in source.items(): - if key not in INTERNAL_OPTIONS: - kwargs[key] = value - - elif key == "credentials": - # Grab these named credentials - credential_name = value - for credential_key, credential_value in self.credentials( - credential_name - ).items(): - if credential_key != NAME: - kwargs[credential_key] = credential_value - + kwargs[key] = value # load and initialize the plugin - self.logger.info(f"Found source '{source[NAME]}'") + self.logger.debug(f"Found source '{source[NAME]}'") + kwargs.pop('module',None) sources.append( - (source[NAME], self._load_plugin(SOURCE, source["module"]), kwargs) + ( + source[NAME], + self._load_plugin(SOURCE, source["module"]), + kwargs + ) ) - self.logger.info(f"Found {len(sources)} total sources") + self.logger.debug(f"Found {len(sources)} total sources") return sources def operators(self): @@ -130,44 +136,10 @@ class Config: for operator in self.config["operators"]: kwargs = {} for key, value in operator.items(): - if key not in INTERNAL_OPTIONS: - if key == ARTIFACT_TYPES: - # parse out special artifact_types option - artifact_types = [] - for artifact in value: - try: - 
artifact_types.append( - threatingestor.artifacts.STRING_MAP[ - artifact.lower().strip() - ] - ) - except KeyError: - # ignore invalid artifact types - pass - kwargs[key] = artifact_types - - elif key == FILTER_STRING: - # pass in special filter_string option - kwargs["filter_string"] = value - - elif key == NAME: - # exclude name key from operator kwargs, since it's not used - pass - - else: - kwargs[key] = value - - elif key == "credentials": - # Grab these named credentials - credential_name = value - for credential_key, credential_value in self.credentials( - credential_name - ).items(): - if credential_key != NAME: - kwargs[credential_key] = credential_value - + kwargs[key] = value # load and initialize the plugin - self.logger.info(f"Found operator '{operator[NAME]}'") + self.logger.debug(f"Found operator '{operator[NAME]}'") + kwargs.pop('module',None) operators.append( ( operator[NAME], @@ -176,5 +148,5 @@ class Config: ) ) - self.logger.info(f"Found {len(operators)} total operators") + self.logger.debug(f"Found {len(operators)} total operators") return operators diff --git a/onioningestor/databases/__init__.py b/onioningestor/databases/__init__.py new file mode 100644 index 0000000..20f6098 --- /dev/null +++ b/onioningestor/databases/__init__.py @@ -0,0 +1,149 @@ +import sys +import time +import schedule +import threading + +class StorageScheduler(): + def __init__(self, storage, **kwargs): + self.storage = storage + self.name = self.storage.name + + def save_pastie(self, pastie, timeout): + raise NotImplementedError + + +class StorageSync(StorageScheduler): + ### synchronously save onions ### + def save_pastie(self, pastie, timeout): + self.storage.save_pastie(pastie) + + +# LATER: implement an async class +class StorageThread(threading.Thread, StorageScheduler): + def __init__(self, logger, storage, **kwargs): + threading.Thread.__init__(self) + StorageScheduler.__init__(self, storage, **kwargs) + self.logger = logger + try: + size = 
int(kwargs['queue_size']) + except Exception: + size = 0 + self.queue = Queue(size) + self.kill_received = False + + def run(self): + self.logger.info('{0}: Thread for saving pasties started'.format(self.name)) + # loop over the queue + while not self.kill_received: + # pastie = None + try: + # grabs pastie from queue + pastie = self.queue.get(True, 5) + # save the pasties in each storage + self.storage.save_pastie(pastie) + except Empty: + pass + # catch unknown errors + except Exception as e: + self.logger.error("{0}: Thread for saving pasties crashed unexpectectly, recovering...: {1}".format(self.name, e)) + self.logger.debug(traceback.format_exc()) + finally: + # to be on the safe side of gf + del(pastie) + # signals to queue job is done + self.queue.task_done() + self.logger.info('{0}: Thread for saving pasties terminated'.format(self.name)) + + def save_pastie(self, pastie, timeout): + try: + self.logger.debug('{0}: queueing pastie {1} for saving'.format(self.name, pastie.url)) + self.queue.put(pastie, True, timeout) + except Full: + self.logger.error('{0}: unable to save pastie[{1}]: queue is full'.format(self.name, pastie.url)) + + +class StorageDispatcher(): + """Dispatcher will then take care of dispatching onions to the right databases. 
+ Each database thread will read in the task and will handle it.""" + def __init__(self, logger): + self.logger = logger + self.__storage = [] + self.lock = threading.Lock() + + def add_storage(self, thread_storage): + self.__storage.append(thread_storage) + + def save_pastie(self, pastie, timeout=5): + self.logger.debug('Saving to database') + for t in self.__storage: + t.save_pastie(pastie, timeout) + +class PastieStorage(): + def __init__(self, **kwargs): + self.lookup = kwargs.get('lookup', False) + self.name = kwargs.get('name', self.__class__.__name__) + try: + self.logger.debug('{0}: initializing storage backend'.format(self.name)) + self.__init_storage__(**kwargs) + except Exception as e: + self.logger.error('{0}: unable to initialize storage backend: {1}'.format(self.name, e)) + raise + + def format_directory(self, directory): + d = datetime.now() + year = str(d.year) + month = str(d.month) + # prefix month and day with "0" if it is only one digit + if len(month) < 2: + month = "0" + month + day = str(d.day) + if len(day) < 2: + day = "0" + day + return directory + os.sep + year + os.sep + month + os.sep + day + + def __init_storage__(self, **kwargs): + raise NotImplementedError + + def __save_pastie__(self, pastie): + raise NotImplementedError + + def save_pastie(self, pastie): + try: + start = time.time() + self.logger.debug('{0}: saving pastie[{1}]'.format(self.name, pastie.url)) + self.__save_pastie__(pastie) + delta = time.time() - start + self.logger.debug('{0}: pastie[{1}] saved in {2}s'.format(self.name, pastie.url, delta)) + except Exception as e: + self.logger.error('{0}: unable to save pastie[{1}]: {2}'.format(self.name, pastie.url, e)) + raise + + def __seen_pastie__(self, pastie_id, **kwargs): + raise NotImplementedError + + def seen_pastie(self, pastie_id, **kwargs): + if not self.lookup: + return False + try: + start = time.time() + self.logger.debug('{0}: looking up pastie[{1}]'.format(self.name, pastie_id)) + res = 
self.__seen_pastie__(pastie_id, **kwargs) + delta = time.time() - start + self.logger.debug('{0}: pastie[{1}] looked-up in {2}s'.format(self.name, pastie_id, delta)) + return res + except Exception as e: + self.logger.error('{0}: unable to lookup pastie[{1}]: {2}'.format(self.name, pastie_id, e)) + raise + +class Notifier(object): + def __init__(self, logger, **kwargs): + self.logger = logger + + def send(self): + raise NotImplementedError() + + def scheduledEvery(self, time="10:30"): + self.logger.info(f'Scheduled task everyday as {time}') + schedule.every().day.at(time).do(self.send) + schedule.run_pending() + diff --git a/onioningestor/dbhandler.py b/onioningestor/databases/elasticsearch.py similarity index 83% rename from onioningestor/dbhandler.py rename to onioningestor/databases/elasticsearch.py index a8f6b7d..152de86 100644 --- a/onioningestor/dbhandler.py +++ b/onioningestor/databases/elasticsearch.py @@ -3,11 +3,14 @@ import traceback from elasticsearch import Elasticsearch, helpers -class DbHandlerElasticSearch: - def __init__(self, config, logger): +from onioningestor.databases import PastieStorage + +class Plugin(PastieStorage): + def __init__(self, logger, **kwargs): + self.name = kwargs.get('name') self.logger = logger self.logger.info('Creating Elasticsearch mapping') - self.config = config + self.config = kwargs self.mapping = """ { "mappings": { @@ -74,16 +77,7 @@ class DbHandlerElasticSearch: else: self.logger.error(status) - def update(self, _id, data): - if _id and data: - self.es.update( - index=self.index, - id=_id, - body={"doc":data}) - self.count() - - def save(self, data): - if data: - status = self.es.index(index=self.index,body=data) + def __save_pastie__(self, onion): + if onion: + status = self.es.index(index=self.index,body=onion.asdict()) self.count() - return status diff --git a/onioningestor/databases/telegram.py b/onioningestor/databases/telegram.py new file mode 100644 index 0000000..ba21682 --- /dev/null +++ 
b/onioningestor/databases/telegram.py @@ -0,0 +1,33 @@ +import sys +import requests + +from onioningestor.databases import PastieStorage + +class Plugin(PastieStorage): + + def __init__(self, logger, **kwargs): + # kwargs = {'name': 'telegram-notifer', 'chat_id': 111111, 'token': 'XXXX'} + self.name = kwargs.get('name') + self.logger = logger + self.token = kwargs.get('token') + self.chat_id = kwargs.get('chat_id') + + def __save_pastie__(self, pastie): + message = ''' +HiddenSite: {site} +Source : {url} +Monitor : {content} +Status : {status} + '''.format( + site=pastie.url, + url=pastie.source, + content=pastie.monitor, + status=pastie.status) + + url = 'https://api.telegram.org/bot{0}/sendMessage'.format(self.token) + try: + self.logger.debug('Sending message to telegram {} for pastie_id {}'.format(url, pastie)) + requests.post(url, data={'chat_id': self.chat_id, 'text':message}) + except Exception as e: + self.logger.warning("Failed to alert through telegram: {0}".format(e)) + diff --git a/onioningestor/notifiers/__init__.py b/onioningestor/notifiers/__init__.py deleted file mode 100644 index 8401ec9..0000000 --- a/onioningestor/notifiers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ - - -class notifier(object): - def __init__(self): - pass - - def scheduledEvery(self, time, summary): - pass diff --git a/onioningestor/onion.py b/onioningestor/onion.py new file mode 100644 index 0000000..6496210 --- /dev/null +++ b/onioningestor/onion.py @@ -0,0 +1,42 @@ +from datetime import datetime as dt + +class Onion(object): + """Onion class""" + def __init__(self, url, source, type, status, monitor, denylist): + self.url = url + self.source = source + self.type = type + self.status = status + self.monitor = monitor + self.denylist = denylist + self.datetime = dt.now() + + def simpleHTML(self, response): + self.simpleHTML = response + # if any match update denylist + + def onionscan(self, response): + self.onionscan = response + + def asdict(self): + d = { + 
'hiddenService':self.url, + 'source':self.source, + 'type':self.type, + 'status':self.status, + 'monitor': self.monitor, + 'denylist': self.denylist, + 'dateFound': self.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")+"Z", + 'simpleHTML': self.simpleHTML, + 'onionscan':self.onionscan + } + return d + + def __lt__(self, other): + return self.datetime < other.datetime + + def __str__(self): + return self.url + + def __repr__(self): + return self.url diff --git a/onioningestor/operators/__init__.py b/onioningestor/operators/__init__.py index c2bfd1e..5514f40 100644 --- a/onioningestor/operators/__init__.py +++ b/onioningestor/operators/__init__.py @@ -1,10 +1,13 @@ import re import sys import json +from queue import Queue from itertools import islice from datetime import datetime as dt from concurrent.futures import ThreadPoolExecutor +from collections import namedtuple + class Operator: """Base class for all Operator plugins. @@ -20,7 +23,7 @@ class Operator: override other existing methods from this class. """ - def __init__(self, logger, elasticsearch, allowed_sources=None): + def __init__(self, logger, allowed_sources=None): """Override this constructor in child classes. The arguments above (artifact_types, filter_string, allowed_sources) @@ -44,10 +47,21 @@ class Operator: classes. Remember to do so *before* setting any default artifact_types. """ self.logger = logger - self.blacklist = re.compile('|'.join([re.escape(word) for word in allowed_sources]), re.IGNORECASE) - self.es = elasticsearch + self.processQueue = Queue() + self.onions = {} + self.onion = namedtuple('onion',['url','source','type','index','monitor','denylist']) + deny = allowed_sources or [] + self.blacklist = re.compile('|'.join([re.escape(word) for word in deny]), re.IGNORECASE) + + def handle_onion(self, url): + """Override with the same signature. - def response(self, content, onion, operator_name): + :param artifact: A single ``Artifact`` object. 
+ :returns: None (always ignored) + """ + raise NotImplementedError() + + def response(self, status, content): """ status: success/failure content: dict @@ -55,50 +69,24 @@ class Operator: return: dict """ try: - return {operator_name: content, 'hiddenService': onion} - #except TypeError: - # return {operator_name: None, 'hiddenService': onion} + return {'status':status, 'content': content} except Exception as e: self.logger.error(e) - def handle_onion(self, db, url): - """Override with the same signature. - - :param artifact: A single ``Artifact`` object. - :returns: None (always ignored) - """ - raise NotImplementedError() - - - def _onion_is_allowed(self, response, db, type='URL'): + def _onion_is_allowed(self, content, onion): """Returns True if this is allowed by this plugin's filters.""" - # Must be in allowed_sources, if set. - if type == 'URL': - blacklist = self.blacklist.findall(response['hiddenService']) - elif type == 'HTML': - blacklist = self.blacklist.findall(response['simple-html']['HTML']) + blacklist = self.blacklist.findall(content) if blacklist: - response['simple-html'].pop('status') - response['simple-html']['status'] = 'blocked' - response['blacklist'] = list(set(blacklist)) - self.es.update(db['_id'], response) - return False - return True + onion.denylist = blacklist def collect(self, onions): for onion in onions: self.logger.info(f'thread function processing {onion}') - # Add link to database - db = self.es.save({ - 'hiddenService':onion.url, - 'monitor':'false', - 'dateAdded':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'}) - if self._onion_is_allowed( - self.response({'status':'blocked'},onion.url,'regex-blacklist'), - db, - type='URL'): - # Get data for current link - self.handle_onion(db, onion.url) + if onion.monitor: + self.handle_onion(onion) + else: + if self._onion_is_allowed(onion.url): + self.handle_onion(onion) def iter_batches(self, data, batch_size): data = iter(data) @@ -110,8 +98,11 @@ class Operator: def process(self, 
onions): """Process all applicable onions.""" - #print(onions) - with ThreadPoolExecutor(max_workers=10) as executor: - collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)] - for tasks in collect_tasks: - self.logger.info(tasks.result()) + for onion in onions: + self.handle_onion(onion[1]) + #self.save_pastie() + + #with ThreadPoolExecutor(max_workers=1) as executor: + # collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)] + # for tasks in collect_tasks: + # self.logger.info(tasks.result()) diff --git a/onioningestor/operators/html.py b/onioningestor/operators/html.py index d55ec64..174dd60 100644 --- a/onioningestor/operators/html.py +++ b/onioningestor/operators/html.py @@ -4,6 +4,7 @@ import json import traceback from datetime import datetime as dt from json.decoder import JSONDecodeError +from collections import Counter import requests @@ -22,10 +23,10 @@ class Plugin(Operator): This plugin collects HTML code from onion link """ - def __init__(self, logger, elasticsearch, allowed_sources, **kwargs): - super(Plugin, self).__init__(logger, elasticsearch, allowed_sources) - self.plugin_name = "simple-html" - self.logger.info(f"Initializing {self.plugin_name}") + def __init__(self, logger, **kwargs): + super(Plugin, self).__init__(logger) + self.name = kwargs['name'] + self.logger.info(f"Initializing {self.name}") self.timeout = int(kwargs["timeout"]) self.retries = int(kwargs["retries"]) @@ -83,42 +84,24 @@ class Plugin(Operator): result = content.text if result: html = BeautifulSoup(result, features="lxml") - # testing hardcorded filepath - with open( - "/home/tony/Projects/OnionScraper_v2/onion_master_list.txt", - "w", - ) as fp: - for onion in re.findall("([a-z2-7]{16,56}\.onion)", result): - fp.write("%s\n" % onion) if html: index = { "HTML": result, "title": html.title.text, "language": detect(html.text), - "date-crawled": 
dt.utcnow().strftime( - "%Y-%m-%dT%H:%M:%S.%f" - ) - + "Z", "status": "success", - "interestingKeywords": list( - set(self.interesting.findall(result)) - ), + "interestingKeywords": Counter(self.interesting.findall(result)), } else: index = { "HTML": result, "title": None, "language": None, - "date-crawled": dt.utcnow().strftime( - "%Y-%m-%dT%H:%M:%S.%f" - ) - + "Z", "status": "success", - "interestingKeywords": list( - set(self.interesting.findall(result)) - ), + "interestingKeywords": Counter(self.interesting.findall(result)), } - return self.response(index, onion, self.plugin_name) + return self.response("success", index) + except requests.exceptions.ConnectionError as connection_error: self.logger.error(f"Failed connecting to http://{url}") self.logger.debug(connection_error) @@ -131,10 +114,10 @@ class Plugin(Operator): self.renew_connection() if retry > self.retries: self.logger.error("[x] Max retries exceeded") - return self.response({"status": "failure"}, onion, self.plugin_name) + return self.response("failed", None) - def handle_onion(self, db, onion): - content = self.run_sessions(onion) - if content[self.plugin_name]["status"] == "success": - if self._onion_is_allowed(content, db, "HTML"): - self.es.update(db["_id"], content) + def handle_onion(self, onion): + html = self.run_sessions(onion.url) + if html['status'] == 'success': + self._onion_is_allowed(html['content']['HTML'], onion) + onion.simpleHTML(html) diff --git a/onioningestor/operators/onionscan.py b/onioningestor/operators/onionscan.py index 3367d0c..7e65029 100644 --- a/onioningestor/operators/onionscan.py +++ b/onioningestor/operators/onionscan.py @@ -1,28 +1,14 @@ -import re -import os -import sys import json import time -import random import traceback import subprocess -from uuid import uuid4 -from pathlib import Path -from datetime import datetime as dt +from threading import Timer from json.decoder import JSONDecodeError from concurrent.futures import ProcessPoolExecutor -from threading 
import Timer import requests -from stem.control import Controller -from stem import Signal - -from selenium import webdriver -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.firefox.firefox_binary import FirefoxBinary - -from onionscraper.operators import Operator +from onioningestor.operators import Operator class Plugin(Operator): @@ -32,41 +18,13 @@ class Plugin(Operator): sending artifacts to operators. """ def __init__(self, logger, **kwargs): + self.name = kwargs['name'] self.logger = logger - self.logger.info('Initializing OnionScanner') - screenshots = kwargs.pop('screenshots_path', None) - if screenshots: - self.screenshots = Path(screenshots) - else: - self.screenshots = Path(__file__).parents[1]/'screenshots' + self.logger.info(f'Initializing {self.name}') self.onionscan = kwargs['binpath'] - self.timeout = int(kwargs['timeout']) - self.proxy = kwargs['socks5'] - self.torControl = kwargs['TorController'] - self.retries = int(kwargs['retries']) - self.headers ={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language':'en-US,en;q=0.5', - 'DNT': '1', 'Connection': - 'keep-alive', - 'Upgrade-Insecure-Requests': '1' - } - - blacklist = kwargs['blacklist'].split(',') - self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE) - keywords = kwargs['interestingKeywords'].split(',') - self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE) - self.session = self.get_tor_session() - - def response(self, status, content, onion): - """ - status: success/failure - content: dict - onion: str - return: dict - """ - return {'status': status, 'data': content, 'onion': onion} + self.timeout = int(kwargs.get('timeout', 300)) + self.torControl = 9051 + self.torControl = "Zue5a29v4xE6FciWpPF93rR2M2T" def parseDoc(self, data): 
data['onionscan'].pop('simpleReport', None) @@ -75,74 +33,17 @@ class Plugin(Operator): data['onionscan']['crawls'] = [*crawls] data['hiddenService'] = hiddenService for onion in crawls.keys(): - print(onion) - with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp: - fp.write("%s\n" % onion) - #q.enqueue(self.crawl, onion) - #with open('test.json', 'w', encoding='utf-8') as f: - # json.dump(data, f, ensure_ascii=False, indent=4) + self.queueCrawl(( + 3, + self.onion( + url=onion, + source='crawled', + type='domain', + status='offline', + monitor=False, + denylist=False))) return data - def format_directory(self, directory): - d = dt.now() - year = str(d.year) - month = str(d.month) - # prefix month and day with "0" if it is only one digit - if len(month) < 2: - month = "0" + month - day = str(d.day) - if len(day) < 2: - day = "0" + day - save_path = directory/year/month/day - if not os.path.isdir(save_path): - self.logger.info("[*] Creating directory to save screenshots") - os.makedirs(save_path) - - return save_path - - def take_screenshot(self, save_path, onion): - binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver') - fp = webdriver.FirefoxProfile() - fp.set_preference('network.proxy.type', 1) - fp.set_preference('network.proxy.socks', '127.0.0.1') - fp.set_preference('network.proxy.socks_port', 9050) - fp.set_preference('network.proxy.socks_remote_dns', True) - - options = Options() - options.headless = True - driver = webdriver.Firefox( - executable_path='/home/tony/Projects/OnionScraper/geckodriver', - options=options, - firefox_profile=fp - ) - url = 'http://' + onion - driver.get(url) - uid = str(uuid4()).split('-')[0] - filename = f"{onion}_screenshot_{uid}.png" - f_name = f"{save_path}/{filename}" - driver.save_screenshot(f_name) - - driver.quit() - - if os.path.isfile(f_name): - self.logger.info(f'[*] Screenshot was taken. 
{f_name}') - dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z' - result = {'dateScreenshoted':dateScreenshoted,'filename':filename} - return self.response("success",result,onion) - else: - self.logger.error('[x] Unable to take screenshot') - return self.response("failure",None,onion) - - def get_tor_session(self): - try: - s = requests.session() - s.proxies = self.proxy - s.headers.update(self.headers) - except Exception as e: - self.logger.error(e) - self.logger.debug(traceback.print_exc()) - return s - # signal TOR for a new connection def renew_connection(self): with Controller.from_port(port = self.torControl['port']) as controller: @@ -154,8 +55,7 @@ class Plugin(Operator): controller.signal(Signal.NEWNYM) # wait for the new identity to be initialized time.sleep(controller.get_newnym_wait()) - session = self.get_tor_session() - self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}") + self.logger.info(f"IP is {requests.get('http://httpbin.org/ip').json()['origin']}") def handle_timeout(self, process, onion): # @@ -171,33 +71,6 @@ class Plugin(Operator): self.renew_connection() return - def run_sessions(self, onion): - retry = 0 - result = None - while True: - try: - url = 'http://'+onion - self.logger.info(url) - content = self.session.get(url) - if content.status_code == 200: - result = content.json() - except JSONDecodeError as e: - self.logger.debug(f'JSONDecodeError {e}') - result = content.text - except Exception as e: - self.logger.error(e) - self.logger.debug(traceback.print_exc()) - finally: - if result: - return self.response("success",result,onion) - else: - self.logger.info('[x] No results found retrying ...') - retry += 1 - self.renew_connection() - if retry > self.retries: - self.logger.error('[x] Max retries exceeded') - return self.response("failure",None, onion) - def run_onionscan(self, onion): self.logger.info("[*] Running onionscan on %s", onion) @@ -205,7 +78,7 @@ class Plugin(Operator): process 
= subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE) # start the timer and let it run till timeout minutes - process_timer = Timer(300,self.handle_timeout,args=[process,onion]) + process_timer = Timer(self.timeout,self.handle_timeout,args=[process,onion]) process_timer.start() # wait for the onion scan results @@ -215,47 +88,23 @@ class Plugin(Operator): if process_timer.is_alive(): process_timer.cancel() try: - return self.response("success",json.loads(stdout),onion) + return self.response( + "success", + self.parseDoc(json.loads(stdout))) except json.decoder.JSONDecodeError: - pass + return self.response( + "success", + self.parseDoc(stdout)) self.logger.info("[!!!] Process timed out for %s", onion) + return self.response("failed",stdout) - return self.response("failure",None, onion) - - def handle_onion(self, onion_tuple): - onion = onion_tuple.url - self.logger.info(f'Processing {onion} with onionscan') + def handle_onion(self, onion): try: - blacklist_URL = self.blacklist.search(onion) - if blacklist_URL: - self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}") - else: - self.logger.debug("[*] URL blacklist test: PASSED") - results = self.run_onionscan(onion) - if results['status'] == 'success':# and results['data']['webDetected'] == 'true': - content = self.run_sessions(onion) - if content['status'] == 'success': - blacklist_CONTENT = self.blacklist.search(content['data']) - if blacklist_CONTENT: - self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}") - else: - self.logger.debug("[*] CONTENT blacklist test: PASSED") - screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion) - self.logger.info("Indexing!") - doc = { - 'onionscan':json.loads(results['data']), - 'html':content['data'], - 'screenshots':screenshot['data'], - 
'interestingKeywords':self.interestingKeywords.findall(content['data']) - } - return self.parseDoc(doc) - - else: - self.logger.info(f"[x] hidden service {onion} is not active") + results = self.run_onionscan(onion.url) + onion.onionscan(results) except Exception as e: self.logger.error(e) self.logger.error(traceback.print_exc()) finally: pass - #sys.exit(0) diff --git a/onioningestor/operators/screenshot.py b/onioningestor/operators/screenshot.py new file mode 100644 index 0000000..ca80663 --- /dev/null +++ b/onioningestor/operators/screenshot.py @@ -0,0 +1,261 @@ +import re +import os +import sys +import json +import time +import random +import traceback +import subprocess +from uuid import uuid4 +from pathlib import Path +from datetime import datetime as dt +from json.decoder import JSONDecodeError +from concurrent.futures import ProcessPoolExecutor +from threading import Timer + +import requests + +from stem.control import Controller +from stem import Signal + +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary + +from onioningestor.operators import Operator + + +class Plugin(Operator): + """OnionScraper main work logic. + + Handles reading the config file, calling sources, maintaining state and + sending artifacts to operators. 
+ """ + def __init__(self, logger, **kwargs): + self.logger = logger + self.logger.info('Initializing OnionScanner') + screenshots = kwargs.pop('screenshots_path', None) + if screenshots: + self.screenshots = Path(screenshots) + else: + self.screenshots = Path(__file__).parents[1]/'screenshots' + self.onionscan = kwargs['binpath'] + self.timeout = int(kwargs['timeout']) + self.proxy = kwargs['socks5'] + self.torControl = kwargs['TorController'] + self.retries = int(kwargs['retries']) + self.headers ={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language':'en-US,en;q=0.5', + 'DNT': '1', 'Connection': + 'keep-alive', + 'Upgrade-Insecure-Requests': '1' + } + + blacklist = kwargs['blacklist'].split(',') + self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE) + keywords = kwargs['interestingKeywords'].split(',') + self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE) + self.session = self.get_tor_session() + + def response(self, status, content, onion): + """ + status: success/failure + content: dict + onion: str + return: dict + """ + return {'status': status, 'data': content, 'onion': onion} + + def parseDoc(self, data): + data['onionscan'].pop('simpleReport', None) + crawls = data['onionscan'].pop('crawls', None) + hiddenService = data['onionscan'].pop('hiddenService', None) + data['onionscan']['crawls'] = [*crawls] + data['hiddenService'] = hiddenService + for onion in crawls.keys(): + print(onion) + with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp: + fp.write("%s\n" % onion) + #q.enqueue(self.crawl, onion) + #with open('test.json', 'w', encoding='utf-8') as f: + # json.dump(data, f, ensure_ascii=False, indent=4) + return data + + def format_directory(self, directory): + d = dt.now() + year = str(d.year) + 
 month = str(d.month) + # prefix month and day with "0" if it is only one digit + if len(month) < 2: + month = "0" + month + day = str(d.day) + if len(day) < 2: + day = "0" + day + save_path = directory/year/month/day + if not os.path.isdir(save_path): + self.logger.info("[*] Creating directory to save screenshots") + os.makedirs(save_path) + + return save_path + + def take_screenshot(self, save_path, onion): + binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver') + fp = webdriver.FirefoxProfile() + fp.set_preference('network.proxy.type', 1) + fp.set_preference('network.proxy.socks', '127.0.0.1') + fp.set_preference('network.proxy.socks_port', 9050) + fp.set_preference('network.proxy.socks_remote_dns', True) + + options = Options() + options.headless = True + driver = webdriver.Firefox( + executable_path='/home/tony/Projects/OnionScraper/geckodriver', + options=options, + firefox_profile=fp + ) + url = 'http://' + onion + driver.get(url) + uid = str(uuid4()).split('-')[0] + filename = f"{onion}_screenshot_{uid}.png" + f_name = f"{save_path}/{filename}" + driver.save_screenshot(f_name) + + driver.quit() + + if os.path.isfile(f_name): + self.logger.info(f'[*] Screenshot was taken. 
{f_name}') + dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z' + result = {'dateScreenshoted':dateScreenshoted,'filename':filename} + return self.response("success",result,onion) + else: + self.logger.error('[x] Unable to take screenshot') + return self.response("failure",None,onion) + + def get_tor_session(self): + try: + s = requests.session() + s.proxies = self.proxy + s.headers.update(self.headers) + except Exception as e: + self.logger.error(e) + self.logger.debug(traceback.print_exc()) + return s + + # signal TOR for a new connection + def renew_connection(self): + with Controller.from_port(port = self.torControl['port']) as controller: + # Now we switch TOR identities to make sure we have a good connection + self.logger.info('Getting new Tor IP') + # authenticate to our local TOR controller + controller.authenticate(self.torControl['password']) + # send the signal for a new identity + controller.signal(Signal.NEWNYM) + # wait for the new identity to be initialized + time.sleep(controller.get_newnym_wait()) + session = self.get_tor_session() + self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}") + + def handle_timeout(self, process, onion): + # + # Handle a timeout from the onionscan process. + # + + try: + # kill the onionscan process + process.kill() + self.logger.info("[!!!] 
Killed the onionscan process.") + except: + pass + self.renew_connection() + return + + def run_sessions(self, onion): + retry = 0 + result = None + while True: + try: + url = 'http://'+onion + self.logger.info(url) + content = self.session.get(url) + if content.status_code == 200: + result = content.json() + except JSONDecodeError as e: + self.logger.debug(f'JSONDecodeError {e}') + result = content.text + except Exception as e: + self.logger.error(e) + self.logger.debug(traceback.print_exc()) + finally: + if result: + return self.response("success",result,onion) + else: + self.logger.info('[x] No results found retrying ...') + retry += 1 + self.renew_connection() + if retry > self.retries: + self.logger.error('[x] Max retries exceeded') + return self.response("failure",None, onion) + + def run_onionscan(self, onion): + self.logger.info("[*] Running onionscan on %s", onion) + + # fire up onionscan + process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE) + + # start the timer and let it run till timeout minutes + process_timer = Timer(300,self.handle_timeout,args=[process,onion]) + process_timer.start() + + # wait for the onion scan results + stdout = process.communicate()[0] + + # we have received valid results so we can kill the timer + if process_timer.is_alive(): + process_timer.cancel() + try: + return self.response("success",json.loads(stdout),onion) + except json.decoder.JSONDecodeError: + pass + + self.logger.info("[!!!] 
Process timed out for %s", onion) + + return self.response("failure",None, onion) + + def handle_onion(self, onion_tuple): + onion = onion_tuple.url + self.logger.info(f'Processing {onion} with onionscan') + try: + blacklist_URL = self.blacklist.search(onion) + if blacklist_URL: + self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}") + else: + self.logger.debug("[*] URL blacklist test: PASSED") + results = self.run_onionscan(onion) + if results['status'] == 'success':# and results['data']['webDetected'] == 'true': + content = self.run_sessions(onion) + if content['status'] == 'success': + blacklist_CONTENT = self.blacklist.search(content['data']) + if blacklist_CONTENT: + self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}") + else: + self.logger.debug("[*] CONTENT blacklist test: PASSED") + screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion) + self.logger.info("Indexing!") + doc = { + 'onionscan':json.loads(results['data']), + 'html':content['data'], + 'screenshots':screenshot['data'], + 'interestingKeywords':self.interestingKeywords.findall(content['data']) + } + return self.parseDoc(doc) + + else: + self.logger.info(f"[x] hidden service {onion} is not active") + except Exception as e: + self.logger.error(e) + self.logger.error(traceback.print_exc()) + finally: + pass + #sys.exit(0) diff --git a/onioningestor/operators/yara.py b/onioningestor/operators/yara.py index e23dc26..1cfa031 100644 --- a/onioningestor/operators/yara.py +++ b/onioningestor/operators/yara.py @@ -1,5 +1,5 @@ -from onionscraper.operators import Operator +from onioningestor.operators import Operator class Plugin(Operator): diff --git a/onioningestor/sources/__init__.py b/onioningestor/sources/__init__.py index 14bb4e4..9114493 100644 --- a/onioningestor/sources/__init__.py +++ b/onioningestor/sources/__init__.py @@ -1,4 +1,5 @@ from collections import namedtuple +from 
onioningestor.onion import Onion class Source(object): """Base class for all Source plugins. @@ -7,14 +8,16 @@ class Source(object): additional methods to child classes, consider prefixing the method name with an underscore to denote a ``_private_method``. """ - def __init__(self, name, *args, **kwargs): + def __init__(self, *args, **kwargs): """Override this constructor in child classes. The first argument must always be ``name``. Other argumentss should be url, auth, etc, whatever is needed to set up the object. """ - self.onion = namedtuple('onion', ['url','source','type']) + self.onion = Onion + def set_onionQueue(self, queue): + self.onionQueue = queue def run(self): """Run and return ``(saved_state, list(Artifact))``. @@ -28,14 +31,3 @@ class Source(object): """ raise NotImplementedError() - - def process_element(self, content, reference_link, include_nonobfuscated=False): - """Take a single source content/url and return a list of Artifacts. - This is the main work block of Source plugins, which handles - IOC extraction and artifact creation. - :param content: String content to extract from. - :param reference_link: Reference link to attach to all artifacts. - :param include_nonobfuscated: Include non-defanged URLs in output? 
- """ - logger.debug(f"Processing in source '{self.name}'") - diff --git a/onioningestor/sources/gist.py b/onioningestor/sources/gist.py index 15e2049..37683e7 100644 --- a/onioningestor/sources/gist.py +++ b/onioningestor/sources/gist.py @@ -17,7 +17,7 @@ import time from bs4 import BeautifulSoup -from onionscraper.sources import Source +from onioningestor.sources import Source class Plugin(Source): diff --git a/onioningestor/sources/gmail.py b/onioningestor/sources/gmail.py index a0ddf3f..c634379 100644 --- a/onioningestor/sources/gmail.py +++ b/onioningestor/sources/gmail.py @@ -17,7 +17,7 @@ import time from bs4 import BeautifulSoup -from onionscraper.sources import Source +from onioningestor.sources import Source class Plugin(Source): diff --git a/onioningestor/sources/simplefile.py b/onioningestor/sources/simplefile.py index 64c084f..6c420b4 100644 --- a/onioningestor/sources/simplefile.py +++ b/onioningestor/sources/simplefile.py @@ -16,7 +16,7 @@ from onioningestor.sources import Source class Plugin(Source): - def __init__(self, logger, name, filename): + def __init__(self, logger, name, filename, **kwargs): self.logger = logger self.name = name self.filename = filename @@ -28,10 +28,17 @@ class Plugin(Source): filepath = Path(__file__).parents[2]/self.filename with open(filepath, 'r') as fp: lines = fp.read().splitlines() - # just testing - os.remove(self.filename) for onion in lines: - items.append(self.onion(url=onion,source='simple-file',type='domain')) - #yield self.onion(url=onion,source='simple-file',type='domain') - return items + self.onionQueue.put( + ( + 2, + self.onion( + url=onion, + source=self.name, + type='domain', + status='offline', + monitor=False, + denylist=False) + ) + ) diff --git a/requirements.txt b/requirements/prod.txt similarity index 82% rename from requirements.txt rename to requirements/prod.txt index 1a07b5e..d7eaa75 100644 --- a/requirements.txt +++ b/requirements/prod.txt @@ -4,11 +4,14 @@ chardet==3.0.4 click==7.1.2 
elasticsearch==7.8.0 idna==2.10 +langdetect==1.0.8 lxml==4.5.1 PySocks==1.7.1 PyYAML==5.3.1 requests==2.24.0 +schedule==0.6.0 selenium==3.141.0 +six==1.15.0 soupsieve==2.0.1 stem==1.8.0 urllib3==1.25.9