Done major code changes and reviews

pull/6/head
danieleperera 4 years ago
parent b76af6d57e
commit e2b7c8346d

.gitignore

@@ -1,3 +1,4 @@
monitoring.*
onion_master_list.*
webui
templates

@@ -1,86 +0,0 @@
# This is an example ThreatIngestor config file with some preconfigured RSS
# sources, feeding extracted artifacts into a CSV file.
general:
# Run forever, check feeds once an hour.
daemon: True
sleep: 10
onion_validation: ([a-z2-7]{16,56}\.onion)
blacklist: pedo,xxx,infant,loli,porn,child,abuse,sex,drug,cocaine,dope,zoo,daddy,daughter,boy,girl,young,murder,cocks,year,old
interestingKeywords: t.me,rss,xml,atom,dataleak,breach,blog,ransomware,source code,data breach,bank,logs,hack
elasticsearch:
index: darkweb
port : 9200
host : 127.0.0.1
sources:
# A few threat intel blogs to get you started!
- name: simple-text-file
module: simplefile
filename: onion_master_list.txt
# - name: source-gist
# module: gist
# url: https://gist.github.com/search?l=Text&q=.onion
# - name: source-reddit
# module: reddit
# url: https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000
# feed_type: messy
#
# - name: pastebin
# module: pastebin-account
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: hunchly-report
# module: gmail-hunchly
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: onionland-search
# module: collect-onions
# url: http://3bbaaaccczcbdddz.onion/discover
# feed_type: messy
#
# - name: torch
# module: collect-onions
# url: http://xmh57jrzrnw6insl.onion
# feed_type: messy
operators:
- name: simple-html
module: html
socks5:
http: 'socks5h://127.0.0.1:9050'
https: 'socks5h://127.0.0.1:9050'
TorController:
port: 9051
password: your-torcontroller-password-here
- name: simple-screenshot
module: screenshot
screenshots_path: null
- name: onionscan-go
module: onionscan
binpath: /home/tony/go/bin/onionscan
# - name: yara-rule
# module: yara
# filename: categories.yar
# base_score: 50
#
# - name: regex-match
# module: regex
# keywords: test,test2
# base_score: 20
notifiers:
# Simple telegram notifier
- name: telegram-notifier
module: telegram
chat_id:
token:

@@ -0,0 +1,106 @@
# This is an example OnionIngestor config file with some preconfigured settings.
# The elasticsearch and telegram storage engines are enabled below.
general:
# Run forever, sleeping for the configured number of seconds between iterations.
daemon: True
sleep: 10
onion_validation: ([a-z2-7]{16,56}\.onion)
blacklist: pedo,porn,child
interestingKeywords: your,keywords,here
save-thread: no # Use a separate thread to save onions
monitor:
filename: monitoring.txt
sources:
# A few threat intel blogs to get you started!
- name: simple-text-file
module: simplefile
filename: onion_master_list.txt
# - name: source-gist
# module: gist
# url: https://gist.github.com/search?l=Text&q=.onion
# - name: source-reddit
# module: reddit
# url: https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000
# feed_type: messy
#
# - name: pastebin
# module: pastebin-account
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: hunchly-report
# module: gmail-hunchly
# url: https://gist.github.com/search?l=Text&q=.onion
# feed_type: messy
#
# - name: onionland-search
# module: collect-onions
# url: http://3bbaaaccczcbdddz.onion/discover
# feed_type: messy
#
# - name: torch
# module: collect-onions
# url: http://xmh57jrzrnw6insl.onion
# feed_type: messy
operators:
- name: simple-html
module: html
timeout: 300
retries: 2
interestingKeywords: your,keywords,here
socks5:
http: 'socks5h://127.0.0.1:9050'
https: 'socks5h://127.0.0.1:9050'
TorController:
port: 9051
password: your-tor-controller-password
- name: onionscan-go
module: onionscan
binpath: your-onionscan-binary-path
- name: simple-screenshot
module: screenshot
screenshots_path: null
# - name: yara-rule
# module: yara
# filename: categories.yar
# base_score: 50
#
# - name: regex-match
# module: regex
# keywords: test,test2
# base_score: 20
database_Engines:
- name: telegram-notifier # Simple Telegram notifier
module: telegram
chat_id: your-telegram-chat-id
token: your-telegram-token
- name: elasticsearch
module: elasticsearch
index: your-index-name
port : 9200
host : 127.0.0.1
# - name: email
# module: send_email
# alert: no # Enable/disable email alerts
# from: alert@example.com
# to: alert@example.com
# server: 127.0.0.1 # Address of the server (hostname or IP)
# port: 25 # Outgoing SMTP port: 25, 587, ...
# tls: no # Enable/disable tls support
# username: '' # (optional) Username for authentication. Leave blank for no authentication.
# password: '' # (optional) Password for authentication. Leave blank for no authentication.
# subject: '[onioningestor] - {subject}'
# size-limit: 1048576 # Size limit for a pastie; above it, the pastie is sent as an attachment
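
For reference, loading this config is just a PyYAML call; a minimal sketch, assuming the file above is saved as config.yml next to the script (the filename is illustrative):

import re
import yaml

# Load the example config and compile the onion-matching regex.
with open('config.yml') as f:
    config = yaml.safe_load(f)

onion_re = re.compile(config['general']['onion_validation'])
blacklist = config['general']['blacklist'].split(',')

print(onion_re.findall('found 3g2upl4pq6kufc4m.onion in a paste'))
# -> ['3g2upl4pq6kufc4m.onion']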

@@ -1,12 +1,15 @@
import sys
import time
import queue
import traceback
import threading
import collections
from queue import Queue
from . import config
from . import dbhandler
from . import loghandler
from onioningestor.databases import StorageDispatcher, StorageThread, StorageSync
class Ingestor:
"""ThreatIngestor main work logic.
@@ -18,37 +21,51 @@ class Ingestor:
# Load logger
log = loghandler.LoggerHandler(args.logLevel)
self.logger = log.start_logging()
# Load config
self.config = config.Config(args.configFile, self.logger)
self.blacklist = self.config.blacklist()
# Load Elasticsearch.
try:
self.es = dbhandler.DbHandlerElasticSearch(
self.config.elasticsearch(),
self.logger)
except Exception as e:
# Error loading elasticsearch.
self.logger.error(e)
self.logger.debug(traceback.print_exc())
sys.exit(1)
# Create Queues
self.queue = self.config.monitorQueue()
# Decide whether to save onions asynchronously or synchronously
self.save_thread = self.config.save_thread()
# Instantiate plugins.
try:
self.logger.info("Initializing sources")
self.sources = {name: source(self.logger, **kwargs)
for name, source, kwargs in self.config.sources()}
# Track some statistics about artifacts in a summary object.
self.summary = collections.Counter()
self.logger.info("initializing operators")
self.operators = {name: operator(self.logger, self.es, self.blacklist, **kwargs)
# Threads
self.threads = []
try:
# Load Storage Engines - ElasticSearch, Telegram, Twitter etc
self.storage = StorageDispatcher(self.logger)
for name, db, kwargs in self.config.database_engines():
# start the threads handling database storage if needed
if self.save_thread:
self.logger.debug(f"Starting daemon thread for {str(db)}")
t = StorageThread(self.logger, db(self.logger, **kwargs))
self.threads.append(t)
# register the thread so the dispatcher can hand onions to it
self.storage.add_storage(t)
t.daemon = True
t.start()
# save onions synchronously
else:
s = StorageSync(db(self.logger, **kwargs))
self.storage.add_storage(s)
if self.save_thread:
self.logger.info("Onions will be saved asynchronously")
else:
self.logger.info("Onions will be saved synchronously")
# Instantiate operator plugins.
self.logger.debug("initializing operators")
self.operators = {name: operator(self.logger, **kwargs)
for name, operator, kwargs in self.config.operators()}
self.logger.info("initializing notifiers")
#self.notifiers = {name: operator(**kwargs)
# for name, operator, kwargs in self.config.notifiers()}
except Exception as e:
# Error loading elasticsearch.
# Error loading or starting plugins.
self.logger.error(e)
self.logger.debug(traceback.print_exc())
sys.exit(1)
@@ -56,51 +73,54 @@ class Ingestor:
def run(self):
"""Run once, or forever, depending on config."""
if self.config.daemon():
self.logger.info("Running forever, in a loop")
self.run_forever()
else:
self.logger.info("Running once, to completion")
self.run_once()
self.run_once()
#if self.config.daemon():
# self.logger.info("Running forever, in a loop")
# self.run_forever()
#else:
# self.logger.info("Running once, to completion")
# self.run_once()
def run_once(self):
"""Run each source once, passing artifacts to each operator."""
# Track some statistics about artifacts in a summary object.
summary = collections.Counter()
for source in self.sources:
# Run the source to collect artifacts.
self.logger.info(f"Running source '{source}'")
try:
# get the generator of onions
onions = self.sources[source].run()
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
# Process onions with each operator.
for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'")
# Start collecting sources
self.collect_sources()
# Sources fill several queues, consumed in priority order:
# - monitor queue: onions being monitored (high priority)
# - onion queue: onions found on the clearnet (medium priority)
# - crawl queue: onions found while crawling onion links (low priority)
onions = list(self.queue.queue)
done = False
if onions:
while not done:
try:
self.operators[operator].process(onions)
# Save the source onion with collected data
## Process onions with each operator.
for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'")
# Process list of onions
self.operators[operator].process(onions)
done = True
## Save Onions for each storage
for onion in onions:
self.storage.save_pastie(onion[1], 30)
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
# # Record stats and update the summary.
# types = artifact_types(doc.get('interestingKeywords'))
# summary.update(types)
# for artifact_type in types:
# self.logger.info(f'types[artifact_type]')
# Log the summary.
self.logger.info(f"New artifacts: {dict(summary)}")
break
except KeyboardInterrupt:
print('')
self.logger.info("Ctrl-c received! Sending kill to threads...")
for t in self.threads:
t.kill_received = True
self.logger.info('Exiting')
sys.exit(0)
else:
for t in self.threads:
t.kill_received = True
self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
time.sleep(self.config.sleep())
def run_forever(self):
@@ -108,20 +128,22 @@ class Ingestor:
while True:
self.run_once()
self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
time.sleep(self.config.sleep())
def artifact_types(artifact_list):
"""Return a dictionary with counts of each artifact type."""
types = {}
for artifact in artifact_list:
artifact_type = artifact.__class__.__name__.lower()
if artifact_type in types:
types[artifact_type] += 1
else:
types[artifact_type] = 1
return types
def collect_sources(self):
self.logger.debug("Initializing sources")
for name, collect, kwargs in self.config.sources():
# Run the source to collect onion links from clear net.
self.logger.info(f"Running source '{name}'")
try:
# get the generator of onions
source = collect(self.logger, **kwargs)
source.set_onionQueue(self.queue) #priority 2
t = source.run()
self.threads.append(t)
#self.logger.info(f'Starting of thread: {t.currentThread().name}')
#t.start()
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
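
The save-thread switch in the constructor above reduces to a standard worker-queue pattern; a sketch under stated assumptions (DummyStorage is a stand-in for a real engine and is not part of the commit):

import threading
from queue import Queue, Empty

class DummyStorage:
    """Stand-in for a real storage engine (illustrative only)."""
    name = 'dummy'
    def save_pastie(self, pastie):
        print(f'{self.name}: saved {pastie}')

# save-thread: yes -> each engine gets a daemon thread fed by a queue;
# save-thread: no  -> onions are saved inline, in the ingestor's thread.
storage, q = DummyStorage(), Queue()

def worker():
    while True:
        try:
            pastie = q.get(True, 5)
        except Empty:
            continue
        storage.save_pastie(pastie)
        q.task_done()

threading.Thread(target=worker, daemon=True).start()
q.put('example.onion')
q.join()  # block until the worker has drained the queue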

@@ -38,7 +38,7 @@ from onioningestor import Ingestor
# Load arguments from user
parser = argparse.ArgumentParser(
prog='onionscraper',
prog='onioningestor',
description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-c', '--config',dest="configFile", required = True, help='Path to config file')
parser.add_argument("--log", dest="logLevel",default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the logging level, default is INFO")

@@ -1,22 +1,18 @@
import io
import importlib
import traceback
from queue import PriorityQueue
from collections import namedtuple
import yaml
from pathlib import Path
from onioningestor.onion import Onion
SOURCE = "onioningestor.sources"
OPERATOR = "onioningestor.operators"
INTERNAL_OPTIONS = [
"saved_state",
"module",
"credentials",
]
ARTIFACT_TYPES = "artifact_types"
FILTER_STRING = "filter"
ALLOWED_SOURCES = "allowed_sources"
DATABASE_ENGINE = "onioningestor.databases"
NAME = "name"
@@ -43,16 +39,15 @@ class Config:
module = importlib.import_module(".".join([plugin_type, plugin]))
return module.Plugin
except Exception as e:
print(e)
print(traceback.print_exc())
self.logger.error(e)
self.logger.debug(traceback.print_exc())
def daemon(self):
"""Returns boolean, are we daemonizing?"""
return self.config["general"]["daemon"]
def elasticsearch(self):
"""Returns the elasticsearch config"""
return self.config["general"]["elasticsearch"]
def save_thread(self):
return self.config["general"].get("save-thread", False)
def sleep(self):
"""Returns number of seconds to sleep between iterations, if daemonizing."""
@@ -61,36 +56,53 @@ class Config:
def blacklist(self):
return self.config["general"]["blacklist"].split(",")
# def onionscanner(self):
# """Returns onionscanner config dict"""
# screenshots = self.config['onionscanner'].pop('screenshots_path', None)
# if screenshots:
# self.config['onionscanner']['screenshots_path'] = Path(screenshots)
# else:
# self.config['onionscanner']['screenshots_path'] = Path(__file__).parents[1]/'screenshots'
# blacklist = self.config['onionscanner'].pop('blacklist', None)
# if blacklist:
# self.config['onionscanner']['blacklist'] = blacklist.split(',')
# interestingKeywords = self.config['onionscanner'].pop('interestingKeywords', None)
# if interestingKeywords:
# self.config['onionscanner']['interestingKeywords'] = interestingKeywords.split(',')
# return self.config['onionscanner']
def notifiers(self):
"""Returns notifiers config dictionary."""
return self.config.get("notifiers", {})
def monitorQueue(self):
fp = self.config["monitor"].get("filename", False)
q = PriorityQueue(maxsize=0)
if fp:
with open(fp, 'r') as f:
monitorOnions = f.read().splitlines()
for monitor in monitorOnions:
q.put((
1,
Onion(
url=monitor,
source='monitor',
type='domain',
status='offline',
monitor=True,
denylist=False)))
return q
else:
return None
def logging(self):
"""Returns logging config dictionary."""
return self.config.get("logging", {})
def credentials(self, credential_name):
"""Return a dictionary with the specified credentials."""
for credential in self.config["credentials"]:
for key, value in credential.items():
if key == NAME and value == credential_name:
return credential
return {}
def database_engines(self):
"""Return a list of (name, Engine class, {kwargs}) tuples.
:raises: threatingestor.exceptions.PluginError
"""
engines = []
for engine in self.config["database_Engines"]:
kwargs = {}
for key, value in engine.items():
kwargs[key] = value
# load and initialize the plugin
self.logger.debug(f"Found database engine '{engine[NAME]}'")
kwargs.pop('module',None)
engines.append(
(
engine[NAME],
self._load_plugin(DATABASE_ENGINE, engine["module"]),
kwargs
)
)
self.logger.debug(f"Found {len(engines)} total database engines")
return engines
def sources(self):
"""Return a list of (name, Source class, {kwargs}) tuples.
@@ -101,25 +113,19 @@ class Config:
for source in self.config["sources"]:
kwargs = {}
for key, value in source.items():
if key not in INTERNAL_OPTIONS:
kwargs[key] = value
elif key == "credentials":
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
kwargs[key] = value
# load and initialize the plugin
self.logger.info(f"Found source '{source[NAME]}'")
self.logger.debug(f"Found source '{source[NAME]}'")
kwargs.pop('module',None)
sources.append(
(source[NAME], self._load_plugin(SOURCE, source["module"]), kwargs)
(
source[NAME],
self._load_plugin(SOURCE, source["module"]),
kwargs
)
)
self.logger.info(f"Found {len(sources)} total sources")
self.logger.debug(f"Found {len(sources)} total sources")
return sources
def operators(self):
@@ -130,44 +136,10 @@ class Config:
for operator in self.config["operators"]:
kwargs = {}
for key, value in operator.items():
if key not in INTERNAL_OPTIONS:
if key == ARTIFACT_TYPES:
# parse out special artifact_types option
artifact_types = []
for artifact in value:
try:
artifact_types.append(
threatingestor.artifacts.STRING_MAP[
artifact.lower().strip()
]
)
except KeyError:
# ignore invalid artifact types
pass
kwargs[key] = artifact_types
elif key == FILTER_STRING:
# pass in special filter_string option
kwargs["filter_string"] = value
elif key == NAME:
# exclude name key from operator kwargs, since it's not used
pass
else:
kwargs[key] = value
elif key == "credentials":
# Grab these named credentials
credential_name = value
for credential_key, credential_value in self.credentials(
credential_name
).items():
if credential_key != NAME:
kwargs[credential_key] = credential_value
kwargs[key] = value
# load and initialize the plugin
self.logger.info(f"Found operator '{operator[NAME]}'")
self.logger.debug(f"Found operator '{operator[NAME]}'")
kwargs.pop('module',None)
operators.append(
(
operator[NAME],
@@ -176,5 +148,5 @@ class Config:
)
)
self.logger.info(f"Found {len(operators)} total operators")
self.logger.debug(f"Found {len(operators)} total operators")
return operators
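
All three loaders above (database_engines, sources, operators) resolve their module key the same way, through _load_plugin's dynamic import; roughly:

import importlib

def load_plugin(plugin_type, plugin):
    # e.g. load_plugin('onioningestor.databases', 'telegram')
    # imports onioningestor.databases.telegram and returns its Plugin class
    module = importlib.import_module('.'.join([plugin_type, plugin]))
    return module.Plugin

# Engine = load_plugin('onioningestor.databases', 'elasticsearch')
# engine = Engine(logger, index='darkweb', port=9200, host='127.0.0.1')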

@@ -0,0 +1,149 @@
import os
import sys
import time
import logging
import traceback
import threading
from queue import Queue, Empty, Full
from datetime import datetime

import schedule
class StorageScheduler():
def __init__(self, storage, **kwargs):
self.storage = storage
self.name = self.storage.name
def save_pastie(self, pastie, timeout):
raise NotImplementedError
class StorageSync(StorageScheduler):
### synchronously save onions ###
def save_pastie(self, pastie, timeout):
self.storage.save_pastie(pastie)
# LATER: implement an async class
class StorageThread(threading.Thread, StorageScheduler):
def __init__(self, logger, storage, **kwargs):
threading.Thread.__init__(self)
StorageScheduler.__init__(self, storage, **kwargs)
self.logger = logger
try:
size = int(kwargs['queue_size'])
except Exception:
size = 0
self.queue = Queue(size)
self.kill_received = False
def run(self):
self.logger.info('{0}: Thread for saving pasties started'.format(self.name))
# loop over the queue
while not self.kill_received:
pastie = None
try:
# grab a pastie from the queue, waiting up to 5 seconds
pastie = self.queue.get(True, 5)
# save the pastie in this storage backend
self.storage.save_pastie(pastie)
except Empty:
pass
# catch unknown errors
except Exception as e:
self.logger.error("{0}: Thread for saving pasties crashed unexpectedly, recovering...: {1}".format(self.name, e))
self.logger.debug(traceback.format_exc())
finally:
if pastie is not None:
# release the reference and signal the queue that the job is done
del pastie
self.queue.task_done()
self.logger.info('{0}: Thread for saving pasties terminated'.format(self.name))
def save_pastie(self, pastie, timeout):
try:
self.logger.debug('{0}: queueing pastie {1} for saving'.format(self.name, pastie.url))
self.queue.put(pastie, True, timeout)
except Full:
self.logger.error('{0}: unable to save pastie[{1}]: queue is full'.format(self.name, pastie.url))
class StorageDispatcher():
"""The dispatcher routes onions to the right database backends.
Each database thread reads tasks off its queue and handles them."""
def __init__(self, logger):
self.logger = logger
self.__storage = []
self.lock = threading.Lock()
def add_storage(self, thread_storage):
self.__storage.append(thread_storage)
def save_pastie(self, pastie, timeout=5):
self.logger.debug('Saving to database')
for t in self.__storage:
t.save_pastie(pastie, timeout)
class PastieStorage():
def __init__(self, **kwargs):
self.lookup = kwargs.get('lookup', False)
self.name = kwargs.get('name', self.__class__.__name__)
# fall back to a module-level logger when none is supplied
self.logger = kwargs.get('logger', logging.getLogger(__name__))
try:
self.logger.debug('{0}: initializing storage backend'.format(self.name))
self.__init_storage__(**kwargs)
except Exception as e:
self.logger.error('{0}: unable to initialize storage backend: {1}'.format(self.name, e))
raise
def format_directory(self, directory):
d = datetime.now()
year = str(d.year)
month = str(d.month)
# prefix month and day with "0" if it is only one digit
if len(month) < 2:
month = "0" + month
day = str(d.day)
if len(day) < 2:
day = "0" + day
return directory + os.sep + year + os.sep + month + os.sep + day
def __init_storage__(self, **kwargs):
raise NotImplementedError
def __save_pastie__(self, pastie):
raise NotImplementedError
def save_pastie(self, pastie):
try:
start = time.time()
self.logger.debug('{0}: saving pastie[{1}]'.format(self.name, pastie.url))
self.__save_pastie__(pastie)
delta = time.time() - start
self.logger.debug('{0}: pastie[{1}] saved in {2}s'.format(self.name, pastie.url, delta))
except Exception as e:
self.logger.error('{0}: unable to save pastie[{1}]: {2}'.format(self.name, pastie.url, e))
raise
def __seen_pastie__(self, pastie_id, **kwargs):
raise NotImplementedError
def seen_pastie(self, pastie_id, **kwargs):
if not self.lookup:
return False
try:
start = time.time()
self.logger.debug('{0}: looking up pastie[{1}]'.format(self.name, pastie_id))
res = self.__seen_pastie__(pastie_id, **kwargs)
delta = time.time() - start
self.logger.debug('{0}: pastie[{1}] looked-up in {2}s'.format(self.name, pastie_id, delta))
return res
except Exception as e:
self.logger.error('{0}: unable to lookup pastie[{1}]: {2}'.format(self.name, pastie_id, e))
raise
class Notifier(object):
def __init__(self, logger, **kwargs):
self.logger = logger
def send(self):
raise NotImplementedError()
def scheduledEvery(self, time="10:30"):
self.logger.info(f'Scheduled task every day at {time}')
schedule.every().day.at(time).do(self.send)
schedule.run_pending()
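
format_directory above zero-pads months and days so date-based paths sort lexicographically; the same logic as a self-contained sketch:

import os
from datetime import datetime

def format_directory(directory, d=None):
    # zero-pad month and day so paths sort lexicographically
    d = d or datetime.now()
    return os.path.join(directory, str(d.year), f'{d.month:02d}', f'{d.day:02d}')

print(format_directory('screenshots', datetime(2020, 7, 8)))
# -> screenshots/2020/07/08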

@@ -3,11 +3,14 @@ import traceback
from elasticsearch import Elasticsearch, helpers
class DbHandlerElasticSearch:
def __init__(self, config, logger):
from onioningestor.databases import PastieStorage
class Plugin(PastieStorage):
def __init__(self, logger, **kwargs):
self.name = kwargs.get('name')
self.logger = logger
self.logger.info('Creating Elasticsearch mapping')
self.config = config
self.config = kwargs
self.mapping = """
{
"mappings": {
@@ -74,16 +77,7 @@
else:
self.logger.error(status)
def update(self, _id, data):
if _id and data:
self.es.update(
index=self.index,
id=_id,
body={"doc":data})
self.count()
def save(self, data):
if data:
status = self.es.index(index=self.index,body=data)
def __save_pastie__(self, onion):
if onion:
status = self.es.index(index=self.index,body=onion.asdict())
self.count()
return status

@@ -0,0 +1,33 @@
import sys
import requests
from onioningestor.databases import PastieStorage
class Plugin(PastieStorage):
def __init__(self, logger, **kwargs):
# kwargs = {'name': 'telegram-notifier', 'chat_id': 111111, 'token': 'XXXX'}
self.name = kwargs.get('name')
self.logger = logger
self.token = kwargs.get('token')
self.chat_id = kwargs.get('chat_id')
def __save_pastie__(self, pastie):
message = '''
HiddenSite: {site}
Source : {url}
Monitor : {content}
Status : {status}
'''.format(
site=pastie.url,
url=pastie.source,
content=pastie.monitor,
status=pastie.status)
url = 'https://api.telegram.org/bot{0}/sendMessage'.format(self.token)
try:
self.logger.debug('Sending message to telegram {} for pastie {}'.format(url, pastie))
requests.post(url, data={'chat_id': self.chat_id, 'text':message})
except Exception as e:
self.logger.warning("Failed to alert through telegram: {0}".format(e))

@@ -1,8 +0,0 @@
class notifier(object):
def __init__(self):
pass
def scheduledEvery(self, time, summary):
pass

@@ -0,0 +1,42 @@
from datetime import datetime as dt
class Onion(object):
"""Onion class"""
def __init__(self, url, source, type, status, monitor, denylist):
self.url = url
self.source = source
self.type = type
self.status = status
self.monitor = monitor
self.denylist = denylist
self.datetime = dt.utcnow()
def simpleHTML(self, response):
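# note: the assignment below replaces this bound method with the raw response value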
self.simpleHTML = response
# if any blacklist keyword matches, operators update self.denylist
def onionscan(self, response):
self.onionscan = response
def asdict(self):
d = {
'hiddenService':self.url,
'source':self.source,
'type':self.type,
'status':self.status,
'monitor': self.monitor,
'denylist': self.denylist,
'dateFound': self.datetime.strftime("%Y-%m-%dT%H:%M:%S.%f")+"Z",
'simpleHTML': self.simpleHTML,
'onionscan':self.onionscan
}
return d
def __lt__(self, other):
return self.datetime < other.datetime
def __str__(self):
return self.url
def __repr__(self):
return self.url
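
The __lt__ method is what lets Onion objects ride inside (priority, onion) tuples on a PriorityQueue: when two entries share a priority, Python compares the onions themselves, oldest first. A quick sketch:

from queue import PriorityQueue
from datetime import datetime as dt

class Onion:
    def __init__(self, url):
        self.url = url
        self.datetime = dt.now()
    def __lt__(self, other):
        # tie-breaker for PriorityQueue entries with equal priority
        return self.datetime < other.datetime

q = PriorityQueue()
q.put((2, Onion('aaa.onion')))  # found on the clearnet: medium priority
q.put((1, Onion('bbb.onion')))  # monitored onion: high priority
q.put((1, Onion('ccc.onion')))  # equal priority: __lt__ breaks the tie
print(q.get()[1].url)  # -> bbb.onion (the older of the two priority-1 entries)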

@@ -1,10 +1,13 @@
import re
import sys
import json
from queue import Queue
from itertools import islice
from datetime import datetime as dt
from concurrent.futures import ThreadPoolExecutor
from collections import namedtuple
class Operator:
"""Base class for all Operator plugins.
@@ -20,7 +23,7 @@
override other existing methods from this class.
"""
def __init__(self, logger, elasticsearch, allowed_sources=None):
def __init__(self, logger, allowed_sources=None):
"""Override this constructor in child classes.
The arguments above (artifact_types, filter_string, allowed_sources)
@@ -44,10 +47,21 @@
classes. Remember to do so *before* setting any default artifact_types.
"""
self.logger = logger
self.blacklist = re.compile('|'.join([re.escape(word) for word in allowed_sources]), re.IGNORECASE)
self.es = elasticsearch
self.processQueue = Queue()
self.onions = {}
self.onion = namedtuple('onion',['url','source','type','index','monitor','denylist'])
deny = allowed_sources or []
self.blacklist = re.compile('|'.join([re.escape(word) for word in deny]), re.IGNORECASE)
def handle_onion(self, url):
"""Override with the same signature.
:param artifact: A single ``Artifact`` object.
:returns: None (always ignored)
"""
raise NotImplementedError()
def response(self, content, onion, operator_name):
def response(self, status, content):
"""
status: success/failure
content: dict
@@ -55,50 +69,24 @@
return: dict
"""
try:
return {operator_name: content, 'hiddenService': onion}
#except TypeError:
# return {operator_name: None, 'hiddenService': onion}
return {'status':status, 'content': content}
except Exception as e:
self.logger.error(e)
def handle_onion(self, db, url):
"""Override with the same signature.
:param artifact: A single ``Artifact`` object.
:returns: None (always ignored)
"""
raise NotImplementedError()
def _onion_is_allowed(self, response, db, type='URL'):
def _onion_is_allowed(self, content, onion):
"""Returns True if this is allowed by this plugin's filters."""
# Must be in allowed_sources, if set.
if type == 'URL':
blacklist = self.blacklist.findall(response['hiddenService'])
elif type == 'HTML':
blacklist = self.blacklist.findall(response['simple-html']['HTML'])
blacklist = self.blacklist.findall(content)
if blacklist:
response['simple-html'].pop('status')
response['simple-html']['status'] = 'blocked'
response['blacklist'] = list(set(blacklist))
self.es.update(db['_id'], response)
onion.denylist = list(set(blacklist))
return False
return True
def collect(self, onions):
for onion in onions:
self.logger.info(f'thread function processing {onion}')
# Add link to database
db = self.es.save({
'hiddenService':onion.url,
'monitor':'false',
'dateAdded':dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'})
if self._onion_is_allowed(
self.response({'status':'blocked'},onion.url,'regex-blacklist'),
db,
type='URL'):
# Get data for current link
self.handle_onion(db, onion.url)
if onion.monitor:
self.handle_onion(onion)
else:
if self._onion_is_allowed(onion.url, onion):
self.handle_onion(onion)
def iter_batches(self, data, batch_size):
data = iter(data)
@@ -110,8 +98,11 @@
def process(self, onions):
"""Process all applicable onions."""
#print(onions)
with ThreadPoolExecutor(max_workers=10) as executor:
collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
for tasks in collect_tasks:
self.logger.info(tasks.result())
for onion in onions:
self.handle_onion(onion[1])
#self.save_pastie()
#with ThreadPoolExecutor(max_workers=1) as executor:
# collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
# for tasks in collect_tasks:
# self.logger.info(tasks.result())
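
iter_batches, whose body is cut off by the hunk above, is presumably the usual islice batching idiom (the file imports islice for it); a sketch:

from itertools import islice

def iter_batches(data, batch_size):
    # yield successive batch_size-sized lists from any iterable
    data = iter(data)
    while True:
        batch = list(islice(data, batch_size))
        if not batch:
            break
        yield batch

print(list(iter_batches(range(7), 3)))
# -> [[0, 1, 2], [3, 4, 5], [6]]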

@@ -4,6 +4,7 @@ import json
import traceback
from datetime import datetime as dt
from json.decoder import JSONDecodeError
from collections import Counter
import requests
@@ -22,10 +23,10 @@ class Plugin(Operator):
This plugin collects the HTML code from an onion link
"""
def __init__(self, logger, elasticsearch, allowed_sources, **kwargs):
super(Plugin, self).__init__(logger, elasticsearch, allowed_sources)
self.plugin_name = "simple-html"
self.logger.info(f"Initializing {self.plugin_name}")
def __init__(self, logger, **kwargs):
super(Plugin, self).__init__(logger)
self.name = kwargs['name']
self.logger.info(f"Initializing {self.name}")
self.timeout = int(kwargs["timeout"])
self.retries = int(kwargs["retries"])
@@ -83,42 +84,24 @@ class Plugin(Operator):
result = content.text
if result:
html = BeautifulSoup(result, features="lxml")
# testing hardcoded filepath
with open(
"/home/tony/Projects/OnionScraper_v2/onion_master_list.txt",
"w",
) as fp:
for onion in re.findall("([a-z2-7]{16,56}\.onion)", result):
fp.write("%s\n" % onion)
if html:
index = {
"HTML": result,
"title": html.title.text,
"language": detect(html.text),
"date-crawled": dt.utcnow().strftime(
"%Y-%m-%dT%H:%M:%S.%f"
)
+ "Z",
"status": "success",
"interestingKeywords": list(
set(self.interesting.findall(result))
),
"interestingKeywords": Counter(self.interesting.findall(result)),
}
else:
index = {
"HTML": result,
"title": None,
"language": None,
"date-crawled": dt.utcnow().strftime(
"%Y-%m-%dT%H:%M:%S.%f"
)
+ "Z",
"status": "success",
"interestingKeywords": list(
set(self.interesting.findall(result))
),
"interestingKeywords": Counter(self.interesting.findall(result)),
}
return self.response(index, onion, self.plugin_name)
return self.response("success", index)
except requests.exceptions.ConnectionError as connection_error:
self.logger.error(f"Failed connecting to http://{url}")
self.logger.debug(connection_error)
@@ -131,10 +114,10 @@
self.renew_connection()
if retry > self.retries:
self.logger.error("[x] Max retries exceeded")
return self.response({"status": "failure"}, onion, self.plugin_name)
return self.response("failed", None)
def handle_onion(self, db, onion):
content = self.run_sessions(onion)
if content[self.plugin_name]["status"] == "success":
if self._onion_is_allowed(content, db, "HTML"):
self.es.update(db["_id"], content)
def handle_onion(self, onion):
html = self.run_sessions(onion.url)
if html['status'] == 'success':
self._onion_is_allowed(html['content']['HTML'], onion)
onion.simpleHTML(html)
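
The socks5 block from the config turns into a requests session like the one below. The socks5h scheme matters: it pushes DNS resolution through Tor, which is the only way .onion names resolve (PySocks, pinned in requirements.txt, provides the SOCKS support):

import requests

# Route both schemes through the local Tor SOCKS proxy.
session = requests.session()
session.proxies = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050',
}
# Example (requires a Tor daemon listening on port 9050):
# r = session.get('http://example.onion', timeout=300)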

@@ -1,28 +1,14 @@
import re
import os
import sys
import json
import time
import random
import traceback
import subprocess
from uuid import uuid4
from pathlib import Path
from datetime import datetime as dt
from threading import Timer
from json.decoder import JSONDecodeError
from concurrent.futures import ProcessPoolExecutor
from threading import Timer
import requests
from stem.control import Controller
from stem import Signal
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from onionscraper.operators import Operator
from onioningestor.operators import Operator
class Plugin(Operator):
@@ -32,41 +18,13 @@ class Plugin(Operator):
sending artifacts to operators.
"""
def __init__(self, logger, **kwargs):
self.name = kwargs['name']
self.logger = logger
self.logger.info('Initializing OnionScanner')
screenshots = kwargs.pop('screenshots_path', None)
if screenshots:
self.screenshots = Path(screenshots)
else:
self.screenshots = Path(__file__).parents[1]/'screenshots'
self.logger.info(f'Initializing {self.name}')
self.onionscan = kwargs['binpath']
self.timeout = int(kwargs['timeout'])
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.retries = int(kwargs['retries'])
self.headers ={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection':
'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
blacklist = kwargs['blacklist'].split(',')
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
keywords = kwargs['interestingKeywords'].split(',')
self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
self.session = self.get_tor_session()
def response(self, status, content, onion):
"""
status: success/failure
content: dict
onion: str
return: dict
"""
return {'status': status, 'data': content, 'onion': onion}
self.timeout = int(kwargs.get('timeout', 300))
# renew_connection expects a dict holding the Tor control port and password
self.torControl = {'port': 9051, 'password': 'Zue5a29v4xE6FciWpPF93rR2M2T'}
def parseDoc(self, data):
data['onionscan'].pop('simpleReport', None)
@@ -75,74 +33,17 @@ class Plugin(Operator):
data['onionscan']['crawls'] = [*crawls]
data['hiddenService'] = hiddenService
for onion in crawls.keys():
print(onion)
with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp:
fp.write("%s\n" % onion)
#q.enqueue(self.crawl, onion)
#with open('test.json', 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=4)
self.queueCrawl((
3,
self.onion(
url=onion,
source='crawled',
type='domain',
status='offline',
monitor=False,
denylist=False)))
return data
def format_directory(self, directory):
d = dt.now()
year = str(d.year)
month = str(d.month)
# prefix month and day with "0" if it is only one digit
if len(month) < 2:
month = "0" + month
day = str(d.day)
if len(day) < 2:
day = "0" + day
save_path = directory/year/month/day
if not os.path.isdir(save_path):
self.logger.info("[*] Creating directory to save screenshots")
os.makedirs(save_path)
return save_path
def take_screenshot(self, save_path, onion):
binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
fp = webdriver.FirefoxProfile()
fp.set_preference('network.proxy.type', 1)
fp.set_preference('network.proxy.socks', '127.0.0.1')
fp.set_preference('network.proxy.socks_port', 9050)
fp.set_preference('network.proxy.socks_remote_dns', True)
options = Options()
options.headless = True
driver = webdriver.Firefox(
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options,
firefox_profile=fp
)
url = 'http://' + onion
driver.get(url)
uid = str(uuid4()).split('-')[0]
filename = f"{onion}_screenshot_{uid}.png"
f_name = f"{save_path}/{filename}"
driver.save_screenshot(f_name)
driver.quit()
if os.path.isfile(f_name):
self.logger.info(f'[*] Screenshot was taken. {f_name}')
dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'
result = {'dateScreenshoted':dateScreenshoted,'filename':filename}
return self.response("success",result,onion)
else:
self.logger.error('[x] Unable to take screenshot')
return self.response("failure",None,onion)
def get_tor_session(self):
try:
s = requests.session()
s.proxies = self.proxy
s.headers.update(self.headers)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
return s
# signal TOR for a new connection
def renew_connection(self):
with Controller.from_port(port = self.torControl['port']) as controller:
@@ -154,8 +55,7 @@ class Plugin(Operator):
controller.signal(Signal.NEWNYM)
# wait for the new identity to be initialized
time.sleep(controller.get_newnym_wait())
session = self.get_tor_session()
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
self.logger.info(f"IP is {requests.get('http://httpbin.org/ip').json()['origin']}")
def handle_timeout(self, process, onion):
#
@@ -171,33 +71,6 @@ class Plugin(Operator):
self.renew_connection()
return
def run_sessions(self, onion):
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.session.get(url)
if content.status_code == 200:
result = content.json()
except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}')
result = content.text
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
finally:
if result:
return self.response("success",result,onion)
else:
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion)
def run_onionscan(self, onion):
self.logger.info("[*] Running onionscan on %s", onion)
@@ -205,7 +78,7 @@ class Plugin(Operator):
process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
# start the timer and let it run till timeout minutes
process_timer = Timer(300,self.handle_timeout,args=[process,onion])
process_timer = Timer(self.timeout,self.handle_timeout,args=[process,onion])
process_timer.start()
# wait for the onion scan results
@@ -215,47 +88,23 @@
if process_timer.is_alive():
process_timer.cancel()
try:
return self.response("success",json.loads(stdout),onion)
return self.response(
"success",
self.parseDoc(json.loads(stdout)))
except json.decoder.JSONDecodeError:
pass
return self.response(
"success",
self.parseDoc(stdout))
self.logger.info("[!!!] Process timed out for %s", onion)
return self.response("failed",stdout)
return self.response("failure",None, onion)
def handle_onion(self, onion_tuple):
onion = onion_tuple.url
self.logger.info(f'Processing {onion} with onionscan')
def handle_onion(self, onion):
try:
blacklist_URL = self.blacklist.search(onion)
if blacklist_URL:
self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
else:
self.logger.debug("[*] URL blacklist test: PASSED")
results = self.run_onionscan(onion)
if results['status'] == 'success':# and results['data']['webDetected'] == 'true':
content = self.run_sessions(onion)
if content['status'] == 'success':
blacklist_CONTENT = self.blacklist.search(content['data'])
if blacklist_CONTENT:
self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
else:
self.logger.debug("[*] CONTENT blacklist test: PASSED")
screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
self.logger.info("Indexing!")
doc = {
'onionscan':json.loads(results['data']),
'html':content['data'],
'screenshots':screenshot['data'],
'interestingKeywords':self.interestingKeywords.findall(content['data'])
}
return self.parseDoc(doc)
else:
self.logger.info(f"[x] hidden service {onion} is not active")
results = self.run_onionscan(onion.url)
onion.onionscan(results)
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
finally:
pass
#sys.exit(0)
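
run_onionscan's timeout handling is the classic Popen-plus-Timer watchdog; reduced to essentials (sleep stands in for the onionscan binary, so this sketch assumes a POSIX system):

import subprocess
from threading import Timer

def handle_timeout(process):
    # kill the child if it outlives the timeout
    process.kill()

process = subprocess.Popen(['sleep', '10'], stdout=subprocess.PIPE)
watchdog = Timer(2, handle_timeout, args=[process])
watchdog.start()
stdout = process.communicate()[0]  # returns as soon as the child exits
if watchdog.is_alive():
    # the child finished in time: disarm the watchdog
    watchdog.cancel()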

@@ -0,0 +1,261 @@
import re
import os
import sys
import json
import time
import random
import traceback
import subprocess
from uuid import uuid4
from pathlib import Path
from datetime import datetime as dt
from json.decoder import JSONDecodeError
from concurrent.futures import ProcessPoolExecutor
from threading import Timer
import requests
from stem.control import Controller
from stem import Signal
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from onioningestor.operators import Operator
class Plugin(Operator):
"""Onionscan operator.
Runs onionscan against a hidden service, fetches its HTML over Tor,
and screenshots the rendered page.
"""
def __init__(self, logger, **kwargs):
self.logger = logger
self.logger.info('Initializing OnionScanner')
screenshots = kwargs.pop('screenshots_path', None)
if screenshots:
self.screenshots = Path(screenshots)
else:
self.screenshots = Path(__file__).parents[1]/'screenshots'
self.onionscan = kwargs['binpath']
self.timeout = int(kwargs['timeout'])
self.proxy = kwargs['socks5']
self.torControl = kwargs['TorController']
self.retries = int(kwargs['retries'])
self.headers ={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language':'en-US,en;q=0.5',
'DNT': '1', 'Connection':
'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
blacklist = kwargs['blacklist'].split(',')
self.blacklist = re.compile('|'.join([re.escape(word) for word in blacklist]), re.IGNORECASE)
keywords = kwargs['interestingKeywords'].split(',')
self.keywords = re.compile('|'.join([re.escape(word) for word in keywords]), re.IGNORECASE)
self.session = self.get_tor_session()
def response(self, status, content, onion):
"""
status: success/failure
content: dict
onion: str
return: dict
"""
return {'status': status, 'data': content, 'onion': onion}
def parseDoc(self, data):
data['onionscan'].pop('simpleReport', None)
crawls = data['onionscan'].pop('crawls', None)
hiddenService = data['onionscan'].pop('hiddenService', None)
data['onionscan']['crawls'] = [*crawls]
data['hiddenService'] = hiddenService
for onion in crawls.keys():
print(onion)
with open('/home/tony/Projects/OnionScraper_v2/onion_master_list.txt', 'a') as fp:
fp.write("%s\n" % onion)
#q.enqueue(self.crawl, onion)
#with open('test.json', 'w', encoding='utf-8') as f:
# json.dump(data, f, ensure_ascii=False, indent=4)
return data
def format_directory(self, directory):
d = dt.now()
year = str(d.year)
month = str(d.month)
# prefix month and day with "0" if it is only one digit
if len(month) < 2:
month = "0" + month
day = str(d.day)
if len(day) < 2:
day = "0" + day
save_path = directory/year/month/day
if not os.path.isdir(save_path):
self.logger.info("[*] Creating directory to save screenshots")
os.makedirs(save_path)
return save_path
def take_screenshot(self, save_path, onion):
binary = FirefoxBinary('/home/tony/Projects/OnionScraper/geckodriver')
fp = webdriver.FirefoxProfile()
fp.set_preference('network.proxy.type', 1)
fp.set_preference('network.proxy.socks', '127.0.0.1')
fp.set_preference('network.proxy.socks_port', 9050)
fp.set_preference('network.proxy.socks_remote_dns', True)
options = Options()
options.headless = True
driver = webdriver.Firefox(
executable_path='/home/tony/Projects/OnionScraper/geckodriver',
options=options,
firefox_profile=fp
)
url = 'http://' + onion
driver.get(url)
uid = str(uuid4()).split('-')[0]
filename = f"{onion}_screenshot_{uid}.png"
f_name = f"{save_path}/{filename}"
driver.save_screenshot(f_name)
driver.quit()
if os.path.isfile(f_name):
self.logger.info(f'[*] Screenshot was taken. {f_name}')
dateScreenshoted = dt.utcnow().strftime('%Y-%m-%dT%H:%M:%S.%f')+ 'Z'
result = {'dateScreenshoted':dateScreenshoted,'filename':filename}
return self.response("success",result,onion)
else:
self.logger.error('[x] Unable to take screenshot')
return self.response("failure",None,onion)
def get_tor_session(self):
try:
s = requests.session()
s.proxies = self.proxy
s.headers.update(self.headers)
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
return s
# signal TOR for a new connection
def renew_connection(self):
with Controller.from_port(port = self.torControl['port']) as controller:
# Now we switch TOR identities to make sure we have a good connection
self.logger.info('Getting new Tor IP')
# authenticate to our local TOR controller
controller.authenticate(self.torControl['password'])
# send the signal for a new identity
controller.signal(Signal.NEWNYM)
# wait for the new identity to be initialized
time.sleep(controller.get_newnym_wait())
session = self.get_tor_session()
self.logger.info(f"IP is {session.get('http://httpbin.org/ip').json()['origin']}")
def handle_timeout(self, process, onion):
#
# Handle a timeout from the onionscan process.
#
try:
# kill the onionscan process
process.kill()
self.logger.info("[!!!] Killed the onionscan process.")
except Exception:
pass
self.renew_connection()
return
def run_sessions(self, onion):
retry = 0
result = None
while True:
try:
url = 'http://'+onion
self.logger.info(url)
content = self.session.get(url)
if content.status_code == 200:
result = content.json()
except JSONDecodeError as e:
self.logger.debug(f'JSONDecodeError {e}')
result = content.text
except Exception as e:
self.logger.error(e)
self.logger.debug(traceback.print_exc())
finally:
if result:
return self.response("success",result,onion)
else:
self.logger.info('[x] No results found retrying ...')
retry += 1
self.renew_connection()
if retry > self.retries:
self.logger.error('[x] Max retries exceeded')
return self.response("failure",None, onion)
def run_onionscan(self, onion):
self.logger.info("[*] Running onionscan on %s", onion)
# fire up onionscan
process = subprocess.Popen([self.onionscan,"--webport=0","--jsonReport","--simpleReport=false",onion],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
# start the timer and let it run till timeout minutes
process_timer = Timer(self.timeout,self.handle_timeout,args=[process,onion])
process_timer.start()
# wait for the onion scan results
stdout = process.communicate()[0]
# we have received valid results so we can kill the timer
if process_timer.is_alive():
process_timer.cancel()
try:
return self.response("success",json.loads(stdout),onion)
except json.decoder.JSONDecodeError:
pass
self.logger.info("[!!!] Process timed out for %s", onion)
return self.response("failure",None, onion)
def handle_onion(self, onion_tuple):
onion = onion_tuple.url
self.logger.info(f'Processing {onion} with onionscan')
try:
blacklist_URL = self.blacklist.search(onion)
if blacklist_URL:
self.logger.info(f"[X] Blocked by blacklist => matched keyword {blacklist_URL.group()}")
else:
self.logger.debug("[*] URL blacklist test: PASSED")
results = self.run_onionscan(onion)
if results['status'] == 'success':# and results['data']['webDetected'] == 'true':
content = self.run_sessions(onion)
if content['status'] == 'success':
blacklist_CONTENT = self.blacklist.search(content['data'])
if blacklist_CONTENT:
self.logger.info(f"[X] Blocked by blacklist content => matched keyword {blacklist_CONTENT.group()}")
else:
self.logger.debug("[*] CONTENT blacklist test: PASSED")
screenshot = self.take_screenshot(self.format_directory(self.screenshots), onion)
self.logger.info("Indexing!")
doc = {
'onionscan':json.loads(results['data']),
'html':content['data'],
'screenshots':screenshot['data'],
'interestingKeywords':self.keywords.findall(content['data'])
}
return self.parseDoc(doc)
else:
self.logger.info(f"[x] hidden service {onion} is not active")
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
finally:
pass
#sys.exit(0)
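
renew_connection in both operator files drives Tor through stem's control protocol; the minimal exchange, assuming a ControlPort on 9051 with password authentication as in the example config:

import time
from stem import Signal
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate(password='your-tor-controller-password')
    controller.signal(Signal.NEWNYM)          # ask Tor for a fresh circuit
    time.sleep(controller.get_newnym_wait())  # honor Tor's NEWNYM rate limit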

@@ -1,5 +1,5 @@
from onionscraper.operators import Operator
from onioningestor.operators import Operator
class Plugin(Operator):

@@ -1,4 +1,5 @@
from collections import namedtuple
from onioningestor.onion import Onion
class Source(object):
"""Base class for all Source plugins.
@@ -7,14 +8,16 @@ class Source(object):
additional methods to child classes, consider prefixing the method name
with an underscore to denote a ``_private_method``.
"""
def __init__(self, name, *args, **kwargs):
def __init__(self, *args, **kwargs):
"""Override this constructor in child classes.
The first argument must always be ``name``.
Other arguments should be url, auth, etc., whatever is needed to set
up the object.
"""
self.onion = namedtuple('onion', ['url','source','type'])
self.onion = Onion
def set_onionQueue(self, queue):
self.onionQueue = queue
def run(self):
"""Run and return ``(saved_state, list(Artifact))``.
@@ -28,14 +31,3 @@
"""
raise NotImplementedError()
def process_element(self, content, reference_link, include_nonobfuscated=False):
"""Take a single source content/url and return a list of Artifacts.
This is the main work block of Source plugins, which handles
IOC extraction and artifact creation.
:param content: String content to extract from.
:param reference_link: Reference link to attach to all artifacts.
:param include_nonobfuscated: Include non-defanged URLs in output?
"""
logger.debug(f"Processing in source '{self.name}'")
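
Under the new queue-based contract, a minimal Source plugin only has to construct Onion objects and put them on the queue handed over by set_onionQueue; a hypothetical example (not part of this commit):

from onioningestor.onion import Onion
from onioningestor.sources import Source

class Plugin(Source):
    """Hypothetical source that feeds a fixed list of onions."""
    def __init__(self, logger, name, **kwargs):
        self.logger = logger
        self.name = name

    def run(self):
        for url in ['exampleexample22.onion']:
            # priority 2: onions found on the clearnet
            self.onionQueue.put((2, Onion(
                url=url, source=self.name, type='domain',
                status='offline', monitor=False, denylist=False)))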

@@ -17,7 +17,7 @@ import time
from bs4 import BeautifulSoup
from onionscraper.sources import Source
from onioningestor.sources import Source
class Plugin(Source):

@@ -17,7 +17,7 @@ import time
from bs4 import BeautifulSoup
from onionscraper.sources import Source
from onioningestor.sources import Source
class Plugin(Source):

@@ -16,7 +16,7 @@ from onioningestor.sources import Source
class Plugin(Source):
def __init__(self, logger, name, filename):
def __init__(self, logger, name, filename, **kwargs):
self.logger = logger
self.name = name
self.filename = filename
@@ -28,10 +28,17 @@ class Plugin(Source):
filepath = Path(__file__).parents[2]/self.filename
with open(filepath, 'r') as fp:
lines = fp.read().splitlines()
# just testing: note this deletes the source file after reading it
os.remove(self.filename)
for onion in lines:
items.append(self.onion(url=onion,source='simple-file',type='domain'))
#yield self.onion(url=onion,source='simple-file',type='domain')
return items
self.onionQueue.put(
(
2,
self.onion(
url=onion,
source=self.name,
type='domain',
status='offline',
monitor=False,
denylist=False)
)
)

@@ -4,11 +4,14 @@ chardet==3.0.4
click==7.1.2
elasticsearch==7.8.0
idna==2.10
langdetect==1.0.8
lxml==4.5.1
PySocks==1.7.1
PyYAML==5.3.1
requests==2.24.0
schedule==0.6.0
selenium==3.141.0
six==1.15.0
soupsieve==2.0.1
stem==1.8.0
urllib3==1.25.9