fixed some major issues

pull/6/head
danieleperera 4 years ago
parent f593c7366d
commit da4ac91b6c

@ -4,7 +4,7 @@ import queue
import traceback import traceback
import threading import threading
import collections import collections
from queue import Queue from queue import Queue, Empty
from itertools import islice from itertools import islice
from . import config from . import config
@ -71,92 +71,77 @@ class Ingestor:
self.logger.debug(traceback.print_exc()) self.logger.debug(traceback.print_exc())
sys.exit(1) sys.exit(1)
def iter_batches(self, data, batch_size): def collect_sources(self):
data = iter(data) self.logger.debug("Initializing sources")
while True: for name, collect, kwargs in self.config.sources():
batch = list(islice(data, batch_size)) # Run the source to collect onion links from clear net.
if len(batch) == 0: self.logger.info(f"Running source '{name}'")
break try:
yield batch # get the generator of onions
source = collect(self.logger, **kwargs)
def process(self, onions): source.set_onionQueue(self.queue) #priority 2
t = source.run()
self.threads.append(t)
#self.logger.info(f'Starting of thread: {t.currentThread().name}')
#t.start()
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue
def process(self, onion):
for operator in self.operators: for operator in self.operators:
self.logger.info(f"Processing found onions with operator '{operator}'") self.logger.info(f"Processing found onions with operator '{operator}'")
# Set CrawlQueue for every operator # Set CrawlQueue for every operator
self.operators[operator].set_crawlQueue(self.queue) self.operators[operator].set_crawlQueue(self.queue)
# Process list of onions # Process list of onions
self.operators[operator].process(onions) self.operators[operator].process(onion)
def run(self): def run(self):
"""Run once, or forever, depending on config.""" """Run once, or forever, depending on config."""
self.run_once() if self.config.daemon():
#if self.config.daemon(): self.logger.info("Running forever, in a loop")
# self.logger.info("Running forever, in a loop") self.run_forever()
# self.run_forever() else:
#else: self.logger.info("Running once, to completion")
# self.logger.info("Running once, to completion") self.run_once()
# self.run_once()
def run_once(self): def run_once(self):
"""Run each source once, passing artifacts to each operator.""" """Run each source once, passing artifacts to each operator."""
# Start collecting sources # Start collecting sources
self.collect_sources() # self.collect_sources()
# Sources will fill various queues # Sources will fill various queues
# MonitorQueue has priority high # MonitorQueue has priority high
# OnionQueue are those found in clearnet medium # OnionQueue are those found in clearnet medium
# crawlQueue are those found crawling onionlinks low # crawlQueue are those found crawling onionlinks low
onions = list(self.queue.queue)
done = False done = False
if onions: while not done:
while not done: try:
try: onion = self.queue.get(True, 5)
## Process onions with each operator. ## Process onions with each operator.
for batched_onions in self.iter_batches(onions, batch_size=10): self.process(onion)
self.process(batched_onions) ## Save Onions for each storage
## Save Onions for each storage self.storage.save_pastie(onion[1], 30)
for onion in batched_onions: except Empty:
self.storage.save_pastie(onion[1], 30) self.logger.info('Queue is empty')
done = True done = True
except Exception as e: except Exception as e:
self.logger.error(e) self.logger.error(e)
self.logger.error(traceback.print_exc()) self.logger.error(traceback.print_exc())
break break
except KeyboardInterrupt: except KeyboardInterrupt:
print('') print('')
self.logger.info("Ctrl-c received! Sending kill to threads...") self.logger.info("Ctrl-c received! Sending kill to threads...")
for t in self.threads: for t in self.threads:
t.kill_received = True t.kill_received = True
self.logger.info('Exiting') self.logger.info('Exiting')
sys.exit(0) sys.exit(0)
else:
for t in self.threads:
t.kill_received = True
self.logger.info(f"Sleeping for {self.config.sleep()} seconds")
time.sleep(self.config.sleep())
def run_forever(self): def run_forever(self):
"""Run forever, sleeping for the configured interval between each run.""" """Run forever, sleeping for the configured interval between each run."""
while True: while True:
self.run_once() self.run_once()
self.logger.debug(f"Sleeping for {self.config.sleep()} seconds")
def collect_sources(self): time.sleep(self.config.sleep())
self.logger.debug("Initializing sources")
for name, collect, kwargs in self.config.sources():
# Run the source to collect onion links from clear net.
self.logger.info(f"Running source '{name}'")
try:
# get the generator of onions
source = collect(self.logger, **kwargs)
source.set_onionQueue(self.queue) #priority 2
t = source.run()
self.threads.append(t)
#self.logger.info(f'Starting of thread: {t.currentThread().name}')
#t.start()
except Exception as e:
self.logger.error(e)
self.logger.error(traceback.print_exc())
continue

@ -60,9 +60,9 @@ class Config:
return self.config["general"]["TorController"] return self.config["general"]["TorController"]
def monitorQueue(self): def monitorQueue(self):
fp = self.config["monitor"].get("filename", False) fp = Path(self.config["monitor"].get("filename", "this_File_Does_notExsit"))
q = PriorityQueue(maxsize=0) q = PriorityQueue(maxsize=0)
if fp: if fp.is_file():
with open(fp, 'r') as f: with open(fp, 'r') as f:
monitorOnions = f.read().splitlines() monitorOnions = f.read().splitlines()
for monitor in monitorOnions: for monitor in monitorOnions:
@ -77,7 +77,7 @@ class Config:
denylist=False))) denylist=False)))
return q return q
else: else:
return None return q
def logging(self): def logging(self):
"""Returns logging config dictionary.""" """Returns logging config dictionary."""

@ -115,16 +115,9 @@ class Operator:
monitor=False, monitor=False,
denylist=False))) denylist=False)))
def collect(self, onions): def process(self, onion):
for onion in onions:
self.logger.info(f'thread function processing {onion[1]}')
self.handle_onion(onion[1])
def process(self, onions):
"""Process all applicable onions.""" """Process all applicable onions."""
for onion in onions: self.handle_onion(onion[1])
self.handle_onion(onion[1])
#with ThreadPoolExecutor(max_workers=1) as executor: #with ThreadPoolExecutor(max_workers=1) as executor:
# collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)] # collect_tasks = [executor.submit(self.collect, files_batch) for files_batch in self.iter_batches(onions, batch_size=10)]
# for tasks in collect_tasks: # for tasks in collect_tasks:

@ -34,8 +34,8 @@ class Plugin(Operator):
hiddenService = data.pop('hiddenService', None) hiddenService = data.pop('hiddenService', None)
data['crawls'] = [*crawls] data['crawls'] = [*crawls]
try: try:
if data['linkedOnions']: if data['identifierReport'].get('linkedOnions', False):
self.findCrawls(data['linkedOnions'], hiddenService) self.findCrawls(data['identifierReport']['linkedOnions'], hiddenService)
except KeyError as e: except KeyError as e:
pass pass
return data return data

Loading…
Cancel
Save