#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Daniele Perera'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Daniele Perera"
__status__ = "Development"

import json
import re
import time
import urllib.parse
from random import choice

import requests
from bs4 import BeautifulSoup

from onionscraper.sources import Source


class Plugin(Source):
    """Collects .onion addresses from GitHub Gist search results."""

    def __init__(self, logger, name, url):
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        super().__init__(self)

    def run(self):
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        # Open the session that the later pagination/scraping/raw steps reuse,
        # so GitHub's cookies are kept between requests.
        self.logger.info('Setting GIST cookies')
        self.session = requests.Session()
        self.headers = self.random_headers

        request = self.session.get(self.url, headers=self.headers)
        if request.status_code != 200:
            self.logger.error('No Response from GIST')

    def pagination(self):
        # Search Gists for '.onio' and collect the result pages to visit.
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}",
            headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        pages = []
        self.urls = [self.url]
        try:
            for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'):
                pages.append(pagination.get_text())
        except AttributeError:
            # No pagination bar on the results page.
            pages = False

        if pages:
            cont = 2
            # The bound is stubbed out while in development; int(pages[-2])
            # is the full page count.
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        # First pass: walk the search result pages and collect links to the
        # gists whose snippets mention '.onion'.
        url = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)

            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                for code in soup.findAll('div', {'class': 'gist-snippet'}):
                    if '.onion' in code.get_text().lower():
                        for raw in code.findAll('a', {'class': 'link-overlay'}):
                            try:
                                url.append(raw['href'])
                            except KeyError:
                                pass

        # Second pass: visit each gist and collect its raw-file URLs.
        self.urls_raw = []
        for get in url:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)

                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                        try:
                            gist_url = f"https://gist.githubusercontent.com{raw['href']}"
                            self.urls_raw.append(gist_url)
                        except KeyError:
                            pass
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"I was unable to connect to the url, because an error occurred.\n{e}")

    def raw(self):
        # Download the raw .txt/.csv files and extract .onion domains.
        self.logger.info('Performing replaces and regex. WAIT...')
        itens = []
        onions = []
        for raw in self.urls_raw:
            if '.txt' in raw.lower() or '.csv' in raw.lower():
                time.sleep(5)
                request = self.session.get(raw, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    text_lines = pre.get_text().split('\n')
                    itens.extend(text_lines)

        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)

            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
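

# Minimal usage sketch: it assumes the onionscraper package is importable and
# that the Source base class needs no setup beyond what Plugin.__init__ above
# performs. The logger name and start URL are illustrative only.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    example_plugin = Plugin(
        logger=logging.getLogger('gist-scraper'),
        name='gist',
        url=f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}",
    )
    for onion in example_plugin.run():
        print(onion)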