#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Andrey Glauzer'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Andrey Glauzer"
__status__ = "Development"
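
# Collect .onion addresses mentioned on Reddit: query the Pushshift API for
# comments in the r/onions subreddit, open each comment's permalink, follow
# outbound links, and extract .onion URLs from the linked pages' text.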

import json
import re
from random import choice

import requests
from bs4 import BeautifulSoup


class Reddit:
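    """Collect .onion URLs from r/onions comments and the pages they link to."""
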
    def __init__(self):
        self.session = requests.session()
        self.source = 'Reddit'
        # Pushshift search endpoint for comments in the r/onions subreddit.
        self.url = 'https://api.pushshift.io/reddit/search/comment/?subreddit=onions&limit=1000000'
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']

    @property
    def random_headers(self):
        # Request headers with a User-Agent picked at random from desktop_agents.
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def start(self):
        return self.reddit_json()

    def reddit_json(self):
        print('Getting Reddit API information')
        onionurl = []
        try:
            request = self.session.get(self.url, headers=self.random_headers)
            loaded_json = json.loads(request.content)

            print('Filtering the URLs that have the word .onion in the text')
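            # Each Pushshift hit carries a permalink back to the Reddit thread.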
            for data in loaded_json['data']:
                reddit_url = 'https://www.reddit.com{}'.format(
                    data['permalink'])
                try:
                    request = self.session.get(
                        reddit_url, headers=self.random_headers)
                    soup = BeautifulSoup(request.content, features="lxml")

                    # Only follow external links marked rel="nofollow".
                    for raw in soup.findAll('a', {'rel': 'nofollow'}):
                        if 'https://' in raw['href']:
                            raw_text = self.raw(url=raw['href'])
                            if raw_text is not None:
                                print('Applying REGEX. Wait...')
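                                # Match a 12-50 character .onion hostname,
                                # optionally preceded by a short subdomain label.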
                                regex = re.compile(
                                    r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

                                for lines in raw_text.split('\n'):
                                    # Strip soft hyphens, spaces, tabs, and URL
                                    # schemes so the pattern can match the bare
                                    # hostname.
                                    rurls = lines \
                                        .replace('\xad', '') \
                                        .replace('\n', '') \
                                        .replace("http://", '') \
                                        .replace("https://", '') \
                                        .replace(' ', '') \
                                        .replace('\t', '')

                                    xurl = regex.match(rurls)
                                    if xurl is not None:
                                        onionurl.append(xurl.group())

                except (requests.exceptions.ConnectionError,
                        requests.exceptions.ChunkedEncodingError,
                        requests.exceptions.ReadTimeout,
                        requests.exceptions.InvalidURL) as e:
                    print('Could not connect to the URL because an error occurred.\n{e}'.format(e=e))

        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidURL) as e:
            print('Could not connect to the URL because an error occurred.\n{e}'.format(e=e))

        return onionurl

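    # Fetch a page and return its visible text, or None on failure.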
    def raw(self, url):
        try:
            if url is not None:
                request = self.session.get(url, headers=self.random_headers)
                print('Connecting in {url} - {status}'.format(
                    url=url, status=request.status_code))

                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")
                    # Drop script and style elements before extracting text.
                    for s in soup(['script', 'style']):
                        s.decompose()

                    return ' '.join(soup.stripped_strings)

        except (requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ReadTimeout,
                requests.exceptions.TooManyRedirects):
            # Ignore connection failures; callers treat None as "no text".
            pass


if __name__ == '__main__':
    app = Reddit()
    app.start()