#!/usr/bin/python3
# -*- coding: utf-8 -*-

__author__ = 'Daniele Perera'
__license__ = "MIT"
__version__ = "1.0.1"
__maintainer__ = "Daniele Perera"
__status__ = "Development"

import json
import re
import time
import urllib.parse
from random import choice

import requests
from bs4 import BeautifulSoup

from onionscraper.sources import Source


class Plugin(Source):
    """Collects .onion addresses from GitHub Gist search results."""

    def __init__(self, logger, name, url):
        self.logger = logger
        self.name = name
        self.url = url
        self.desktop_agents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0']
        super().__init__(self)

    def run(self):
        self.logger.info('Starting Gist Scraper')
        self.cookies()
        self.pagination()
        self.scraping()
        return self.raw()

    @property
    def random_headers(self):
        return {
            'User-Agent': choice(self.desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }

    def cookies(self):
        # Open the session that the later pagination/scraping/raw steps reuse,
        # so GitHub's cookies are kept between requests.
        self.logger.info('Setting GIST cookies')
        self.session = requests.Session()
        self.headers = self.random_headers

        request = self.session.get(self.url, headers=self.headers)
        if request.status_code != 200:
            self.logger.error('No Response from GIST')

    def pagination(self):
        # Search Gists for '.onio' and collect the result pages to visit.
        request = self.session.get(
            f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}",
            headers=self.headers)
        self.soup = BeautifulSoup(request.content, features="lxml")

        pages = []
        self.urls = [self.url]
        try:
            for pagination in self.soup.find('div', {'class': 'pagination'}).findAll('a'):
                pages.append(pagination.get_text())
        except AttributeError:
            # No pagination bar on the results page.
            pages = False

        if pages:
            cont = 2
            # The bound is stubbed out while in development; int(pages[-2])
            # is the full page count.
            while cont <= 1:  # int(pages[-2]):
                cont += 1
                full_url = f"https://gist.github.com/search?l=Text&p={cont-1}&q={urllib.parse.quote('.onio')}"
                self.urls.append(full_url)

    def scraping(self):
        # First pass: walk the search result pages and collect links to the
        # gists whose snippets mention '.onion'.
        url = []
        for inurl in self.urls:
            self.logger.info(f"Connecting to {inurl}")
            time.sleep(5)
            request = self.session.get(inurl, headers=self.headers)

            if request.status_code == 200:
                soup = BeautifulSoup(request.content, features="lxml")
                for code in soup.findAll('div', {'class': 'gist-snippet'}):
                    if '.onion' in code.get_text().lower():
                        for raw in code.findAll('a', {'class': 'link-overlay'}):
                            try:
                                url.append(raw['href'])
                            except KeyError:
                                pass

        # Second pass: visit each gist and collect its raw-file URLs.
        self.urls_raw = []
        for get in url:
            self.logger.info(f"Connecting to {get}")
            time.sleep(5)
            try:
                request = self.session.get(get, headers=self.headers)

                if request.status_code == 200:
                    soup = BeautifulSoup(request.content, features="lxml")

                    for raw in soup.findAll('a', {'class': 'btn btn-sm'}):
                        try:
                            gist_url = f"https://gist.githubusercontent.com{raw['href']}"
                            self.urls_raw.append(gist_url)
                        except KeyError:
                            pass
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.InvalidURL) as e:
                self.logger.error(
                    f"I was unable to connect to the url, because an error occurred.\n{e}")

    def raw(self):
        # Download the raw .txt/.csv files and extract .onion domains.
        self.logger.info('Performing replaces and regex. WAIT...')
        itens = []
        onions = []
        for raw in self.urls_raw:
            if '.txt' in raw.lower() or '.csv' in raw.lower():
                time.sleep(5)
                request = self.session.get(raw, headers=self.headers)
                self.soup = BeautifulSoup(request.content, features="lxml")
                for pre in self.soup.findAll('body'):
                    text_lines = pre.get_text().split('\n')
                    itens.extend(text_lines)

        regex = re.compile(
            r"[A-Za-z0-9]{0,12}\.?[A-Za-z0-9]{12,50}\.onion")

        for lines in itens:
            rurls = lines \
                .replace('\xad', '') \
                .replace('\n', '') \
                .replace("http://", '') \
                .replace("https://", '') \
                .replace("www.", "")

            url = regex.match(rurls)

            if url is not None:
                onions.append(self.onion(url=url.group(), source='gist', type='domain'))
        return onions
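

# Minimal usage sketch: it assumes the onionscraper package is importable and
# that the Source base class needs no setup beyond what Plugin.__init__ above
# performs. The logger name and start URL are illustrative only.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    example_plugin = Plugin(
        logger=logging.getLogger('gist-scraper'),
        name='gist',
        url=f"https://gist.github.com/search?l=Text&q={urllib.parse.quote('.onio')}",
    )
    for onion in example_plugin.run():
        print(onion)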