From 0ef098069e0592b62924196582359357b024b992 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 28 Oct 2020 20:47:42 -0400 Subject: [PATCH] Add tor and http/socks proxy support (#137) * Add tor and http/socks proxy support Allows users to enable/disable tor from the config menu, which will forward all requests through Tor. Also adds support for setting environment variables for alternative proxy support. Setting the following variables will forward requests through the proxy: - WHOOGLE_PROXY_USER (optional) - WHOOGLE_PROXY_PASS (optional) - WHOOGLE_PROXY_TYPE (required) - Can be "http", "socks4", or "socks5" - WHOOGLE_PROXY_LOC (required) - Format: ":" See #30 * Refactor acquire_tor_conn -> acquire_tor_identity Also updated travis CI to set up tor * Add check for Tor socket on init, improve Tor error handling Initializing the app sends a heartbeat request to Tor to check for availability, and updates the home page config options accordingly. This heartbeat is sent on every request, to ensure Tor support can be reconfigured without restarting the entire app. If Tor support is enabled, and a subsequent request fails, then a new TorError exception is raised, and the Tor feature is disabled until a valid connection is restored. The max attempts has been updated to 10, since 5 seemed a bit too low for how quickly the attempts go by. * Change send_tor_signal arg type, update function doc send_tor_signal now accepts a stem.Signal arg (a bit cleaner tbh). Also added the doc string for the "disable" attribute in TorError. * Fix tor identity logic in Request.send * Update proxy init, change proxyloc var name Proxy is now only initialized if both type and location are specified, as neither have a default fallback and both are required. I suppose the type could fall back to http, but seems safer this way. Also refactored proxyurl -> proxyloc for the runtime args in order to match the Dockerfile args. * Add tor/proxy support for Docker builds, fix opensearch/init The Dockerfile is now updated to include support for Tor configuration, with a working torrc file included in the repo. An issue with opensearch was fixed as well, which was uncovered during testing and was simple enough to fix here. Likewise, DDG bang gen was updated to only ever happen if the file didn't exist previously, as testing with the file being regenerated every time was tedious. * Add missing "@" for socks proxy requests --- Dockerfile | 19 +++++- README.md | 3 +- app/__init__.py | 10 ++- app/models/config.py | 1 + app/request.py | 126 ++++++++++++++++++++++++++++++++--- app/routes.py | 44 ++++++++++-- app/static/js/controller.js | 2 +- app/templates/error.html | 1 + app/templates/index.html | 10 +++ app/templates/opensearch.xml | 8 +-- app/utils/routing_utils.py | 10 ++- rc/torrc | 8 +++ requirements.txt | 17 ++++- 13 files changed, 228 insertions(+), 31 deletions(-) create mode 100644 rc/torrc diff --git a/Dockerfile b/Dockerfile index 61f77b2..0882bad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,13 @@ FROM python:3.8-slim WORKDIR /usr/src/app -RUN apt-get update && apt-get install -y build-essential libcurl4-openssl-dev libssl-dev +RUN apt-get update && apt-get install -y \ + build-essential \ + libcurl4-openssl-dev \ + libssl-dev \ + tor + +COPY rc/torrc /etc/tor/torrc COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt @@ -15,6 +21,15 @@ ENV WHOOGLE_USER=$username ARG password='' ENV WHOOGLE_PASS=$password +ARG proxyuser='' +ENV WHOOGLE_PROXY_USER=$proxyuser +ARG proxypass='' +ENV WHOOGLE_PROXY_PASS=$proxypass +ARG proxytype='' +ENV WHOOGLE_PROXY_TYPE=$proxytype +ARG proxyloc='' +ENV WHOOGLE_PROXY_LOC=$proxyloc + ARG use_https='' ENV HTTPS_ONLY=$use_https @@ -25,4 +40,4 @@ COPY . . EXPOSE $EXPOSE_PORT -CMD ["./run"] +CMD service tor start && ./run diff --git a/README.md b/README.md index 2547daf..699d613 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Contents - No AMP links - No URL tracking tags (i.e. utm=%s) - No referrer header +- Tor and HTTP/SOCKS proxy support - Autocomplete/search suggestions - POST request search and suggestion queries (when possible) - View images at full res without site redirect (currently mobile only) @@ -35,7 +36,7 @@ Contents - Optional location-based searching (i.e. results near \) - Optional NoJS mode to disable all Javascript in results -*If deployed to a remote server +*If deployed to a remote server, or configured to send requests through a VPN, Tor, proxy, etc. ## Dependencies If using Heroku Quick Deploy, **you can skip this section**. diff --git a/app/__init__.py b/app/__init__.py index 820edb0..a349acc 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,8 +1,10 @@ +from app.request import send_tor_signal from app.utils.session_utils import generate_user_keys from app.utils.gen_ddg_bangs import gen_bangs_json from flask import Flask from flask_session import Session import os +from stem import Signal app = Flask(__name__, static_folder=os.path.dirname(os.path.abspath(__file__)) + '/static') app.user_elements = {} @@ -25,11 +27,15 @@ if not os.path.exists(app.config['CONFIG_PATH']): if not os.path.exists(app.config['SESSION_FILE_DIR']): os.makedirs(app.config['SESSION_FILE_DIR']) -# (Re)generate DDG bang filter, and create path if it doesn't exist yet +# Generate DDG bang filter, and create path if it doesn't exist yet if not os.path.exists(app.config['BANG_PATH']): os.makedirs(app.config['BANG_PATH']) -gen_bangs_json(app.config['BANG_FILE']) +if not os.path.exists(app.config['BANG_FILE']): + gen_bangs_json(app.config['BANG_FILE']) Session(app) +# Attempt to acquire tor identity, to determine if Tor config is available +send_tor_signal(Signal.HEARTBEAT) + from app import routes diff --git a/app/models/config.py b/app/models/config.py index 2fb4088..6cfe18f 100644 --- a/app/models/config.py +++ b/app/models/config.py @@ -305,6 +305,7 @@ class Config: self.safe = False self.dark = False self.nojs = False + self.tor = False self.near = '' self.alts = False self.new_tab = False diff --git a/app/request.py b/app/request.py index 4abb9b3..04ae3db 100644 --- a/app/request.py +++ b/app/request.py @@ -1,8 +1,12 @@ +from app.models.config import Config from lxml import etree import random import requests -from requests import Response +from requests import Response, ConnectionError import urllib.parse as urlparse +import os +from stem import Signal, SocketError +from stem.control import Controller # Core Google search URLs SEARCH_URL = 'https://www.google.com/search?gbv=1&q=' @@ -15,7 +19,36 @@ DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr'] -def gen_user_agent(is_mobile): +class TorError(Exception): + """Exception raised for errors in Tor requests. + + Attributes: + message -- a message describing the error that occurred + disable -- optionally disables Tor in the user config (note: + this should only happen if the connection has been dropped + altogether). + """ + + def __init__(self, message, disable=False): + self.message = message + self.disable = disable + super().__init__(self.message) + + +def send_tor_signal(signal: Signal) -> bool: + try: + with Controller.from_port(port=9051) as c: + c.authenticate() + c.signal(signal) + os.environ['TOR_AVAILABLE'] = '1' + return True + except (SocketError, ConnectionRefusedError, ConnectionError): + os.environ['TOR_AVAILABLE'] = '0' + + return False + + +def gen_user_agent(is_mobile) -> str: mozilla = random.choice(['Moo', 'Woah', 'Bro', 'Slow']) + 'zilla' firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' @@ -26,7 +59,7 @@ def gen_user_agent(is_mobile): return DESKTOP_UA.format(mozilla, linux, firefox) -def gen_query(query, args, config, near_city=None): +def gen_query(query, args, config, near_city=None) -> str: param_dict = {key: '' for key in VALID_PARAMS} # Use :past(hour/day/week/month/year) if available @@ -85,15 +118,56 @@ def gen_query(query, args, config, near_city=None): class Request: - def __init__(self, normal_ua, language='lang_en'): - self.language = language + """Class used for handling all outbound requests, including search queries, + search suggestions, and loading of external content (images, audio, etc). + + Attributes: + normal_ua -- the user's current user agent + root_path -- the root path of the whoogle instance + config -- the user's current whoogle configuration + """ + def __init__(self, normal_ua, root_path, config: Config): + # Send heartbeat to Tor, used in determining if the user can or cannot + # enable Tor for future requests + send_tor_signal(Signal.HEARTBEAT) + + self.language = config.lang_search self.mobile = 'Android' in normal_ua or 'iPhone' in normal_ua self.modified_user_agent = gen_user_agent(self.mobile) + # Set up proxy, if previously configured + if os.environ.get('WHOOGLE_PROXY_LOC'): + auth_str = '' + if os.environ.get('WHOOGLE_PROXY_USER'): + auth_str = os.environ.get('WHOOGLE_PROXY_USER') + \ + ':' + os.environ.get('WHOOGLE_PROXY_PASS') + self.proxies = { + 'http': os.environ.get('WHOOGLE_PROXY_TYPE') + '://' + + auth_str + '@' + os.environ.get('WHOOGLE_PROXY_LOC'), + } + self.proxies['https'] = self.proxies['http'].replace('http', 'https') + else: + self.proxies = { + 'http': 'socks5://127.0.0.1:9050', + 'https': 'socks5://127.0.0.1:9050' + } if config.tor else {} + self.tor = config.tor + self.tor_valid = False + self.root_path = root_path + def __getitem__(self, name): return getattr(self, name) - def autocomplete(self, query): + def autocomplete(self, query) -> list: + """Sends a query to Google's search suggestion service + + Args: + query: The in-progress query to send + + Returns: + list: The list of matches for possible search suggestions + + """ ac_query = dict(hl=self.language, q=query) response = self.send(base_url=AUTOCOMPLETE_URL, query=urlparse.urlencode(ac_query)).text @@ -103,9 +177,45 @@ class Request: return [] - def send(self, base_url=SEARCH_URL, query='') -> Response: + def send(self, base_url=SEARCH_URL, query='', attempt=0) -> Response: + """Sends an outbound request to a URL. Optionally sends the request using Tor, if + enabled by the user. + + Args: + base_url: The URL to use in the request + query: The optional query string for the request + attempt: The number of attempts made for the request (used for cycling + through Tor identities, if enabled) + + Returns: + Response: The Response object returned by the requests call + + """ headers = { 'User-Agent': self.modified_user_agent } - return requests.get(base_url + query, headers=headers) + # Validate Tor connection and request new identity if the last one failed + if self.tor and not send_tor_signal(Signal.NEWNYM if attempt > 0 else Signal.HEARTBEAT): + raise TorError("Tor was previously enabled, but the connection has been dropped. Please check your " + + "Tor configuration and try again.", disable=True) + + # Make sure that the tor connection is valid, if enabled + if self.tor: + tor_check = requests.get('https://check.torproject.org/', proxies=self.proxies, headers=headers) + self.tor_valid = 'Congratulations' in tor_check.text + + if not self.tor_valid: + raise TorError("Tor connection succeeded, but the connection could not be validated by torproject.org", + disable=True) + + response = requests.get(base_url + query, proxies=self.proxies, headers=headers) + + # Retry query with new identity if using Tor (max 10 attempts) + if 'form id="captcha-form"' in response.text and self.tor: + attempt += 1 + if attempt > 10: + raise TorError("Tor query failed -- max attempts exceeded 10") + return self.send(base_url, query, attempt) + + return response diff --git a/app/routes.py b/app/routes.py index da4be87..3916c23 100644 --- a/app/routes.py +++ b/app/routes.py @@ -9,12 +9,12 @@ import uuid from functools import wraps import waitress -from flask import jsonify, make_response, request, redirect, render_template, send_file, session +from flask import jsonify, make_response, request, redirect, render_template, send_file, session, url_for from requests import exceptions from app import app from app.models.config import Config -from app.request import Request +from app.request import Request, TorError from app.utils.session_utils import valid_user_session from app.utils.routing_utils import * @@ -62,13 +62,17 @@ def before_request_func(): if https_only and request.url.startswith('http://'): return redirect(request.url.replace('http://', 'https://', 1), code=308) - + g.user_config = Config(**session['config']) if not g.user_config.url: g.user_config.url = request.url_root.replace('http://', 'https://') if https_only else request.url_root - g.user_request = Request(request.headers.get('User-Agent'), language=g.user_config.lang_search) + g.user_request = Request( + request.headers.get('User-Agent'), + request.url_root, + config=g.user_config) + g.app_location = g.user_config.url @@ -103,11 +107,15 @@ def unknown_page(e): def index(): # Reset keys session['fernet_keys'] = generate_user_keys(g.cookies_disabled) + error_message = session['error_message'] if 'error_message' in session else '' + session['error_message'] = '' return render_template('index.html', languages=Config.LANGUAGES, countries=Config.COUNTRIES, config=g.user_config, + error_message=error_message, + tor_available=int(os.environ.get('TOR_AVAILABLE')), version_number=app.config['VERSION_NUMBER']) @@ -138,7 +146,9 @@ def autocomplete(): elif request.data: q = urlparse.unquote_plus(request.data.decode('utf-8').replace('q=', '')) - return jsonify([q, g.user_request.autocomplete(q)]) + # Return a list of suggestions for the query + # Note: If Tor is enabled, this returns nothing, as the request is almost always rejected + return jsonify([q, g.user_request.autocomplete(q) if not g.user_config.tor else []]) @app.route('/search', methods=['GET', 'POST']) @@ -159,8 +169,14 @@ def search(): return redirect('/') # Generate response and number of external elements from the page - response, elements = search_util.generate_response() - if search_util.feeling_lucky: + try: + response, elements = search_util.generate_response() + except TorError as e: + session['error_message'] = e.message + ("\\n\\nTor config is now disabled!" if e.disable else "") + session['config']['tor'] = False if e.disable else session['config']['tor'] + return redirect(url_for('.index')) + + if search_util.feeling_lucky or elements < 0: return redirect(response, code=303) # Keep count of external elements to fetch before element key can be regenerated @@ -281,6 +297,12 @@ def run_app(): help='Enforces HTTPS redirects for all requests') parser.add_argument('--userpass', default='', metavar='', help='Sets a username/password basic auth combo (default None)') + parser.add_argument('--proxyauth', default='', metavar='', + help='Sets a username/password for a HTTP/SOCKS proxy (default None)') + parser.add_argument('--proxytype', default='', metavar='', + help='Sets a proxy type for all connections (default None)') + parser.add_argument('--proxyloc', default='', metavar='', + help='Sets a proxy location for all connections (default None)') args = parser.parse_args() if args.userpass: @@ -288,6 +310,14 @@ def run_app(): os.environ['WHOOGLE_USER'] = user_pass[0] os.environ['WHOOGLE_PASS'] = user_pass[1] + if args.proxytype and args.proxyloc: + if args.proxyauth: + proxy_user_pass = args.proxyauth.split(':') + os.environ['WHOOGLE_PROXY_USER'] = proxy_user_pass[0] + os.environ['WHOOGLE_PROXY_PASS'] = proxy_user_pass[1] + os.environ['WHOOGLE_PROXY_TYPE'] = args.proxytype + os.environ['WHOOGLE_PROXY_LOC'] = args.proxyloc + os.environ['HTTPS_ONLY'] = '1' if args.https_only else '' if args.debug: diff --git a/app/static/js/controller.js b/app/static/js/controller.js index 156a84d..8775122 100644 --- a/app/static/js/controller.js +++ b/app/static/js/controller.js @@ -1,6 +1,6 @@ // Whoogle configurations that use boolean values and checkboxes CONFIG_BOOLS = [ - "nojs", "dark", "safe", "alts", "new_tab", "get_only" + "nojs", "dark", "safe", "alts", "new_tab", "get_only", "tor" ]; // Whoogle configurations that use string values and input fields diff --git a/app/templates/error.html b/app/templates/error.html index 003623d..9546e23 100644 --- a/app/templates/error.html +++ b/app/templates/error.html @@ -3,3 +3,4 @@

Error parsing "{{ query }}"

+Return Home diff --git a/app/templates/index.html b/app/templates/index.html index 4980316..2f996a3 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -29,6 +29,12 @@ Whoogle Search +
@@ -110,6 +116,10 @@
+
+ + +
diff --git a/app/templates/opensearch.xml b/app/templates/opensearch.xml index 8e2e7b2..c1d2898 100644 --- a/app/templates/opensearch.xml +++ b/app/templates/opensearch.xml @@ -5,12 +5,8 @@ Whoogle: A lightweight, deployable Google search proxy for desktop/mobile that removes Javascript, AMP links, and ads UTF-8  - - - - - - + + {{ main_url }}/search diff --git a/app/utils/routing_utils.py b/app/utils/routing_utils.py index c6c960b..a083da2 100644 --- a/app/utils/routing_utils.py +++ b/app/utils/routing_utils.py @@ -7,6 +7,9 @@ from flask import g from typing import Any, Tuple +TOR_BANNER = '

You are using Tor


' + + class RoutingUtils: def __init__(self, request, config, session, cookies_disabled=False): self.request_params = request.args if request.method == 'GET' else request.form @@ -66,10 +69,13 @@ class RoutingUtils: content_filter = Filter(self.session['fernet_keys'], mobile=mobile, config=self.config) full_query = gen_query(self.query, self.request_params, self.config, content_filter.near) - get_body = g.user_request.send(query=full_query).text + get_body = g.user_request.send(query=full_query) # Produce cleanable html soup from response - html_soup = BeautifulSoup(content_filter.reskin(get_body), 'html.parser') + html_soup = BeautifulSoup(content_filter.reskin(get_body.text), 'html.parser') + html_soup.insert(0, BeautifulSoup( + TOR_BANNER, + features='lxml') if g.user_request.tor_valid else BeautifulSoup("", features="lxml")) if self.feeling_lucky: return get_first_link(html_soup), 1 diff --git a/rc/torrc b/rc/torrc new file mode 100644 index 0000000..b162719 --- /dev/null +++ b/rc/torrc @@ -0,0 +1,8 @@ +DataDirectory /var/lib/tor +ControlPort 9051 +CookieAuthentication 1 +DataDirectoryGroupReadable 1 +CookieAuthFileGroupReadable 1 +ExtORPortCookieAuthFileGroupReadable 1 +CacheDirectoryGroupReadable 1 +CookieAuthFile /var/lib/tor/control_auth_cookie diff --git a/requirements.txt b/requirements.txt index ba00e00..508cbcf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,33 @@ +attrs==19.3.0 beautifulsoup4==4.8.2 bs4==0.0.1 +cachelib==0.1 +certifi==2020.4.5.1 cffi==1.13.2 +chardet==3.0.4 Click==7.0 cryptography==3.2 Flask==1.1.1 Flask-Session==0.3.2 +idna==2.9 itsdangerous==1.1.0 Jinja2==2.10.3 lxml==4.5.1 MarkupSafe==1.1.1 +more-itertools==8.3.0 +packaging==20.4 +pluggy==0.13.1 +py==1.8.1 pycparser==2.19 pyOpenSSL==19.1.0 +pyparsing==2.4.7 +PySocks==1.7.1 pytest==5.4.1 python-dateutil==2.8.1 requests==2.23.0 -six==1.14.0 soupsieve==1.9.5 -Werkzeug==0.16.0 +stem==1.8.0 +urllib3==1.25.9 waitress==1.4.3 +wcwidth==0.1.9 +Werkzeug==0.16.0