2021-05-27 16:01:57 +00:00
|
|
|
import os
|
2021-10-21 16:42:31 +00:00
|
|
|
import re
|
2021-05-27 16:01:57 +00:00
|
|
|
from typing import Any
|
2022-06-24 16:51:15 +00:00
|
|
|
from app.filter import Filter
|
2021-10-21 16:42:31 +00:00
|
|
|
from app.request import gen_query
|
2022-07-05 16:01:47 +00:00
|
|
|
from app.utils.misc import get_proxy_host_url
|
2022-06-24 16:51:15 +00:00
|
|
|
from app.utils.results import get_first_link
|
2020-12-17 21:06:47 +00:00
|
|
|
from bs4 import BeautifulSoup as bsoup
|
2020-06-02 18:54:47 +00:00
|
|
|
from cryptography.fernet import Fernet, InvalidToken
|
|
|
|
from flask import g
|
2021-05-27 16:01:57 +00:00
|
|
|
|
Add tor and http/socks proxy support (#137)
* Add tor and http/socks proxy support
Allows users to enable/disable tor from the config menu, which will
forward all requests through Tor.
Also adds support for setting environment variables for alternative
proxy support. Setting the following variables will forward requests
through the proxy:
- WHOOGLE_PROXY_USER (optional)
- WHOOGLE_PROXY_PASS (optional)
- WHOOGLE_PROXY_TYPE (required)
- Can be "http", "socks4", or "socks5"
- WHOOGLE_PROXY_LOC (required)
- Format: "<ip address>:<port>"
See #30
* Refactor acquire_tor_conn -> acquire_tor_identity
Also updated travis CI to set up tor
* Add check for Tor socket on init, improve Tor error handling
Initializing the app sends a heartbeat request to Tor to check for
availability, and updates the home page config options accordingly. This
heartbeat is sent on every request, to ensure Tor support can be
reconfigured without restarting the entire app.
If Tor support is enabled, and a subsequent request fails, then a new
TorError exception is raised, and the Tor feature is disabled until a
valid connection is restored.
The max attempts has been updated to 10, since 5 seemed a bit too low
for how quickly the attempts go by.
* Change send_tor_signal arg type, update function doc
send_tor_signal now accepts a stem.Signal arg (a bit cleaner tbh). Also
added the doc string for the "disable" attribute in TorError.
* Fix tor identity logic in Request.send
* Update proxy init, change proxyloc var name
Proxy is now only initialized if both type and location are specified,
as neither have a default fallback and both are required. I suppose the
type could fall back to http, but seems safer this way.
Also refactored proxyurl -> proxyloc for the runtime args in order to
match the Dockerfile args.
* Add tor/proxy support for Docker builds, fix opensearch/init
The Dockerfile is now updated to include support for Tor configuration,
with a working torrc file included in the repo.
An issue with opensearch was fixed as well, which was uncovered during
testing and was simple enough to fix here. Likewise, DDG bang gen was
updated to only ever happen if the file didn't exist previously, as
testing with the file being regenerated every time was tedious.
* Add missing "@" for socks proxy requests
2020-10-29 00:47:42 +00:00
|
|
|
# Banner markup that Search.generate_response inserts at the top of the
# parsed results when the current request reports a valid Tor connection
TOR_BANNER = '<hr><h1 style="text-align: center">You are using Tor</h1><hr>'
|
2021-03-21 01:51:24 +00:00
|
|
|
# Substring that has_captcha looks for in a results page to decide whether
# the page contains a captcha element
CAPTCHA = 'div class="g-recaptcha"'
|
Add tor and http/socks proxy support (#137)
* Add tor and http/socks proxy support
Allows users to enable/disable tor from the config menu, which will
forward all requests through Tor.
Also adds support for setting environment variables for alternative
proxy support. Setting the following variables will forward requests
through the proxy:
- WHOOGLE_PROXY_USER (optional)
- WHOOGLE_PROXY_PASS (optional)
- WHOOGLE_PROXY_TYPE (required)
- Can be "http", "socks4", or "socks5"
- WHOOGLE_PROXY_LOC (required)
- Format: "<ip address>:<port>"
See #30
* Refactor acquire_tor_conn -> acquire_tor_identity
Also updated travis CI to set up tor
* Add check for Tor socket on init, improve Tor error handling
Initializing the app sends a heartbeat request to Tor to check for
availability, and updates the home page config options accordingly. This
heartbeat is sent on every request, to ensure Tor support can be
reconfigured without restarting the entire app.
If Tor support is enabled, and a subsequent request fails, then a new
TorError exception is raised, and the Tor feature is disabled until a
valid connection is restored.
The max attempts has been updated to 10, since 5 seemed a bit too low
for how quickly the attempts go by.
* Change send_tor_signal arg type, update function doc
send_tor_signal now accepts a stem.Signal arg (a bit cleaner tbh). Also
added the doc string for the "disable" attribute in TorError.
* Fix tor identity logic in Request.send
* Update proxy init, change proxyloc var name
Proxy is now only initialized if both type and location are specified,
as neither have a default fallback and both are required. I suppose the
type could fall back to http, but seems safer this way.
Also refactored proxyurl -> proxyloc for the runtime args in order to
match the Dockerfile args.
* Add tor/proxy support for Docker builds, fix opensearch/init
The Dockerfile is now updated to include support for Tor configuration,
with a working torrc file included in the repo.
An issue with opensearch was fixed as well, which was uncovered during
testing and was simple enough to fix here. Likewise, DDG bang gen was
updated to only ever happen if the file didn't exist previously, as
testing with the file being regenerated every time was tedious.
* Add missing "@" for socks proxy requests
2020-10-29 00:47:42 +00:00
|
|
|
|
|
|
|
|
2021-01-23 19:50:30 +00:00
|
|
|
def needs_https(url: str) -> bool:
    """Checks if the current instance needs to be upgraded to HTTPS

    Note that all Heroku instances are available by default over HTTPS, but
    do not automatically set up a redirect when visited over HTTP.

    The HTTPS_ONLY environment variable requests the upgrade for every
    plain-HTTP url. It counts as disabled when unset, empty, or set to one
    of '0', 'false', 'no' or 'off' (case-insensitive); any other value
    enables it.

    Args:
        url: The instance url (may include a path and/or query string)

    Returns:
        bool: True/False representing the need to upgrade

    """
    from urllib.parse import urlsplit

    # Only plain-HTTP urls can ever need an upgrade
    if not url.startswith('http://'):
        return False

    # bool() of any non-empty string is True, so values such as "0" or
    # "false" have to be recognised explicitly; otherwise they would
    # switch HTTPS_ONLY *on*.
    https_only_raw = os.getenv('HTTPS_ONLY', '').strip().lower()
    if https_only_raw not in ('', '0', 'false', 'no', 'off'):
        return True

    # Compare against the hostname instead of the tail of the whole url,
    # so a trailing slash, path or query string cannot hide a Heroku host
    # (and a query string ending in ".herokuapp.com" cannot fake one).
    try:
        host = urlsplit(url).hostname or ''
    except ValueError:
        # Malformed network location (e.g. unbalanced IPv6 brackets)
        return False

    return host.endswith('.herokuapp.com')
|
|
|
|
|
|
|
|
|
2021-03-24 19:13:52 +00:00
|
|
|
def has_captcha(results: str) -> bool:
    """Reports whether a results page contains the captcha marker

    Args:
        results: The search page html as a string

    Returns:
        bool: True when the CAPTCHA marker substring occurs in the page,
              False otherwise

    """
    captcha_found = CAPTCHA in results
    return captcha_found
|
2021-03-21 01:51:24 +00:00
|
|
|
|
|
|
|
|
2021-03-08 17:22:04 +00:00
|
|
|
class Search:
    """Search query preprocessor - used before submitting the query or
    redirecting to another site

    Instances also support dict-style access (obj['name']), which is mapped
    directly onto attribute access.

    Attributes:
        request: the incoming flask request
        request_params: the request's args for GET requests, otherwise its
            form data
        user_agent: the request's User-Agent header (None if not provided)
        feeling_lucky: True once a query starting with '! ' has been parsed
        config: the current user config settings
        session_key: the flask user fernet key
        query: the parsed query string ('' until new_search_query is run)
        cookies_disabled: the flag passed to the constructor, stored as-is
        search_type: the value of the 'tbm' request param ('' if absent)
    """

    def __init__(self, request, config, session_key, cookies_disabled=False):
        method = request.method
        self.request = request
        self.request_params = request.args if method == 'GET' else request.form
        self.user_agent = request.headers.get('User-Agent')
        self.feeling_lucky = False
        self.config = config
        self.session_key = session_key
        self.query = ''
        self.cookies_disabled = cookies_disabled
        self.search_type = self.request_params.get('tbm', '')

    def __getitem__(self, name) -> Any:
        return getattr(self, name)

    def __setitem__(self, name, value) -> None:
        return setattr(self, name, value)

    def __delitem__(self, name) -> None:
        return delattr(self, name)

    def __contains__(self, name) -> bool:
        return hasattr(self, name)

    def new_search_query(self) -> str:
        """Parses a plaintext query into a valid string for submission

        Also decrypts the query string, if encrypted (in the case of
        paginated results).

        Side effects: on a non-empty query, updates self.feeling_lucky and
        self.query. An empty or missing 'q' param leaves both untouched.

        Returns:
            str: A valid query string ('' if no query was provided)

        """
        q = self.request_params.get('q')
        if not q:
            return ''

        # Attempt to decrypt if this is an internal link
        try:
            q = Fernet(self.session_key).decrypt(q.encode()).decode()
        except InvalidToken:
            # Not a token encrypted with this key: use the value as
            # provided (deliberate best-effort, not an error)
            pass

        # Strip leading '! ' for "feeling lucky" queries
        self.feeling_lucky = q.startswith('! ')
        self.query = q[2:] if self.feeling_lucky else q
        return self.query

    def generate_response(self) -> str:
        """Generates a response for the user's query

        Uses self.query, so new_search_query should be run beforehand.
        Stores the generated query string on self.full_query.

        Returns:
            str: A string response to the search query, in the form of a URL
                 or string representation of HTML content.

        """
        from urllib.parse import urlencode

        # The User-Agent header may be missing, in which case it is None
        user_agent = self.user_agent or ''
        mobile = 'Android' in user_agent or 'iPhone' in user_agent

        # reconstruct url if X-Forwarded-Host header present
        root_url = get_proxy_host_url(
            self.request,
            self.request.url_root,
            root=True)

        content_filter = Filter(self.session_key,
                                root_url=root_url,
                                mobile=mobile,
                                config=self.config,
                                query=self.query)
        full_query = gen_query(self.query,
                               self.request_params,
                               self.config)
        self.full_query = full_query

        # force mobile search when view image is true and
        # the request is not already made by a mobile
        view_image = ('tbm=isch' in full_query
                      and self.config.view_image
                      and not g.user_request.mobile)

        get_body = g.user_request.send(query=full_query,
                                       force_mobile=view_image)

        # Produce cleanable html soup from response.
        # Already-escaped angle brackets are swapped for the 'andlt;' /
        # 'andgt;' placeholders before parsing. Only the *entities* may be
        # replaced: replacing the raw '<' / '>' characters would remove
        # every tag from the markup and leave the parser nothing to find.
        # NOTE(review): presumably the placeholders are converted back
        # further downstream - confirm.
        get_body_safed = get_body.text.replace(
            '&lt;', 'andlt;').replace('&gt;', 'andgt;')
        html_soup = bsoup(get_body_safed, 'html.parser')

        # Replace current soup if view_image is active
        if view_image:
            html_soup = content_filter.view_image(html_soup)

        # Indicate whether or not a Tor connection is active
        if g.user_request.tor_valid:
            html_soup.insert(0, bsoup(TOR_BANNER, 'html.parser'))

        if self.feeling_lucky:
            return get_first_link(html_soup)

        formatted_results = content_filter.clean(html_soup)

        # Append user config to all search links, if available.
        # The values come from the request, so they are url-encoded to
        # keep characters such as '&', '#' or spaces from corrupting (or
        # injecting extra params into) the generated links.
        safe_params = {
            k: v
            for k, v in self.request_params.to_dict(flat=True).items()
            if self.config.is_safe_key(k)
        }
        param_str = ('&' + urlencode(safe_params)) if safe_params else ''

        for link in formatted_results.find_all('a', href=True):
            link['rel'] = "nofollow noopener noreferrer"

            # Only links where 'search?' starts at index 0 or 1 of the href
            # (e.g. 'search?...' or '/search?...') receive the params
            if link['href'].find('search?') not in (0, 1):
                continue
            link['href'] += param_str

        return str(formatted_results)

    def check_kw_ip(self) -> 're.Match | None':
        """Checks for keywords related to 'my ip' in the query

        The lowercased query is searched for "my ip" / "my internet
        protocol", optionally followed by "a" or a spelling of "address"
        ("addres", "address", "adres", "adress") at the end of the query.

        Returns:
            re.Match: the match object if the keywords were found,
                      otherwise None (usable directly as a truth value)

        """
        return re.search("([^a-z0-9]|^)my *[^a-z0-9] *(ip|internet protocol)" +
                         "($|( *[^a-z0-9] *(((addres|address|adres|" +
                         "adress)|a)? *$)))", self.query.lower())
|