Improve public instance session management (#480)
This introduces a new approach to handling user sessions, which should
allow for users to set more reliable config settings on public instances.
Previously, when a user with cookies disabled would update their config,
this would modify the app's default config file, which would in turn
cause new users to inherit these settings when visiting the app for the
first time and cause users to inherit these settings when their current
session cookie expired (which was after 30 days by default I believe).
There was also some half-baked logic for determining on the backend
whether or not a user had cookies disabled, which led to some issues
with out-of-control session file creation by Flask.
Now, when a user visits the site, their initial request is forwarded to
a session/<session id> endpoint, and during that subsequent request
their current session id is matched against the one found in the url. If
the ids match, the user has cookies enabled. If not, their original
request is modified with a 'cookies_disabled' query param that tells
Flask not to bother trying to set up a new session for that user, and
instead just use the app's fallback Fernet key for encryption and the
default config.
Since attempting to create a session for a user with cookies disabled
creates a new session file, there is now also a clean-up routine included
in the new session decorator, which will remove all sessions that don't
include a valid key in the dict. NOTE!!! This means that current user
sessions on public instances will be cleared once this update is merged
in. In the long run that's a good thing though, since this will allow session
mgmt to be a lot more reliable overall for users regardless of their cookie
preference.
Individual user sessions still use a unique Fernet key for encrypting queries,
but users with cookies disabled will use the default app key for encryption
and decryption.
Sessions are also now (semi)permanent and have a lifetime of 1 year.
2021-11-18 02:35:30 +00:00
|
|
|
from app.models.endpoint import Endpoint
|
2021-10-26 22:15:24 +00:00
|
|
|
from bs4 import BeautifulSoup, NavigableString
|
2021-11-01 21:34:59 +00:00
|
|
|
import html
|
2020-12-05 22:01:21 +00:00
|
|
|
import os
|
2020-07-26 17:53:59 +00:00
|
|
|
import urllib.parse as urlparse
|
|
|
|
from urllib.parse import parse_qs
|
2021-10-26 20:59:23 +00:00
|
|
|
import re
|
2020-07-26 17:53:59 +00:00
|
|
|
|
|
|
|
# Query-string argument names that filter_link_args removes from result links
SKIP_ARGS = ['ref_src', 'utm']
# URL prefixes that get_site_alt collapses to a bare '//' once a link has
# been pointed at an alternative site
SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
# Host name and logo path fragments for Google's static assets
# NOTE(review): not referenced in the visible part of this file; presumably
# used elsewhere to recognise Google branding images -- confirm
GOOG_STATIC = 'www.gstatic.com'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
LOGO_URL = GOOG_IMG + '_desk'
# Base64-encoded PNG expressed as a data URI
# NOTE(review): judging by the name, a blank placeholder image -- confirm
BLANK_B64 = ('data:image/png;base64,'
             'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw'
             'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC')
|
2020-07-26 17:53:59 +00:00
|
|
|
|
2020-12-17 21:06:47 +00:00
|
|
|
# Ad keywords
# has_ad_content compares an element's full text against each of these
# entries, ignoring case
BLACKLIST = [
    'ad', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', 'Reklama',
    'Реклама', 'Anunț', '광고', 'annons', 'Annonse', 'Iklan', '広告', 'Augl.',
    'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', 'آگهی',
    'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés', 'Anúncio'
]
|
|
|
|
|
|
|
|
# Maps a site's domain to the alternative site that get_site_alt swaps in
# for it. Every default can be overridden with the matching WHOOGLE_ALT_*
# environment variable, which is read once when this module is imported.
SITE_ALTS = {
    'twitter.com': os.getenv('WHOOGLE_ALT_TW', 'farside.link/nitter'),
    'youtube.com': os.getenv('WHOOGLE_ALT_YT', 'farside.link/invidious'),
    'instagram.com': os.getenv('WHOOGLE_ALT_IG', 'farside.link/bibliogram/u'),
    'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'),
    # Both of these domains share a single alternative
    **dict.fromkeys([
        'medium.com',
        'levelup.gitconnected.com'
    ], os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe')),
    'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'imgin.voidnet.tech'),
    # NOTE(review): Wikipedia articles are normally served from
    # wikipedia.org, so this key may never match a result link -- confirm
    'wikipedia.com': os.getenv('WHOOGLE_ALT_WIKI', 'wikiless.org')
}
|
|
|
|
|
|
|
|
|
2021-10-26 20:59:23 +00:00
|
|
|
def bold_search_terms(response: str, query: str) -> BeautifulSoup:
    """Emphasises every search term found in the response with <b> tags

    Terms wrapped in quotes are treated as a single phrase, so only the
    exact phrase is made bold.

    Args:
        response: The initial response body for the query
        query: The original search query

    Returns:
        BeautifulSoup: modified soup object with bold items

    """
    soup = BeautifulSoup(response, 'html.parser')

    def embolden(node: NavigableString, term: str) -> None:
        # Text nodes exactly as long as the term are left untouched
        if len(node) == len(term):
            return

        # Terms without any alphanumeric character are never highlighted
        if not re.match('.*[a-zA-Z0-9].*', term):
            return

        # Never rewrite the contents of <style> elements
        if node.parent and node.parent.name == 'style':
            return

        # Case-insensitive match; the captured group keeps the casing that
        # was found in the page
        pattern = fr'\b((?![{{}}<>-]){term}(?![{{}}<>-]))\b'
        marked_up = re.sub(pattern,
                           r'<b>\1</b>',
                           html.escape(node),
                           flags=re.I)
        node.replace_with(BeautifulSoup(marked_up, 'html.parser'))

    # Split the query on whitespace that sits outside of quoted phrases
    terms = re.split(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', query)
    for raw_term in terms:
        # Keep only letters, digits and spaces
        term = re.sub(r'[^A-Za-z0-9 ]+', '', raw_term)
        matcher = re.compile(re.escape(term), re.I)
        for node in soup.find_all(text=matcher):
            embolden(node, term)

    return soup
|
|
|
|
|
|
|
|
|
2021-03-08 17:22:04 +00:00
|
|
|
def has_ad_content(element: str) -> bool:
    """Decides whether an element's text identifies it as an ad

    Args:
        element: The HTML element to inspect

    Returns:
        bool: True/False for the element containing an ad

    """
    # The whole text has to equal an ad keyword, ignoring case
    normalized = element.upper()
    if any(normalized == keyword.upper() for keyword in BLACKLIST):
        return True

    # Otherwise the circled "i" glyph anywhere in the text counts as an ad
    return 'ⓘ' in element
|
2020-07-26 17:53:59 +00:00
|
|
|
|
|
|
|
|
2021-03-08 17:22:04 +00:00
|
|
|
def get_first_link(soup: BeautifulSoup) -> str:
    """Finds the link of the first search result in the query response

    Args:
        soup: The BeautifulSoup response body

    Returns:
        str: The first result link after passing it through
             filter_link_args, or an empty string if no result link exists

    """
    # Result anchors are recognised by the 'url?q=' marker in their href
    result_hrefs = (
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if 'url?q=' in anchor['href']
    )

    first_href = next(result_hrefs, None)
    if first_href is None:
        return ''

    return filter_link_args(first_href)
|
2020-07-26 17:53:59 +00:00
|
|
|
|
|
|
|
|
2021-03-08 17:22:04 +00:00
|
|
|
def get_site_alt(link: str) -> str:
    """Returns an alternative to a particular site, if one is configured

    A link is rewritten only when its hostname is one of the SITE_ALTS
    domains or a subdomain of one. Hostnames that merely contain a
    configured domain as a substring (e.g. "notreddit.com") are left alone.

    Args:
        link: A string result URL to check against the SITE_ALTS map

    Returns:
        str: An updated (or ignored) result link

    """
    # Need to replace full hostname with alternative to encapsulate
    # subdomains as well
    hostname = urlparse.urlparse(link).hostname
    if not hostname:
        # Relative or malformed links have nothing to swap out
        return link

    for site_key, site_alt in SITE_ALTS.items():
        is_site = hostname == site_key or hostname.endswith('.' + site_key)
        if not is_site:
            continue

        link = link.replace(hostname, site_alt)
        for prefix in SKIP_PREFIX:
            link = link.replace(prefix, '//')

        # Only the first matching site is applied
        break

    return link
|
2020-07-26 17:53:59 +00:00
|
|
|
|
|
|
|
|
2021-03-08 17:22:04 +00:00
|
|
|
def filter_link_args(link: str) -> str:
    """Filters out unnecessary URL args from a result link

    Args:
        link: The string result link to check for extraneous URL params

    Returns:
        str: An updated (or ignored) result link

    """
    parsed_link = urlparse.urlparse(link)
    link_args = parse_qs(parsed_link.query)

    # Nothing to filter, so hand the link back untouched
    if not link_args:
        return link

    safe_args = {
        arg: values
        for arg, values in link_args.items()
        if arg not in SKIP_ARGS
    }

    # Rebuild the URL from its parsed components so that only the query is
    # touched: a trailing "#fragment" stays after the query, and text that
    # merely looks like the query elsewhere in the URL is left alone. An
    # empty query also drops the "?" separator.
    filtered_query = urlparse.urlencode(safe_args, doseq=True)
    return parsed_link._replace(query=filtered_query).geturl()
|
|
|
|
|
|
|
|
|
|
|
|
def append_nojs(result: BeautifulSoup) -> None:
    """Appends a no-Javascript alternative for a search result

    Args:
        result: The search result to append a no-JS link to

    Returns:
        None

    """
    nojs_link = BeautifulSoup(features='html.parser').new_tag('a')

    # The target URL travels as a query parameter value, so it has to be
    # percent-encoded; otherwise any '&' or '#' inside it would cut the
    # value short
    location = urlparse.quote(result['href'], safe='')
    nojs_link['href'] = f'/{Endpoint.window}?location={location}'
    nojs_link.string = ' NoJS Link'
    result.append(nojs_link)
|
2021-10-21 16:42:31 +00:00
|
|
|
|
|
|
|
|
|
|
|
def add_ip_card(html_soup: BeautifulSoup, ip: str) -> BeautifulSoup:
    """Adds the client's IP address to the search results
        if query contains keywords

    The soup is returned unchanged whenever one of the elements the card
    depends on cannot be found in the page.

    Args:
        html_soup: The parsed search result containing the keywords
        ip: ip address of the client

    Returns:
        BeautifulSoup

    """
    # Pages that contain an ".EY24We" element never get a card
    if html_soup.select_one(".EY24We"):
        return html_soup

    # The card is only added when the ".OXXup" element reads "all"
    # NOTE(review): presumably the currently selected results tab -- confirm
    tab = html_soup.select_one(".OXXup")
    if tab is None or tab.get_text().lower() != "all":
        return html_soup

    # Finding the element before which the IP card would be placed
    f_link = html_soup.select_one(".BNeawe.vvjwJb.AP7Wnd")
    if f_link is None:
        return html_soup

    ref_element = f_link.find_parent(class_="ZINbbc xpd O9g5cc uUPGi")
    if ref_element is None:
        return html_soup

    # HTML IP card tag
    ip_tag = html_soup.new_tag("div")
    ip_tag["class"] = "ZINbbc xpd O9g5cc uUPGi"

    # For IP Address html tag
    ip_address = html_soup.new_tag("div")
    ip_address["class"] = "kCrYT ip-address-div"
    ip_address.string = ip

    # Text below the IP address
    ip_text = html_soup.new_tag("div")
    ip_text.string = "Your public IP address"
    ip_text["class"] = "kCrYT ip-text-div"

    # Adding all the above html tags to the IP card
    ip_tag.append(ip_address)
    ip_tag.append(ip_text)

    # Inserting the element
    ref_element.insert_before(ip_tag)
    return html_soup
|
2021-12-07 05:56:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
def check_currency(response: str) -> dict:
    """Check whether the results have currency conversion

    Args:
        response: Search query Result

    Returns:
        dict: Consists of currency names and values, or an empty dict when
              the response has no conversion card or the card cannot be
              parsed

    """
    soup = BeautifulSoup(response, 'html.parser')

    # The conversion card is located through its 'disclaimer' link
    container = soup.find('a', {'href': 'https://g.co/gfd'})
    if not container:
        return {}

    # Climb to the enclosing 'ZINbbc' element, giving up once the top of
    # the document is reached
    while 'ZINbbc' not in container.attrs.get('class', []):
        container = container.parent
        if container is None:
            return {}

    fields = container.find_all(class_='BNeawe')
    if len(fields) < 2:
        return {}

    # Each field is split at the first space into "<amount>" and "<label>"
    currency1 = fields[0].text.rstrip('=').split(' ', 1)
    currency2 = fields[1].text.split(' ', 1)
    if len(currency1) < 2 or len(currency2) < 2:
        return {}

    # A comma three characters from the end of the second amount means the
    # comma is the decimal separator and '.' groups thousands. The slice is
    # simply empty for amounts shorter than three characters.
    if currency2[0][-3:-2] == ',':
        currency1[0] = currency1[0].replace('.', '').replace(',', '.')
        currency2[0] = currency2[0].replace('.', '').replace(',', '.')
    else:
        currency1[0] = currency1[0].replace(',', '')
        currency2[0] = currency2[0].replace(',', '')

    try:
        value1 = float(currency1[0])
        value2 = float(currency2[0])
    except ValueError:
        # Amounts that are not plain numbers: treat as "no conversion"
        return {}

    return {'currencyValue1': value1,
            'currencyLabel1': currency1[1],
            'currencyValue2': value2,
            'currencyLabel2': currency2[1]
            }
|
|
|
|
|
|
|
|
|
|
|
|
def _build_currency_field(soup: BeautifulSoup,
                          index: int,
                          value: float,
                          label: str,
                          factor: float) -> tuple:
    """Creates one numeric input box and its label for the conversion box

    Args:
        soup: Parsed search result, used to create the new tags
        index: 1 or 2, the position of the field in the conversion box
        value: Initial value of the input box
        label: Currency name shown next to the input box
        factor: Third argument handed to the page's convert() handler

    Returns:
        tuple: The (input, label) pair of new tags

    """
    other = 2 if index == 1 else 1

    input_box = soup.new_tag('input')
    input_box['id'] = f'cb{index}'
    input_box['type'] = 'number'
    input_box['class'] = 'cb'
    input_box['value'] = value
    input_box['oninput'] = f'convert({index}, {other}, {factor})'

    label_box = soup.new_tag('label')
    label_box['for'] = f'cb{index}'
    label_box['class'] = 'cb_label'
    label_box.append(label)

    return input_box, label_box


def add_currency_card(soup: BeautifulSoup,
                      conversion_details: dict) -> BeautifulSoup:
    """Adds the currency conversion boxes
    to response of the search query

    The soup is returned unchanged when either currency value is zero (no
    usable conversion factor exists) or when the insertion point cannot be
    found.

    Args:
        soup: Parsed search result
        conversion_details: Dictionary of currency
        related information

    Returns:
        BeautifulSoup
    """
    value1 = conversion_details['currencyValue1']
    value2 = conversion_details['currencyValue2']

    # Both the factor and its inverse are needed, so neither may be zero
    if not value1 or not value2:
        return soup

    # Element before which the code will be changed
    # (This is the 'disclaimer' link)
    element1 = soup.find('a', {'href': 'https://g.co/gfd'})
    if element1 is None:
        return soup

    # Climb to the enclosing 'nXE3Ob' element, giving up once the top of
    # the document is reached
    while 'nXE3Ob' not in element1.attrs.get('class', []):
        element1 = element1.parent
        if element1 is None:
            return soup

    # Creating the conversion factor
    conversion_factor = value1 / value2

    # Creating a new div for the input boxes
    conversion_box = soup.new_tag('div')
    conversion_box['class'] = 'conversion_box'

    # Currency to be converted from
    input_box1, label_box1 = _build_currency_field(
        soup, 1, value1, conversion_details['currencyLabel1'],
        1 / conversion_factor)

    br = soup.new_tag('br')

    # Currency to be converted to
    input_box2, label_box2 = _build_currency_field(
        soup, 2, value2, conversion_details['currencyLabel2'],
        conversion_factor)

    conversion_box.append(input_box1)
    conversion_box.append(label_box1)
    conversion_box.append(br)
    conversion_box.append(input_box2)
    conversion_box.append(label_box2)

    element1.insert_before(conversion_box)
    return soup
|