From 9317d9217f6d2ac39938ee402b9330f54f2f87dd Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Wed, 13 Apr 2022 11:29:07 -0600 Subject: [PATCH] Support proxying results through Whoogle (aka "anonymous view") (#682) * Expand `/window` endpoint to behave like a proxy The `/window` endpoint was previously used as a type of proxy, but only for removing Javascript from the result page. This expands the existing functionality to allow users to proxy search result pages (with or without Javascript) through their Whoogle instance. * Implement filtering of remote content from css * Condense NoJS feature into Anonymous View Enabling NoJS now removes Javascript from the Anonymous View, rather than creating a separate option. * Exclude 'data:' urls from filter, add translations The 'data:' url must be allowed in results to view certain elements on the page, such as stars for review based results. Add translations for the remaining languages. * Add cssutils to requirements --- app/filter.py | 80 +++++++++++++++++--- app/models/config.py | 5 +- app/routes.py | 66 ++++++++++++++--- app/static/css/dark-theme.css | 5 ++ app/static/css/light-theme.css | 5 ++ app/static/settings/translations.json | 102 +++++++++++++++++--------- app/templates/index.html | 4 + app/utils/misc.py | 12 +++ app/utils/results.py | 28 ++++++- app/utils/search.py | 2 + requirements.txt | 1 + 11 files changed, 255 insertions(+), 55 deletions(-) diff --git a/app/filter.py b/app/filter.py index 6b4e284..8931ffb 100644 --- a/app/filter.py +++ b/app/filter.py @@ -2,11 +2,12 @@ from app.models.config import Config from app.models.endpoint import Endpoint from app.models.g_classes import GClasses from app.request import VALID_PARAMS, MAPS_URL -from app.utils.misc import read_config_bool +from app.utils.misc import get_abs_url, read_config_bool from app.utils.results import * from bs4 import BeautifulSoup from bs4.element import ResultSet, Tag from cryptography.fernet import Fernet +import cssutils from flask import render_template import re import urllib.parse as urlparse @@ -53,17 +54,50 @@ def clean_query(query: str) -> str: return query[:query.find('-site:')] if '-site:' in query else query +def clean_css(css: str, page_url: str) -> str: + """Removes all remote URLs from a CSS string. + + Args: + css: The CSS string + + Returns: + str: The filtered CSS, with URLs proxied through Whoogle + """ + sheet = cssutils.parseString(css) + urls = cssutils.getUrls(sheet) + + for url in urls: + abs_url = get_abs_url(url, page_url) + if abs_url.startswith('data:'): + continue + css = css.replace( + url, + f'/element?type=image/png&url={abs_url}' + ) + + return css + + class Filter: # Limit used for determining if a result is a "regular" result or a list # type result (such as "people also asked", "related searches", etc) RESULT_CHILD_LIMIT = 7 - def __init__(self, user_key: str, config: Config, mobile=False) -> None: + def __init__( + self, + user_key: str, + config: Config, + root_url='', + page_url='', + mobile=False) -> None: self.config = config self.mobile = mobile self.user_key = user_key + self.root_url = root_url + self.page_url = page_url self.main_divs = ResultSet('') self._elements = 0 + self._av = set() def __getitem__(self, name): return getattr(self, name) @@ -89,6 +123,7 @@ class Filter: self.remove_block_titles() self.remove_block_url() self.collapse_sections() + self.update_css(soup) self.update_styling(soup) self.remove_block_tabs(soup) @@ -264,7 +299,7 @@ class Filter: # enabled parent.decompose() - def update_element_src(self, element: Tag, mime: str) -> None: + def update_element_src(self, element: Tag, mime: str, attr='src') -> None: """Encrypts the original src of an element and rewrites the element src to use the "/element?src=" pass-through. @@ -272,10 +307,12 @@ class Filter: None (The soup element is modified directly) """ - src = element['src'] + src = element[attr].split(' ')[0] if src.startswith('//'): src = 'https:' + src + elif src.startswith('data:'): + return if src.startswith(LOGO_URL): # Re-brand with Whoogle logo @@ -287,9 +324,29 @@ class Filter: element['src'] = BLANK_B64 return - element['src'] = f'{Endpoint.element}?url=' + self.encrypt_path( - src, - is_element=True) + '&type=' + urlparse.quote(mime) + element[attr] = f'{self.root_url}/{Endpoint.element}?url=' + ( + self.encrypt_path( + src, + is_element=True + ) + '&type=' + urlparse.quote(mime) + ) + + def update_css(self, soup) -> None: + """Updates URLs used in inline styles to be proxied by Whoogle + using the /element endpoint. + + Returns: + None (The soup element is modified directly) + + """ + # Filter all