diff --git a/README.md b/README.md index 1139450..4905bc5 100644 --- a/README.md +++ b/README.md @@ -422,6 +422,8 @@ There are a few optional environment variables available for customizing a Whoog | WHOOGLE_TOR_SERVICE | Enable/disable the Tor service on startup. Default on -- use '0' to disable. | | WHOOGLE_TOR_USE_PASS | Use password authentication for tor control port. | | WHOOGLE_TOR_CONF | The absolute path to the config file containing the password for the tor control port. Default: ./misc/tor/control.conf WHOOGLE_TOR_PASS must be 1 for this to work.| +| WHOOGLE_SHOW_FAVICONS | Show/hide favicons next to search result URLs. Default on. | +| WHOOGLE_UPDATE_CHECK | Enable/disable the automatic daily check for new versions of Whoogle. Default on. | ### Config Environment Variables These environment variables allow setting default config values, but can be overwritten manually by using the home page config menu. These allow a shortcut for destroying/rebuilding an instance to the same config state every time. @@ -663,12 +665,12 @@ A lot of the app currently piggybacks on Google's existing support for fetching | [https://whoogle.lunar.icu](https://whoogle.lunar.icu) | 🇩🇪 DE | Multi-choice | ✅ | | [https://wgl.frail.duckdns.org](https://wgl.frail.duckdns.org) | 🇧🇷 BR | Multi-choice | | | [https://whoogle.no-logs.com](https://whoogle.no-logs.com/) | 🇸🇪 SE | Multi-choice | | -| [https://search.rubberverse.xyz](https://search.rubberverse.xyz) | 🇵🇱 PL | English | | | [https://whoogle.ftw.lol](https://whoogle.ftw.lol) | 🇩🇪 DE | Multi-choice | | | [https://whoogle-search--replitcomreside.repl.co](https://whoogle-search--replitcomreside.repl.co) | 🇺🇸 US | English | | | [https://search.notrustverify.ch](https://search.notrustverify.ch) | 🇨🇭 CH | Multi-choice | | | [https://whoogle.datura.network](https://whoogle.datura.network) | 🇩🇪 DE | Multi-choice | | | [https://whoogle.yepserver.xyz](https://whoogle.yepserver.xyz) | 🇺🇦 UA | Multi-choice | | +| [https://search.nezumi.party](https://search.nezumi.party) | 🇮🇹 IT | Multi-choice | | * A checkmark in the "Cloudflare" category here refers to the use of the reverse proxy, [Cloudflare](https://cloudflare.com). The checkmark will not be listed for a site which uses Cloudflare DNS but rather the proxying service which grants Cloudflare the ability to monitor traffic to the website. diff --git a/app/filter.py b/app/filter.py index 29aafd1..ff529fb 100644 --- a/app/filter.py +++ b/app/filter.py @@ -29,9 +29,12 @@ unsupported_g_pages = [ 'google.com/preferences', 'google.com/intl', 'advanced_search', - 'tbm=shop' + 'tbm=shop', + 'ageverification.google.co.kr' ] +unsupported_g_divs = ['google.com/preferences?hl=', 'ageverification.google.co.kr'] + def extract_q(q_str: str, href: str) -> str: """Extracts the 'q' element from a result link. This is typically @@ -245,7 +248,9 @@ class Filter: None (The soup object is modified directly) """ # Skip empty, parentless, or internal links - if not link or not link.parent or not link['href'].startswith('http'): + show_favicons = read_config_bool('WHOOGLE_SHOW_FAVICONS', True) + is_valid_link = link and link.parent and link['href'].startswith('http') + if not show_favicons or not is_valid_link: return parent = link.parent @@ -558,7 +563,7 @@ class Filter: link['href'] = link_netloc parent = link.parent - if 'google.com/preferences?hl=' in link_netloc: + if any(divlink in link_netloc for divlink in unsupported_g_divs): # Handle case where a search is performed in a different # language than what is configured. This usually returns a # div with the same classes as normal search results, but with diff --git a/app/routes.py b/app/routes.py index 3fcd0cf..4efc343 100644 --- a/app/routes.py +++ b/app/routes.py @@ -135,7 +135,8 @@ def before_request_func(): # Check for latest version if needed now = datetime.now() - if now - timedelta(hours=24) > app.config['LAST_UPDATE_CHECK']: + needs_update_check = now - timedelta(hours=24) > app.config['LAST_UPDATE_CHECK'] + if read_config_bool('WHOOGLE_UPDATE_CHECK', True) and needs_update_check: app.config['LAST_UPDATE_CHECK'] = now app.config['HAS_UPDATE'] = check_for_update( app.config['RELEASES_URL'], @@ -608,6 +609,26 @@ def page_not_found(e): return render_template('error.html', error_message=str(e)), 404 +@app.errorhandler(Exception) +def internal_error(e): + query = '' + if request.method == 'POST': + query = request.form.get('q') + else: + query = request.args.get('q') + + localization_lang = g.user_config.get_localization_lang() + translation = app.config['TRANSLATIONS'][localization_lang] + return render_template( + 'error.html', + error_message='Internal server error (500)', + translation=translation, + farside='https://farside.link', + config=g.user_config, + query=urlparse.unquote(query), + params=g.user_config.to_params(keys=['preferences'])), 500 + + def run_app() -> None: parser = argparse.ArgumentParser( description='Whoogle Search console runner') @@ -626,6 +647,11 @@ def run_app() -> None: default='', metavar='', help='Listen for app on unix socket instead of host:port') + parser.add_argument( + '--unix-socket-perms', + default='600', + metavar='', + help='Octal permissions to use for the Unix domain socket (default 600)') parser.add_argument( '--debug', default=False, @@ -677,7 +703,7 @@ def run_app() -> None: if args.debug: app.run(host=args.host, port=args.port, debug=args.debug) elif args.unix_socket: - waitress.serve(app, unix_socket=args.unix_socket) + waitress.serve(app, unix_socket=args.unix_socket, unix_socket_perms=args.unix_socket_perms) else: waitress.serve( app, diff --git a/app/static/css/search.css b/app/static/css/search.css index 12c467d..30a1797 100644 --- a/app/static/css/search.css +++ b/app/static/css/search.css @@ -71,7 +71,7 @@ details summary span { padding-right: 5px; } -.sCuL3 { +.has-favicon .sCuL3 { padding-left: 30px; } diff --git a/app/templates/error.html b/app/templates/error.html index d302270..99e87b3 100644 --- a/app/templates/error.html +++ b/app/templates/error.html @@ -20,21 +20,86 @@


- {% if blocked is defined %}

{{ translation['continue-search'] }}

- Whoogle: -
- - {{farside}}/whoogle/search?q={{query}} - -

- Searx: -
- - {{farside}}/searx/search?q={{query}} - + +
+

Other options:

+
- {% endif %}

Return Home diff --git a/app/utils/misc.py b/app/utils/misc.py index d5fa5e6..20705bc 100644 --- a/app/utils/misc.py +++ b/app/utils/misc.py @@ -56,8 +56,8 @@ def gen_file_hash(path: str, static_file: str) -> str: return filename_split[0] + '.' + file_hash + filename_split[-1] -def read_config_bool(var: str) -> bool: - val = os.getenv(var, '0') +def read_config_bool(var: str, default: bool=False) -> bool: + val = os.getenv(var, '1' if default else '0') # user can specify one of the following values as 'true' inputs (all # variants with upper case letters will also work): # ('true', 't', '1', 'yes', 'y') diff --git a/app/utils/results.py b/app/utils/results.py index 42654e9..c78f866 100644 --- a/app/utils/results.py +++ b/app/utils/results.py @@ -12,7 +12,7 @@ import re import warnings SKIP_ARGS = ['ref_src', 'utm'] -SKIP_PREFIX = ['//www.', '//mobile.', '//m.', 'www.', 'mobile.', 'm.'] +SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] GOOG_STATIC = 'www.gstatic.com' G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif' GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' @@ -152,11 +152,12 @@ def get_first_link(soup: BeautifulSoup) -> str: return '' -def get_site_alt(link: str) -> str: +def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str: """Returns an alternative to a particular site, if one is configured Args: - link: A string result URL to check against the SITE_ALTS map + link: A string result URL to check against the site_alts map + site_alts: A map of site alternatives to replace with. defaults to SITE_ALTS Returns: str: An updated (or ignored) result link @@ -178,9 +179,9 @@ def get_site_alt(link: str) -> str: # "https://medium.com/..." should match, but "philomedium.com" should not) hostcomp = f'{parsed_link.scheme}://{hostname}' - for site_key in SITE_ALTS.keys(): + for site_key in site_alts.keys(): site_alt = f'{parsed_link.scheme}://{site_key}' - if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]: + if not hostname or site_alt not in hostcomp or not site_alts[site_key]: continue # Wikipedia -> Wikiless replacements require the subdomain (if it's @@ -193,9 +194,8 @@ def get_site_alt(link: str) -> str: elif 'medium' in hostname and len(subdomain) > 0: hostname = f'{subdomain}.{hostname}' - parsed_alt = urlparse.urlparse(SITE_ALTS[site_key]) - link = link.replace(hostname, SITE_ALTS[site_key]) + params - + parsed_alt = urlparse.urlparse(site_alts[site_key]) + link = link.replace(hostname, site_alts[site_key]) + params # If a scheme is specified in the alternative, this results in a # replaced link that looks like "https://http://altservice.tld". # In this case, we can remove the original scheme from the result @@ -205,9 +205,12 @@ def get_site_alt(link: str) -> str: for prefix in SKIP_PREFIX: if parsed_alt.scheme: - link = link.replace(prefix, '') + # If a scheme is specified, remove everything before the + # first occurence of it + link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}' else: - link = link.replace(prefix, '//') + # Otherwise, replace the first occurrence of the prefix + link = link.replace(prefix, '//', 1) break return link diff --git a/app/version.py b/app/version.py index c061f7e..d3675b7 100644 --- a/app/version.py +++ b/app/version.py @@ -4,4 +4,4 @@ optional_dev_tag = '' if os.getenv('DEV_BUILD'): optional_dev_tag = '.dev' + os.getenv('DEV_BUILD') -__version__ = '0.8.3' + optional_dev_tag +__version__ = '0.8.4' + optional_dev_tag diff --git a/charts/whoogle/Chart.yaml b/charts/whoogle/Chart.yaml index 07dfcd4..b10c349 100644 --- a/charts/whoogle/Chart.yaml +++ b/charts/whoogle/Chart.yaml @@ -3,7 +3,7 @@ name: whoogle description: A self hosted search engine on Kubernetes type: application version: 0.1.0 -appVersion: 0.8.3 +appVersion: 0.8.4 icon: https://github.com/benbusby/whoogle-search/raw/main/app/static/img/favicon/favicon-96x96.png diff --git a/misc/instances.txt b/misc/instances.txt index 95927b2..3d64425 100644 --- a/misc/instances.txt +++ b/misc/instances.txt @@ -1,6 +1,7 @@ https://search.albony.xyz https://search.garudalinux.org https://search.dr460nf1r3.org +https://search.nezumi.party https://s.tokhmi.xyz https://search.sethforprivacy.com https://whoogle.dcs0.hu @@ -15,7 +16,6 @@ https://whoogle2.ungovernable.men https://whoogle3.ungovernable.men https://wgl.frail.duckdns.org https://whoogle.no-logs.com -https://search.rubberverse.xyz https://whoogle.ftw.lol https://whoogle-search--replitcomreside.repl.co https://search.notrustverify.ch diff --git a/requirements.txt b/requirements.txt index e0966ac..90642a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,13 +7,13 @@ cffi==1.15.1 chardet==5.1.0 click==8.1.3 cryptography==3.3.2; platform_machine == 'armv7l' -cryptography==41.0.4; platform_machine != 'armv7l' +cryptography==41.0.6; platform_machine != 'armv7l' cssutils==2.6.0 defusedxml==0.7.1 Flask==2.3.2 idna==3.4 itsdangerous==2.1.2 -Jinja2==3.1.2 +Jinja2==3.1.3 MarkupSafe==2.1.2 more-itertools==9.0.0 packaging==23.0 @@ -29,9 +29,9 @@ python-dateutil==2.8.2 requests==2.31.0 soupsieve==2.4 stem==1.8.1 -urllib3==1.26.17 +urllib3==1.26.18 validators==0.22.0 waitress==2.1.2 wcwidth==0.2.6 -Werkzeug==2.3.3 +Werkzeug==3.0.1 python-dotenv==0.21.1 diff --git a/setup.cfg b/setup.cfg index 01bdec7..6e61f45 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,6 +27,7 @@ install_requires= python-dotenv requests stem + validators waitress [options.extras_require] diff --git a/test/test_results.py b/test/test_results.py index 63ae159..ad0fd3e 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -2,6 +2,7 @@ from bs4 import BeautifulSoup from app.filter import Filter from app.models.config import Config from app.models.endpoint import Endpoint +from app.utils import results from app.utils.session import generate_key from datetime import datetime from dateutil.parser import ParserError, parse @@ -95,13 +96,13 @@ def test_view_my_ip(client): def test_recent_results(client): times = { - 'past year': 365, - 'past month': 31, - 'past week': 7 + 'tbs=qdr:y': 365, + 'tbs=qdr:m': 31, + 'tbs=qdr:w': 7 } for time, num_days in times.items(): - rv = client.get(f'/{Endpoint.search}?q=test :' + time) + rv = client.get(f'/{Endpoint.search}?q=test&' + time) result_divs = get_search_results(rv.data) current_date = datetime.now() @@ -136,3 +137,22 @@ def test_leading_slash_search(client): continue assert link['href'].startswith(f'{Endpoint.search}') + + +def test_site_alt_prefix_skip(): + # Ensure prefixes are skipped correctly for site alts + + # default silte_alts (farside.link) + assert results.get_site_alt(link = 'https://www.reddit.com') == 'https://farside.link/libreddit' + assert results.get_site_alt(link = 'https://www.twitter.com') == 'https://farside.link/nitter' + assert results.get_site_alt(link = 'https://www.youtube.com') == 'https://farside.link/invidious' + + test_site_alts = { + 'reddit.com': 'reddit.endswithmobile.domain', + 'twitter.com': 'https://twitter.endswithm.domain', + 'youtube.com': 'http://yt.endswithwww.domain', + } + # Domains with part of SKIP_PREFIX in them + assert results.get_site_alt(link = 'https://www.reddit.com', site_alts = test_site_alts) == 'https://reddit.endswithmobile.domain' + assert results.get_site_alt(link = 'https://www.twitter.com', site_alts = test_site_alts) == 'https://twitter.endswithm.domain' + assert results.get_site_alt(link = 'https://www.youtube.com', site_alts = test_site_alts) == 'http://yt.endswithwww.domain'