Merge branch 'benbusby:main' into main

pull/1134/head
Thomas Fournier 3 months ago committed by GitHub
commit bc78a658e6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -422,6 +422,8 @@ There are a few optional environment variables available for customizing a Whoog
| WHOOGLE_TOR_SERVICE | Enable/disable the Tor service on startup. Default on -- use '0' to disable. |
| WHOOGLE_TOR_USE_PASS | Use password authentication for tor control port. |
| WHOOGLE_TOR_CONF | The absolute path to the config file containing the password for the tor control port. Default: ./misc/tor/control.conf WHOOGLE_TOR_PASS must be 1 for this to work.|
| WHOOGLE_SHOW_FAVICONS | Show/hide favicons next to search result URLs. Default on. |
| WHOOGLE_UPDATE_CHECK | Enable/disable the automatic daily check for new versions of Whoogle. Default on. |
### Config Environment Variables
These environment variables allow setting default config values, but can be overwritten manually by using the home page config menu. These allow a shortcut for destroying/rebuilding an instance to the same config state every time.
@ -663,12 +665,12 @@ A lot of the app currently piggybacks on Google's existing support for fetching
| [https://whoogle.lunar.icu](https://whoogle.lunar.icu) | 🇩🇪 DE | Multi-choice | ✅ |
| [https://wgl.frail.duckdns.org](https://wgl.frail.duckdns.org) | 🇧🇷 BR | Multi-choice | |
| [https://whoogle.no-logs.com](https://whoogle.no-logs.com/) | 🇸🇪 SE | Multi-choice | |
| [https://search.rubberverse.xyz](https://search.rubberverse.xyz) | 🇵🇱 PL | English | |
| [https://whoogle.ftw.lol](https://whoogle.ftw.lol) | 🇩🇪 DE | Multi-choice | |
| [https://whoogle-search--replitcomreside.repl.co](https://whoogle-search--replitcomreside.repl.co) | 🇺🇸 US | English | |
| [https://search.notrustverify.ch](https://search.notrustverify.ch) | 🇨🇭 CH | Multi-choice | |
| [https://whoogle.datura.network](https://whoogle.datura.network) | 🇩🇪 DE | Multi-choice | |
| [https://whoogle.yepserver.xyz](https://whoogle.yepserver.xyz) | 🇺🇦 UA | Multi-choice | |
| [https://search.nezumi.party](https://search.nezumi.party) | 🇮🇹 IT | Multi-choice | |
* A checkmark in the "Cloudflare" category here refers to the use of the reverse proxy, [Cloudflare](https://cloudflare.com). The checkmark will not be listed for a site which uses Cloudflare DNS but rather the proxying service which grants Cloudflare the ability to monitor traffic to the website.

@ -29,9 +29,12 @@ unsupported_g_pages = [
'google.com/preferences',
'google.com/intl',
'advanced_search',
'tbm=shop'
'tbm=shop',
'ageverification.google.co.kr'
]
unsupported_g_divs = ['google.com/preferences?hl=', 'ageverification.google.co.kr']
def extract_q(q_str: str, href: str) -> str:
"""Extracts the 'q' element from a result link. This is typically
@ -245,7 +248,9 @@ class Filter:
None (The soup object is modified directly)
"""
# Skip empty, parentless, or internal links
if not link or not link.parent or not link['href'].startswith('http'):
show_favicons = read_config_bool('WHOOGLE_SHOW_FAVICONS', True)
is_valid_link = link and link.parent and link['href'].startswith('http')
if not show_favicons or not is_valid_link:
return
parent = link.parent
@ -558,7 +563,7 @@ class Filter:
link['href'] = link_netloc
parent = link.parent
if 'google.com/preferences?hl=' in link_netloc:
if any(divlink in link_netloc for divlink in unsupported_g_divs):
# Handle case where a search is performed in a different
# language than what is configured. This usually returns a
# div with the same classes as normal search results, but with

@ -135,7 +135,8 @@ def before_request_func():
# Check for latest version if needed
now = datetime.now()
if now - timedelta(hours=24) > app.config['LAST_UPDATE_CHECK']:
needs_update_check = now - timedelta(hours=24) > app.config['LAST_UPDATE_CHECK']
if read_config_bool('WHOOGLE_UPDATE_CHECK', True) and needs_update_check:
app.config['LAST_UPDATE_CHECK'] = now
app.config['HAS_UPDATE'] = check_for_update(
app.config['RELEASES_URL'],
@ -608,6 +609,26 @@ def page_not_found(e):
return render_template('error.html', error_message=str(e)), 404
@app.errorhandler(Exception)
def internal_error(e):
query = ''
if request.method == 'POST':
query = request.form.get('q')
else:
query = request.args.get('q')
localization_lang = g.user_config.get_localization_lang()
translation = app.config['TRANSLATIONS'][localization_lang]
return render_template(
'error.html',
error_message='Internal server error (500)',
translation=translation,
farside='https://farside.link',
config=g.user_config,
query=urlparse.unquote(query),
params=g.user_config.to_params(keys=['preferences'])), 500
def run_app() -> None:
parser = argparse.ArgumentParser(
description='Whoogle Search console runner')
@ -626,6 +647,11 @@ def run_app() -> None:
default='',
metavar='</path/to/unix.sock>',
help='Listen for app on unix socket instead of host:port')
parser.add_argument(
'--unix-socket-perms',
default='600',
metavar='<octal permissions>',
help='Octal permissions to use for the Unix domain socket (default 600)')
parser.add_argument(
'--debug',
default=False,
@ -677,7 +703,7 @@ def run_app() -> None:
if args.debug:
app.run(host=args.host, port=args.port, debug=args.debug)
elif args.unix_socket:
waitress.serve(app, unix_socket=args.unix_socket)
waitress.serve(app, unix_socket=args.unix_socket, unix_socket_perms=args.unix_socket_perms)
else:
waitress.serve(
app,

@ -71,7 +71,7 @@ details summary span {
padding-right: 5px;
}
.sCuL3 {
.has-favicon .sCuL3 {
padding-left: 30px;
}

@ -20,21 +20,86 @@
</p>
<hr>
<p>
{% if blocked is defined %}
<h4><a class="link" href="https://farside.link">{{ translation['continue-search'] }}</a></h4>
Whoogle:
<br>
<a class="link-color" href="{{farside}}/whoogle/search?q={{query}}{{params}}">
{{farside}}/whoogle/search?q={{query}}
</a>
<br><br>
Searx:
<br>
<a class="link-color" href="{{farside}}/searx/search?q={{query}}">
{{farside}}/searx/search?q={{query}}
</a>
<ul>
<li>
<a href="https://github.com/benbusby/whoogle-search">Whoogle</a>
<ul>
<li>
<a class="link-color" href="{{farside}}/whoogle/search?q={{query}}{{params}}">
{{farside}}/whoogle/search?q={{query}}
</a>
</li>
</ul>
</li>
<li>
<a href="https://github.com/searxng/searxng">SearXNG</a>
<ul>
<li>
<a class="link-color" href="{{farside}}/searxng/search?q={{query}}">
{{farside}}/searxng/search?q={{query}}
</a>
</li>
</ul>
</li>
</ul>
<hr>
<h4>Other options:</h4>
<ul>
<li>
<a href="https://kagi.com">Kagi</a>
<ul>
<li>Recommended by Whoogle maintainer</li>
<li>Requires account</li>
<li>
<a class="link-color" href="https://kagi.com/search?q={{query}}">
kagi.com/search?q={{query}}
</a>
</li>
</ul>
</li>
<li>
<a href="https://duckduckgo.com">DuckDuckGo</a>
<ul>
<li>
<a class="link-color" href="https://duckduckgo.com/search?q={{query}}">
duckduckgo.com/search?q={{query}}
</a>
</li>
</ul>
</li>
<li>
<a href="https://search.brave.com">Brave Search</a>
<ul>
<li>
<a class="link-color" href="https://search.brave.com/search?q={{query}}">
search.brave.com/search?q={{query}}
</a>
</li>
</ul>
</li>
<li>
<a href="https://ecosia.com">Ecosia</a>
<ul>
<li>
<a class="link-color" href="https://ecosia.com/search?q={{query}}">
ecosia.com/search?q={{query}}
</a>
</li>
</ul>
</li>
<li>
<a href="https://google.com">Google</a>
<ul>
<li>
<a class="link-color" href="https://google.com/search?q={{query}}">
google.com/search?q={{query}}
</a>
</li>
</ul>
</li>
</ul>
<hr>
{% endif %}
</p>
<a class="link" href="home">Return Home</a>
</div>

@ -56,8 +56,8 @@ def gen_file_hash(path: str, static_file: str) -> str:
return filename_split[0] + '.' + file_hash + filename_split[-1]
def read_config_bool(var: str) -> bool:
val = os.getenv(var, '0')
def read_config_bool(var: str, default: bool=False) -> bool:
val = os.getenv(var, '1' if default else '0')
# user can specify one of the following values as 'true' inputs (all
# variants with upper case letters will also work):
# ('true', 't', '1', 'yes', 'y')

@ -12,7 +12,7 @@ import re
import warnings
SKIP_ARGS = ['ref_src', 'utm']
SKIP_PREFIX = ['//www.', '//mobile.', '//m.', 'www.', 'mobile.', 'm.']
SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
GOOG_STATIC = 'www.gstatic.com'
G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
@ -152,11 +152,12 @@ def get_first_link(soup: BeautifulSoup) -> str:
return ''
def get_site_alt(link: str) -> str:
def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str:
"""Returns an alternative to a particular site, if one is configured
Args:
link: A string result URL to check against the SITE_ALTS map
link: A string result URL to check against the site_alts map
site_alts: A map of site alternatives to replace with. defaults to SITE_ALTS
Returns:
str: An updated (or ignored) result link
@ -178,9 +179,9 @@ def get_site_alt(link: str) -> str:
# "https://medium.com/..." should match, but "philomedium.com" should not)
hostcomp = f'{parsed_link.scheme}://{hostname}'
for site_key in SITE_ALTS.keys():
for site_key in site_alts.keys():
site_alt = f'{parsed_link.scheme}://{site_key}'
if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]:
if not hostname or site_alt not in hostcomp or not site_alts[site_key]:
continue
# Wikipedia -> Wikiless replacements require the subdomain (if it's
@ -193,9 +194,8 @@ def get_site_alt(link: str) -> str:
elif 'medium' in hostname and len(subdomain) > 0:
hostname = f'{subdomain}.{hostname}'
parsed_alt = urlparse.urlparse(SITE_ALTS[site_key])
link = link.replace(hostname, SITE_ALTS[site_key]) + params
parsed_alt = urlparse.urlparse(site_alts[site_key])
link = link.replace(hostname, site_alts[site_key]) + params
# If a scheme is specified in the alternative, this results in a
# replaced link that looks like "https://http://altservice.tld".
# In this case, we can remove the original scheme from the result
@ -205,9 +205,12 @@ def get_site_alt(link: str) -> str:
for prefix in SKIP_PREFIX:
if parsed_alt.scheme:
link = link.replace(prefix, '')
# If a scheme is specified, remove everything before the
# first occurence of it
link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}'
else:
link = link.replace(prefix, '//')
# Otherwise, replace the first occurrence of the prefix
link = link.replace(prefix, '//', 1)
break
return link

@ -4,4 +4,4 @@ optional_dev_tag = ''
if os.getenv('DEV_BUILD'):
optional_dev_tag = '.dev' + os.getenv('DEV_BUILD')
__version__ = '0.8.3' + optional_dev_tag
__version__ = '0.8.4' + optional_dev_tag

@ -3,7 +3,7 @@ name: whoogle
description: A self hosted search engine on Kubernetes
type: application
version: 0.1.0
appVersion: 0.8.3
appVersion: 0.8.4
icon: https://github.com/benbusby/whoogle-search/raw/main/app/static/img/favicon/favicon-96x96.png

@ -1,6 +1,7 @@
https://search.albony.xyz
https://search.garudalinux.org
https://search.dr460nf1r3.org
https://search.nezumi.party
https://s.tokhmi.xyz
https://search.sethforprivacy.com
https://whoogle.dcs0.hu
@ -15,7 +16,6 @@ https://whoogle2.ungovernable.men
https://whoogle3.ungovernable.men
https://wgl.frail.duckdns.org
https://whoogle.no-logs.com
https://search.rubberverse.xyz
https://whoogle.ftw.lol
https://whoogle-search--replitcomreside.repl.co
https://search.notrustverify.ch

@ -7,13 +7,13 @@ cffi==1.15.1
chardet==5.1.0
click==8.1.3
cryptography==3.3.2; platform_machine == 'armv7l'
cryptography==41.0.4; platform_machine != 'armv7l'
cryptography==41.0.6; platform_machine != 'armv7l'
cssutils==2.6.0
defusedxml==0.7.1
Flask==2.3.2
idna==3.4
itsdangerous==2.1.2
Jinja2==3.1.2
Jinja2==3.1.3
MarkupSafe==2.1.2
more-itertools==9.0.0
packaging==23.0
@ -29,9 +29,9 @@ python-dateutil==2.8.2
requests==2.31.0
soupsieve==2.4
stem==1.8.1
urllib3==1.26.17
urllib3==1.26.18
validators==0.22.0
waitress==2.1.2
wcwidth==0.2.6
Werkzeug==2.3.3
Werkzeug==3.0.1
python-dotenv==0.21.1

@ -27,6 +27,7 @@ install_requires=
python-dotenv
requests
stem
validators
waitress
[options.extras_require]

@ -2,6 +2,7 @@ from bs4 import BeautifulSoup
from app.filter import Filter
from app.models.config import Config
from app.models.endpoint import Endpoint
from app.utils import results
from app.utils.session import generate_key
from datetime import datetime
from dateutil.parser import ParserError, parse
@ -95,13 +96,13 @@ def test_view_my_ip(client):
def test_recent_results(client):
times = {
'past year': 365,
'past month': 31,
'past week': 7
'tbs=qdr:y': 365,
'tbs=qdr:m': 31,
'tbs=qdr:w': 7
}
for time, num_days in times.items():
rv = client.get(f'/{Endpoint.search}?q=test :' + time)
rv = client.get(f'/{Endpoint.search}?q=test&' + time)
result_divs = get_search_results(rv.data)
current_date = datetime.now()
@ -136,3 +137,22 @@ def test_leading_slash_search(client):
continue
assert link['href'].startswith(f'{Endpoint.search}')
def test_site_alt_prefix_skip():
# Ensure prefixes are skipped correctly for site alts
# default silte_alts (farside.link)
assert results.get_site_alt(link = 'https://www.reddit.com') == 'https://farside.link/libreddit'
assert results.get_site_alt(link = 'https://www.twitter.com') == 'https://farside.link/nitter'
assert results.get_site_alt(link = 'https://www.youtube.com') == 'https://farside.link/invidious'
test_site_alts = {
'reddit.com': 'reddit.endswithmobile.domain',
'twitter.com': 'https://twitter.endswithm.domain',
'youtube.com': 'http://yt.endswithwww.domain',
}
# Domains with part of SKIP_PREFIX in them
assert results.get_site_alt(link = 'https://www.reddit.com', site_alts = test_site_alts) == 'https://reddit.endswithmobile.domain'
assert results.get_site_alt(link = 'https://www.twitter.com', site_alts = test_site_alts) == 'https://twitter.endswithm.domain'
assert results.get_site_alt(link = 'https://www.youtube.com', site_alts = test_site_alts) == 'http://yt.endswithwww.domain'

Loading…
Cancel
Save