Merge branch 'benbusby:main' into main

3 months ago · bc78a658e6
parent 2e2f955f05 c216c033ef
commit bc78a658e6
13 changed files with 165 additions and 43 deletions
--- a/README.md
+++ b/README.md
@ -422,6 +422,8 @@ There are a few optional environment variables available for customizing a Whoog
 | WHOOGLE_TOR_SERVICE  | Enable/disable the Tor service on startup. Default on -- use '0' to disable.              |
 | WHOOGLE_TOR_USE_PASS | Use password authentication for tor control port. |
 | WHOOGLE_TOR_CONF | The absolute path to the config file containing the password for the tor control port. Default: ./misc/tor/control.conf. WHOOGLE_TOR_USE_PASS must be 1 for this to work. |
+| WHOOGLE_SHOW_FAVICONS | Show/hide favicons next to search result URLs. Default on.                               |
+| WHOOGLE_UPDATE_CHECK  | Enable/disable the automatic daily check for new versions of Whoogle. Default on.        |

 ### Config Environment Variables
 These environment variables allow setting default config values, but can be overwritten manually by using the home page config menu. These allow a shortcut for destroying/rebuilding an instance to the same config state every time.
@ -663,12 +665,12 @@ A lot of the app currently piggybacks on Google's existing support for fetching
 | [https://whoogle.lunar.icu](https://whoogle.lunar.icu) | 🇩🇪 DE | Multi-choice | ✅ |
 | [https://wgl.frail.duckdns.org](https://wgl.frail.duckdns.org) | 🇧🇷 BR | Multi-choice | |
 | [https://whoogle.no-logs.com](https://whoogle.no-logs.com/) | 🇸🇪 SE | Multi-choice | |
-| [https://search.rubberverse.xyz](https://search.rubberverse.xyz) | 🇵🇱 PL | English | |
 | [https://whoogle.ftw.lol](https://whoogle.ftw.lol) | 🇩🇪 DE | Multi-choice | |
 | [https://whoogle-search--replitcomreside.repl.co](https://whoogle-search--replitcomreside.repl.co) | 🇺🇸 US | English |  |
 | [https://search.notrustverify.ch](https://search.notrustverify.ch) | 🇨🇭 CH | Multi-choice |  |
 | [https://whoogle.datura.network](https://whoogle.datura.network) | 🇩🇪 DE | Multi-choice | |
 | [https://whoogle.yepserver.xyz](https://whoogle.yepserver.xyz) | 🇺🇦 UA | Multi-choice | |
+| [https://search.nezumi.party](https://search.nezumi.party) | 🇮🇹 IT | Multi-choice | |


 * A checkmark in the "Cloudflare" category here refers to the use of the reverse proxy, [Cloudflare](https://cloudflare.com). The checkmark will not be listed for a site which uses Cloudflare DNS but rather the proxying service which grants Cloudflare the ability to monitor traffic to the website.
--- a/app/filter.py
+++ b/app/filter.py
@ -29,9 +29,12 @@ unsupported_g_pages = [
    'google.com/preferences',
    'google.com/intl',
    'advanced_search',
-    'tbm=shop'
+    'tbm=shop',
+    'ageverification.google.co.kr'
 ]

+unsupported_g_divs = ['google.com/preferences?hl=', 'ageverification.google.co.kr']
+

 def extract_q(q_str: str, href: str) -> str:
    """Extracts the 'q' element from a result link. This is typically
@ -245,7 +248,9 @@ class Filter:
            None (The soup object is modified directly)
        """
        # Skip empty, parentless, or internal links
-        if not link or not link.parent or not link['href'].startswith('http'):
+        show_favicons = read_config_bool('WHOOGLE_SHOW_FAVICONS', True)
+        is_valid_link = link and link.parent and link['href'].startswith('http')
+        if not show_favicons or not is_valid_link:
            return

        parent = link.parent
@ -558,7 +563,7 @@ class Filter:
            link['href'] = link_netloc
            parent = link.parent

-            if 'google.com/preferences?hl=' in link_netloc:
+            if any(divlink in link_netloc for divlink in unsupported_g_divs):
                # Handle case where a search is performed in a different
                # language than what is configured. This usually returns a
                # div with the same classes as normal search results, but with
--- a/app/routes.py
+++ b/app/routes.py
@ -135,7 +135,8 @@ def before_request_func():

    # Check for latest version if needed
    now = datetime.now()
-    if now - timedelta(hours=24) > app.config['LAST_UPDATE_CHECK']:
+    needs_update_check = now - timedelta(hours=24) > app.config['LAST_UPDATE_CHECK']
+    if read_config_bool('WHOOGLE_UPDATE_CHECK', True) and needs_update_check:
        app.config['LAST_UPDATE_CHECK'] = now
        app.config['HAS_UPDATE'] = check_for_update(
            app.config['RELEASES_URL'],
@ -608,6 +609,26 @@ def page_not_found(e):
    return render_template('error.html', error_message=str(e)), 404


+@app.errorhandler(Exception)
+def internal_error(e):
+    """Render the 500 error page for otherwise-unhandled exceptions."""
+    # Default to '' so urlparse.unquote() below never receives None when
+    # the failing request carried no 'q' parameter
+    source = request.form if request.method == 'POST' else request.args
+    query = source.get('q', '')
+
+    localization_lang = g.user_config.get_localization_lang()
+    translation = app.config['TRANSLATIONS'][localization_lang]
+    return render_template(
+            'error.html',
+            error_message='Internal server error (500)',
+            translation=translation,
+            farside='https://farside.link',
+            config=g.user_config,
+            query=urlparse.unquote(query),
+            params=g.user_config.to_params(keys=['preferences'])), 500
+
+
 def run_app() -> None:
    parser = argparse.ArgumentParser(
        description='Whoogle Search console runner')
@ -626,6 +647,11 @@ def run_app() -> None:
        default='',
        metavar='</path/to/unix.sock>',
        help='Listen for app on unix socket instead of host:port')
+    parser.add_argument(
+        '--unix-socket-perms',
+        default='600',
+        metavar='<octal permissions>',
+        help='Octal permissions to use for the Unix domain socket (default 600)')
    parser.add_argument(
        '--debug',
        default=False,
@ -677,7 +703,7 @@ def run_app() -> None:
    if args.debug:
        app.run(host=args.host, port=args.port, debug=args.debug)
    elif args.unix_socket:
-        waitress.serve(app, unix_socket=args.unix_socket)
+        waitress.serve(app, unix_socket=args.unix_socket, unix_socket_perms=args.unix_socket_perms)
    else:
        waitress.serve(
            app,
--- a/app/static/css/search.css
+++ b/app/static/css/search.css
@ -71,7 +71,7 @@ details summary span {
 	padding-right: 5px;
 }

-.sCuL3 {
+.has-favicon .sCuL3 {
 	padding-left: 30px;
 }

--- a/app/templates/error.html
+++ b/app/templates/error.html
@ -20,21 +20,86 @@
    </p>
    <hr>
    <p>
-    {% if blocked is defined %}
        <h4><a class="link" href="https://farside.link">{{ translation['continue-search'] }}</a></h4>
-        Whoogle:
-        <br>
-        <a class="link-color" href="{{farside}}/whoogle/search?q={{query}}{{params}}">
-            {{farside}}/whoogle/search?q={{query}}
-        </a>
-        <br><br>
-        Searx:
-        <br>
-        <a class="link-color" href="{{farside}}/searx/search?q={{query}}">
-            {{farside}}/searx/search?q={{query}}
-        </a>
+        <ul>
+            <li>
+                <a href="https://github.com/benbusby/whoogle-search">Whoogle</a>
+                <ul>
+                    <li>
+                        <a class="link-color" href="{{farside}}/whoogle/search?q={{query}}{{params}}">
+                            {{farside}}/whoogle/search?q={{query}}
+                        </a>
+                    </li>
+                </ul>
+            </li>
+            <li>
+                <a href="https://github.com/searxng/searxng">SearXNG</a>
+                <ul>
+                    <li>
+                        <a class="link-color" href="{{farside}}/searxng/search?q={{query}}">
+                            {{farside}}/searxng/search?q={{query}}
+                        </a>
+                    </li>
+                </ul>
+            </li>
+        </ul>
+        <hr>
+        <h4>Other options:</h4>
+        <ul>
+            <li>
+                <a href="https://kagi.com">Kagi</a>
+                <ul>
+                    <li>Recommended by Whoogle maintainer</li>
+                    <li>Requires account</li>
+                    <li>
+                        <a class="link-color" href="https://kagi.com/search?q={{query}}">
+                            kagi.com/search?q={{query}}
+                        </a>
+                    </li>
+                </ul>
+            </li>
+            <li>
+                <a href="https://duckduckgo.com">DuckDuckGo</a>
+                <ul>
+                    <li>
+                        <a class="link-color" href="https://duckduckgo.com/search?q={{query}}">
+                            duckduckgo.com/search?q={{query}}
+                        </a>
+                    </li>
+                </ul>
+            </li>
+            <li>
+                <a href="https://search.brave.com">Brave Search</a>
+                <ul>
+                    <li>
+                        <a class="link-color" href="https://search.brave.com/search?q={{query}}">
+                            search.brave.com/search?q={{query}}
+                        </a>
+                    </li>
+                </ul>
+            </li>
+            <li>
+                <a href="https://ecosia.com">Ecosia</a>
+                <ul>
+                    <li>
+                        <a class="link-color" href="https://ecosia.com/search?q={{query}}">
+                            ecosia.com/search?q={{query}}
+                        </a>
+                    </li>
+                </ul>
+            </li>
+            <li>
+                <a href="https://google.com">Google</a>
+                <ul>
+                    <li>
+                        <a class="link-color" href="https://google.com/search?q={{query}}">
+                            google.com/search?q={{query}}
+                        </a>
+                    </li>
+                </ul>
+            </li>
+        </ul>
        <hr>
-    {% endif %}
    </p>
    <a class="link" href="home">Return Home</a>
 </div>
--- a/app/utils/misc.py
+++ b/app/utils/misc.py
@ -56,8 +56,8 @@ def gen_file_hash(path: str, static_file: str) -> str:
    return filename_split[0] + '.' + file_hash + filename_split[-1]


-def read_config_bool(var: str) -> bool:
-    val = os.getenv(var, '0')
+def read_config_bool(var: str, default: bool=False) -> bool:
+    val = os.getenv(var, '1' if default else '0')
    # user can specify one of the following values as 'true' inputs (all
    # variants with upper case letters will also work):
    # ('true', 't', '1', 'yes', 'y')
--- a/app/utils/results.py
+++ b/app/utils/results.py
@ -12,7 +12,7 @@ import re
 import warnings

 SKIP_ARGS = ['ref_src', 'utm']
-SKIP_PREFIX = ['//www.', '//mobile.', '//m.', 'www.', 'mobile.', 'm.']
+SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
 GOOG_STATIC = 'www.gstatic.com'
 G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif'
 GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
@ -152,11 +152,12 @@ def get_first_link(soup: BeautifulSoup) -> str:
    return ''


-def get_site_alt(link: str) -> str:
+def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str:
    """Returns an alternative to a particular site, if one is configured

    Args:
-        link: A string result URL to check against the SITE_ALTS map
+        link: A string result URL to check against the site_alts map
+        site_alts: A map of site alternatives to replace with. defaults to SITE_ALTS

    Returns:
        str: An updated (or ignored) result link
@ -178,9 +179,9 @@ def get_site_alt(link: str) -> str:
    # "https://medium.com/..." should match, but "philomedium.com" should not)
    hostcomp = f'{parsed_link.scheme}://{hostname}'

-    for site_key in SITE_ALTS.keys():
+    for site_key in site_alts.keys():
        site_alt = f'{parsed_link.scheme}://{site_key}'
-        if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]:
+        if not hostname or site_alt not in hostcomp or not site_alts[site_key]:
            continue

        # Wikipedia -> Wikiless replacements require the subdomain (if it's
@ -193,9 +194,8 @@ def get_site_alt(link: str) -> str:
        elif 'medium' in hostname and len(subdomain) > 0:
            hostname = f'{subdomain}.{hostname}'

-        parsed_alt = urlparse.urlparse(SITE_ALTS[site_key])
-        link = link.replace(hostname, SITE_ALTS[site_key]) + params
-
+        parsed_alt = urlparse.urlparse(site_alts[site_key])
+        link = link.replace(hostname, site_alts[site_key]) + params
        # If a scheme is specified in the alternative, this results in a
        # replaced link that looks like "https://http://altservice.tld".
        # In this case, we can remove the original scheme from the result
@ -205,9 +205,12 @@ def get_site_alt(link: str) -> str:

        for prefix in SKIP_PREFIX:
            if parsed_alt.scheme:
-                link = link.replace(prefix, '')
+                # If a scheme is specified, remove everything before the
+                # first occurrence of it
+                link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}'
            else:
-                link = link.replace(prefix, '//')
+                # Otherwise, replace the first occurrence of the prefix
+                link = link.replace(prefix, '//', 1)
        break

    return link
--- a/app/version.py
+++ b/app/version.py
@ -4,4 +4,4 @@ optional_dev_tag = ''
 if os.getenv('DEV_BUILD'):
    optional_dev_tag = '.dev' + os.getenv('DEV_BUILD')

-__version__ = '0.8.3' + optional_dev_tag
+__version__ = '0.8.4' + optional_dev_tag
--- a/charts/whoogle/Chart.yaml
+++ b/charts/whoogle/Chart.yaml
@ -3,7 +3,7 @@ name: whoogle
 description: A self hosted search engine on Kubernetes
 type: application
 version: 0.1.0
-appVersion: 0.8.3
+appVersion: 0.8.4

 icon: https://github.com/benbusby/whoogle-search/raw/main/app/static/img/favicon/favicon-96x96.png

--- a/misc/instances.txt
+++ b/misc/instances.txt
@ -1,6 +1,7 @@
 https://search.albony.xyz
 https://search.garudalinux.org
 https://search.dr460nf1r3.org
+https://search.nezumi.party
 https://s.tokhmi.xyz
 https://search.sethforprivacy.com
 https://whoogle.dcs0.hu
@ -15,7 +16,6 @@ https://whoogle2.ungovernable.men
 https://whoogle3.ungovernable.men
 https://wgl.frail.duckdns.org
 https://whoogle.no-logs.com
-https://search.rubberverse.xyz
 https://whoogle.ftw.lol
 https://whoogle-search--replitcomreside.repl.co
 https://search.notrustverify.ch
--- a/requirements.txt
+++ b/requirements.txt
@ -7,13 +7,13 @@ cffi==1.15.1
 chardet==5.1.0
 click==8.1.3
 cryptography==3.3.2; platform_machine == 'armv7l'
-cryptography==41.0.4; platform_machine != 'armv7l'
+cryptography==41.0.6; platform_machine != 'armv7l'
 cssutils==2.6.0
 defusedxml==0.7.1
 Flask==2.3.2
 idna==3.4
 itsdangerous==2.1.2
-Jinja2==3.1.2
+Jinja2==3.1.3
 MarkupSafe==2.1.2
 more-itertools==9.0.0
 packaging==23.0
@ -29,9 +29,9 @@ python-dateutil==2.8.2
 requests==2.31.0
 soupsieve==2.4
 stem==1.8.1
-urllib3==1.26.17
+urllib3==1.26.18
 validators==0.22.0
 waitress==2.1.2
 wcwidth==0.2.6
-Werkzeug==2.3.3
+Werkzeug==3.0.1
 python-dotenv==0.21.1
--- a/setup.cfg
+++ b/setup.cfg
@ -27,6 +27,7 @@ install_requires=
    python-dotenv
    requests
    stem
+    validators
    waitress

 [options.extras_require]
--- a/test/test_results.py
+++ b/test/test_results.py
@ -2,6 +2,7 @@ from bs4 import BeautifulSoup
 from app.filter import Filter
 from app.models.config import Config
 from app.models.endpoint import Endpoint
+from app.utils import results
 from app.utils.session import generate_key
 from datetime import datetime
 from dateutil.parser import ParserError, parse
@ -95,13 +96,13 @@ def test_view_my_ip(client):

 def test_recent_results(client):
    times = {
-        'past year': 365,
-        'past month': 31,
-        'past week': 7
+        'tbs=qdr:y': 365,
+        'tbs=qdr:m': 31,
+        'tbs=qdr:w': 7
    }

    for time, num_days in times.items():
-        rv = client.get(f'/{Endpoint.search}?q=test :' + time)
+        rv = client.get(f'/{Endpoint.search}?q=test&' + time)
        result_divs = get_search_results(rv.data)

        current_date = datetime.now()
@ -136,3 +137,22 @@ def test_leading_slash_search(client):
            continue

        assert link['href'].startswith(f'{Endpoint.search}')
+
+
+def test_site_alt_prefix_skip():
+    # Ensure prefixes are skipped correctly for site alts
+
+    # default site_alts (farside.link)
+    assert results.get_site_alt(link = 'https://www.reddit.com') == 'https://farside.link/libreddit'
+    assert results.get_site_alt(link = 'https://www.twitter.com') == 'https://farside.link/nitter'
+    assert results.get_site_alt(link = 'https://www.youtube.com') == 'https://farside.link/invidious'
+
+    test_site_alts = {
+    'reddit.com': 'reddit.endswithmobile.domain',
+    'twitter.com': 'https://twitter.endswithm.domain',
+    'youtube.com': 'http://yt.endswithwww.domain',
+    }
+    # Domains with part of SKIP_PREFIX in them
+    assert results.get_site_alt(link = 'https://www.reddit.com', site_alts = test_site_alts) == 'https://reddit.endswithmobile.domain'
+    assert results.get_site_alt(link = 'https://www.twitter.com', site_alts = test_site_alts) == 'https://twitter.endswithm.domain'
+    assert results.get_site_alt(link = 'https://www.youtube.com', site_alts = test_site_alts) == 'http://yt.endswithwww.domain'