From 9cc1004fb8e4b3678226bca675bb4fd47c359151 Mon Sep 17 00:00:00 2001 From: Gautam Korlam Date: Wed, 1 Nov 2023 13:07:45 -0700 Subject: [PATCH] fix: correctly handle skip_prefix logic for site_alts (#1092) Fixes #1091 --- app/utils/results.py | 23 +++++++++++++---------- test/test_results.py | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/app/utils/results.py b/app/utils/results.py index 42654e9..c78f866 100644 --- a/app/utils/results.py +++ b/app/utils/results.py @@ -12,7 +12,7 @@ import re import warnings SKIP_ARGS = ['ref_src', 'utm'] -SKIP_PREFIX = ['//www.', '//mobile.', '//m.', 'www.', 'mobile.', 'm.'] +SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] GOOG_STATIC = 'www.gstatic.com' G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif' GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' @@ -152,11 +152,12 @@ def get_first_link(soup: BeautifulSoup) -> str: return '' -def get_site_alt(link: str) -> str: +def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str: """Returns an alternative to a particular site, if one is configured Args: - link: A string result URL to check against the SITE_ALTS map + link: A string result URL to check against the site_alts map + site_alts: A map of site alternatives to replace with. defaults to SITE_ALTS Returns: str: An updated (or ignored) result link @@ -178,9 +179,9 @@ def get_site_alt(link: str) -> str: # "https://medium.com/..." 
should match, but "philomedium.com" should not) hostcomp = f'{parsed_link.scheme}://{hostname}' - for site_key in SITE_ALTS.keys(): + for site_key in site_alts.keys(): site_alt = f'{parsed_link.scheme}://{site_key}' - if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]: + if not hostname or site_alt not in hostcomp or not site_alts[site_key]: continue # Wikipedia -> Wikiless replacements require the subdomain (if it's @@ -193,9 +194,8 @@ def get_site_alt(link: str) -> str: elif 'medium' in hostname and len(subdomain) > 0: hostname = f'{subdomain}.{hostname}' - parsed_alt = urlparse.urlparse(SITE_ALTS[site_key]) - link = link.replace(hostname, SITE_ALTS[site_key]) + params - + parsed_alt = urlparse.urlparse(site_alts[site_key]) + link = link.replace(hostname, site_alts[site_key]) + params # If a scheme is specified in the alternative, this results in a # replaced link that looks like "https://http://altservice.tld". # In this case, we can remove the original scheme from the result @@ -205,9 +205,12 @@ def get_site_alt(link: str) -> str: for prefix in SKIP_PREFIX: if parsed_alt.scheme: - link = link.replace(prefix, '') + # If a scheme is specified, remove everything before the + # first occurrence of it + link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}' else: - link = link.replace(prefix, '//') + # Otherwise, replace the first occurrence of the prefix + link = link.replace(prefix, '//', 1) break return link diff --git a/test/test_results.py b/test/test_results.py index 63ae159..64caacd 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -2,6 +2,7 @@ from bs4 import BeautifulSoup from app.filter import Filter from app.models.config import Config from app.models.endpoint import Endpoint +from app.utils import results from app.utils.session import generate_key from datetime import datetime from dateutil.parser import ParserError, parse @@ -136,3 +137,22 @@ def test_leading_slash_search(client): continue assert 
link['href'].startswith(f'{Endpoint.search}') + + +def test_site_alt_prefix_skip(): + # Ensure prefixes are skipped correctly for site alts + + # default site_alts (farside.link) + assert results.get_site_alt(link = 'https://www.reddit.com') == 'https://farside.link/libreddit' + assert results.get_site_alt(link = 'https://www.twitter.com') == 'https://farside.link/nitter' + assert results.get_site_alt(link = 'https://www.youtube.com') == 'https://farside.link/invidious' + + test_site_alts = { + 'reddit.com': 'reddit.endswithmobile.domain', + 'twitter.com': 'https://twitter.endswithm.domain', + 'youtube.com': 'http://yt.endswithwww.domain', + } + # Domains with part of SKIP_PREFIX in them + assert results.get_site_alt(link = 'https://www.reddit.com', site_alts = test_site_alts) == 'https://reddit.endswithmobile.domain' + assert results.get_site_alt(link = 'https://www.twitter.com', site_alts = test_site_alts) == 'https://twitter.endswithm.domain' + assert results.get_site_alt(link = 'https://www.youtube.com', site_alts = test_site_alts) == 'http://yt.endswithwww.domain'