From 9cc1004fb8e4b3678226bca675bb4fd47c359151 Mon Sep 17 00:00:00 2001
From: Gautam Korlam <kageiit@users.noreply.github.com>
Date: Wed, 1 Nov 2023 13:07:45 -0700
Subject: [PATCH] fix: correctly handle skip_prefix logic for site_alts (#1092)

Fixes #1091
---
 app/utils/results.py | 23 +++++++++++++----------
 test/test_results.py | 20 ++++++++++++++++++++
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/app/utils/results.py b/app/utils/results.py
index 42654e9..c78f866 100644
--- a/app/utils/results.py
+++ b/app/utils/results.py
@@ -12,7 +12,7 @@ import re
 import warnings
 
 SKIP_ARGS = ['ref_src', 'utm']
-SKIP_PREFIX = ['//www.', '//mobile.', '//m.', 'www.', 'mobile.', 'm.']
+SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
 GOOG_STATIC = 'www.gstatic.com'
 G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif'
 GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
@@ -152,11 +152,12 @@ def get_first_link(soup: BeautifulSoup) -> str:
     return ''
 
 
-def get_site_alt(link: str) -> str:
+def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str:
     """Returns an alternative to a particular site, if one is configured
 
     Args:
-        link: A string result URL to check against the SITE_ALTS map
+        link: A string result URL to check against the site_alts map
+        site_alts: A map of site alternatives to replace with. Defaults to SITE_ALTS.
 
     Returns:
         str: An updated (or ignored) result link
@@ -178,9 +179,9 @@ def get_site_alt(link: str) -> str:
     # "https://medium.com/..." should match, but "philomedium.com" should not)
     hostcomp = f'{parsed_link.scheme}://{hostname}'
 
-    for site_key in SITE_ALTS.keys():
+    for site_key in site_alts.keys():
         site_alt = f'{parsed_link.scheme}://{site_key}'
-        if not hostname or site_alt not in hostcomp or not SITE_ALTS[site_key]:
+        if not hostname or site_alt not in hostcomp or not site_alts[site_key]:
             continue
 
         # Wikipedia -> Wikiless replacements require the subdomain (if it's
@@ -193,9 +194,8 @@ def get_site_alt(link: str) -> str:
         elif 'medium' in hostname and len(subdomain) > 0:
             hostname = f'{subdomain}.{hostname}'
 
-        parsed_alt = urlparse.urlparse(SITE_ALTS[site_key])
-        link = link.replace(hostname, SITE_ALTS[site_key]) + params
-
+        parsed_alt = urlparse.urlparse(site_alts[site_key])
+        link = link.replace(hostname, site_alts[site_key]) + params
         # If a scheme is specified in the alternative, this results in a
         # replaced link that looks like "https://http://altservice.tld".
         # In this case, we can remove the original scheme from the result
@@ -205,9 +205,12 @@ def get_site_alt(link: str) -> str:
 
         for prefix in SKIP_PREFIX:
             if parsed_alt.scheme:
-                link = link.replace(prefix, '')
+                # If a scheme is specified, remove everything before the
+                # first occurrence of it
+                link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}'
             else:
-                link = link.replace(prefix, '//')
+                # Otherwise, replace the first occurrence of the prefix
+                link = link.replace(prefix, '//', 1)
         break
 
     return link
diff --git a/test/test_results.py b/test/test_results.py
index 63ae159..64caacd 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -2,6 +2,7 @@ from bs4 import BeautifulSoup
 from app.filter import Filter
 from app.models.config import Config
 from app.models.endpoint import Endpoint
+from app.utils import results
 from app.utils.session import generate_key
 from datetime import datetime
 from dateutil.parser import ParserError, parse
@@ -136,3 +137,22 @@ def test_leading_slash_search(client):
             continue
 
         assert link['href'].startswith(f'{Endpoint.search}')
+
+
+def test_site_alt_prefix_skip():
+    # Ensure prefixes are skipped correctly for site alts
+
+    # Default site_alts (farside.link)
+    assert results.get_site_alt(link = 'https://www.reddit.com') == 'https://farside.link/libreddit'
+    assert results.get_site_alt(link = 'https://www.twitter.com') == 'https://farside.link/nitter'
+    assert results.get_site_alt(link = 'https://www.youtube.com') == 'https://farside.link/invidious'
+
+    test_site_alts = {
+    'reddit.com': 'reddit.endswithmobile.domain',
+    'twitter.com': 'https://twitter.endswithm.domain',
+    'youtube.com': 'http://yt.endswithwww.domain',
+    }
+    # Domains with part of SKIP_PREFIX in them
+    assert results.get_site_alt(link = 'https://www.reddit.com', site_alts = test_site_alts) == 'https://reddit.endswithmobile.domain'
+    assert results.get_site_alt(link = 'https://www.twitter.com', site_alts = test_site_alts) == 'https://twitter.endswithm.domain'
+    assert results.get_site_alt(link = 'https://www.youtube.com', site_alts = test_site_alts) == 'http://yt.endswithwww.domain'