From 3bec04079c027b952dee95dab194f29ea12e12a5 Mon Sep 17 00:00:00 2001 From: Bnyro Date: Sun, 5 May 2024 20:43:45 +0200 Subject: [PATCH] [feat] hostname replace plugin: possibility to prioritize certain websites Co-authored-by: Markus Heiser --- docs/admin/installation-searxng.rst | 4 +- docs/dev/search_api.rst | 4 +- searx/plugins/hostname_replace.py | 49 ---------------- searx/plugins/hostnames.py | 73 ++++++++++++++++++++++++ searx/results.py | 21 ++++--- searx/settings.yml | 26 +++++---- utils/templates/etc/searxng/settings.yml | 16 ++++-- 7 files changed, 117 insertions(+), 76 deletions(-) delete mode 100644 searx/plugins/hostname_replace.py create mode 100644 searx/plugins/hostnames.py diff --git a/docs/admin/installation-searxng.rst b/docs/admin/installation-searxng.rst index 7bb936f15..446f45cd2 100644 --- a/docs/admin/installation-searxng.rst +++ b/docs/admin/installation-searxng.rst @@ -96,7 +96,7 @@ Modify the ``/etc/searxng/settings.yml`` to your needs: .. literalinclude:: ../../utils/templates/etc/searxng/settings.yml :language: yaml - :end-before: # hostname_replace: + :end-before: # hostnames: To see the entire file jump to :origin:`utils/templates/etc/searxng/settings.yml` @@ -104,7 +104,7 @@ Modify the ``/etc/searxng/settings.yml`` to your needs: .. literalinclude:: ../../searx/settings.yml :language: yaml - :end-before: # hostname_replace: + :end-before: # hostnames: To see the entire file jump to :origin:`searx/settings.yml` diff --git a/docs/dev/search_api.rst b/docs/dev/search_api.rst index 56272d341..aa5f847ea 100644 --- a/docs/dev/search_api.rst +++ b/docs/dev/search_api.rst @@ -103,14 +103,14 @@ Parameters .. disabled by default - ``Hostname_replace``, ``Open_Access_DOI_rewrite``, + ``Hostnames_plugin``, ``Open_Access_DOI_rewrite``, ``Vim-like_hotkeys``, ``Tor_check_plugin`` ``disabled_plugins``: optional List of disabled plugins. 
:default: - ``Hostname_replace``, ``Open_Access_DOI_rewrite``, + ``Hostnames_plugin``, ``Open_Access_DOI_rewrite``, ``Vim-like_hotkeys``, ``Tor_check_plugin`` :values: diff --git a/searx/plugins/hostname_replace.py b/searx/plugins/hostname_replace.py deleted file mode 100644 index 1b3f8609c..000000000 --- a/searx/plugins/hostname_replace.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring - -import re -from urllib.parse import urlunparse, urlparse - -from flask_babel import gettext - -from searx import settings -from searx.plugins import logger - -name = gettext('Hostname replace') -description = gettext('Rewrite result hostnames or remove results based on the hostname') -default_on = False -preference_section = 'general' - -plugin_id = 'hostname_replace' - -replacements = {re.compile(p): r for (p, r) in settings[plugin_id].items()} if plugin_id in settings else {} - -logger = logger.getChild(plugin_id) -parsed = 'parsed_url' -_url_fields = ['iframe_src', 'audio_src'] - - -def on_result(_request, _search, result): - - for pattern, replacement in replacements.items(): - - if parsed in result: - if pattern.search(result[parsed].netloc): - # to keep or remove this result from the result list depends - # (only) on the 'parsed_url' - if not replacement: - return False - result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc)) - result['url'] = urlunparse(result[parsed]) - - for url_field in _url_fields: - if result.get(url_field): - url_src = urlparse(result[url_field]) - if pattern.search(url_src.netloc): - if not replacement: - del result[url_field] - else: - url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc)) - result[url_field] = urlunparse(url_src) - - return True diff --git a/searx/plugins/hostnames.py b/searx/plugins/hostnames.py new file mode 100644 index 000000000..515a45259 --- /dev/null +++ b/searx/plugins/hostnames.py @@ 
-0,0 +1,89 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# pylint: disable=missing-module-docstring
+
+import re
+from urllib.parse import urlunparse, urlparse
+
+from flask_babel import gettext
+
+from searx import settings
+from searx.plugins import logger
+
+name = gettext('Hostnames plugin')
+description = gettext('Rewrite hostnames, remove results or prioritize them based on the hostname')
+default_on = False
+preference_section = 'general'
+
+plugin_id = 'hostnames'
+
+# The 'hostnames' section is optional, so it is looked up with .get():
+# indexing settings[plugin_id] would fail at import time when the section
+# is missing.
+_config = settings.get(plugin_id) or {}
+
+# Without a 'replace' mapping fall back to the deprecated 'hostname_replace'
+# section.
+_replace_config = _config.get('replace', settings.get('hostname_replace')) or {}
+
+# In the deprecated format a false value means "remove the result".  Such
+# entries are routed to `removables`; pattern.sub() only accepts a string.
+replacements = {re.compile(p): r for (p, r) in _replace_config.items() if r}
+removables = {re.compile(p) for p in (_config.get('remove') or [])}
+removables.update(re.compile(p) for (p, r) in _replace_config.items() if not r)
+high_priority = {re.compile(p) for p in (_config.get('high_priority') or [])}
+low_priority = {re.compile(p) for p in (_config.get('low_priority') or [])}
+
+logger = logger.getChild(plugin_id)
+parsed = 'parsed_url'
+_url_fields = ['iframe_src', 'audio_src']
+
+
+def _matches_parsed_url(result, pattern):
+    """Tell whether ``pattern`` matches the netloc of the result's parsed URL."""
+    return parsed in result and pattern.search(result[parsed].netloc)
+
+
+def on_result(_request, _search, result):
+    """Rewrite, remove or (de)prioritize ``result`` based on its hostname.
+
+    Returns ``False`` when the result has to be removed from the result list
+    and ``True`` otherwise.
+    """
+    for pattern, replacement in replacements.items():
+        if _matches_parsed_url(result, pattern):
+            logger.debug(result['url'])
+            result[parsed] = result[parsed]._replace(netloc=pattern.sub(replacement, result[parsed].netloc))
+            result['url'] = urlunparse(result[parsed])
+            logger.debug(result['url'])
+
+        for url_field in _url_fields:
+            if not result.get(url_field):
+                continue
+
+            url_src = urlparse(result[url_field])
+            if pattern.search(url_src.netloc):
+                url_src = url_src._replace(netloc=pattern.sub(replacement, url_src.netloc))
+                result[url_field] = urlunparse(url_src)
+
+    for pattern in removables:
+        if _matches_parsed_url(result, pattern):
+            return False
+
+        for url_field in _url_fields:
+            if not result.get(url_field):
+                continue
+
+            url_src = urlparse(result[url_field])
+            if pattern.search(url_src.netloc):
+                del result[url_field]
+
+    # high_priority is evaluated last, so it wins when both lists match
+    for pattern in low_priority:
+        if _matches_parsed_url(result, pattern):
+            result['priority'] = 'low'
+
+    for pattern in high_priority:
+        if _matches_parsed_url(result, pattern):
+            result['priority'] = 'high'
+
+    return True
diff --git a/searx/results.py b/searx/results.py
index 8b248f8e0..b846eb275 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -130,16 +130,32 @@ def merge_two_infoboxes(infobox1, infobox2):  # pylint: disable=too-many-branche
             infobox1['content'] = content2
 
 
-def result_score(result):
+def result_score(result, priority=None):
+    """Compute the ranking score of a merged result.
+
+    The weights of all engines that returned the result are multiplied and
+    scaled by the number of positions.  ``priority`` is ``'high'``, ``'low'``
+    or ``None``: ``'low'`` yields a score of 0 and ``'high'`` skips the
+    division by the position.
+    """
     weight = 1.0
 
     for result_engine in result['engines']:
         if hasattr(engines[result_engine], 'weight'):
             weight *= float(engines[result_engine].weight)
 
-    occurrences = len(result['positions'])
+    weight *= len(result['positions'])
+    score = 0
 
-    return sum((occurrences * weight) / position for position in result['positions'])
+    for position in result['positions']:
+        if priority == 'low':
+            continue
+        if priority == 'high':
+            score += weight
+        else:
+            score += weight / position
+
+    return score
 
 
 class Timing(NamedTuple):  # pylint: disable=missing-class-docstring
@@ -354,9 +370,7 @@ class ResultContainer:
         self._closed = True
 
         for result in self._merged_results:
-            score = result_score(result)
-            result['score'] = score
-
+            result['score'] = result_score(result, result.get('priority'))
             # removing html content and whitespace duplications
             if result.get('content'):
                 result['content'] = utils.html_to_text(result['content']).strip()
@@ -364,7 +378,7 @@ class ResultContainer:
                 result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())
 
             for result_engine in result['engines']:
-                counter_add(score, 'engine', result_engine, 'score')
+                counter_add(result['score'], 'engine', result_engine, 'score')
 
         results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)
 
diff --git a/searx/settings.yml b/searx/settings.yml
index 46e34698a..8a1c00ba8 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -219,7 +219,7 @@ outgoing:
# - 'Tracker URL remover' # - 'Ahmia blacklist' # activation depends on outgoing.using_tor_proxy # # these plugins are disabled if nothing is configured .. -# - 'Hostname replace' # see hostname_replace configuration below +# - 'Hostnames plugin' # see 'hostnames' configuration below # - 'Basic Calculator' # - 'Open Access DOI rewrite' # - 'Tor check plugin' @@ -228,17 +228,21 @@ outgoing: # # preferences if they want. # - 'Autodetect search language' -# Configuration of the "Hostname replace" plugin: +# Configuration of the "Hostnames plugin": # -# hostname_replace: -# '(.*\.)?youtube\.com$': 'invidious.example.com' -# '(.*\.)?youtu\.be$': 'invidious.example.com' -# '(.*\.)?youtube-noocookie\.com$': 'yotter.example.com' -# '(.*\.)?reddit\.com$': 'teddit.example.com' -# '(.*\.)?redd\.it$': 'teddit.example.com' -# '(www\.)?twitter\.com$': 'nitter.example.com' -# # to remove matching host names from result list, set value to false -# 'spam\.example\.com': false +# hostnames: +# replace: +# '(.*\.)?youtube\.com$': 'invidious.example.com' +# '(.*\.)?youtu\.be$': 'invidious.example.com' +# '(.*\.)?reddit\.com$': 'teddit.example.com' +# '(.*\.)?redd\.it$': 'teddit.example.com' +# '(www\.)?twitter\.com$': 'nitter.example.com' +# remove: +# - '(.*\.)?facebook\.com$' +# low_priority: +# - '(.*\.)?google(\..*)?$' +# high_priority: +# - '(.*\.)?wikipedia\.org$' checker: # disable checker when in debug mode diff --git a/utils/templates/etc/searxng/settings.yml b/utils/templates/etc/searxng/settings.yml index b37dc2fb6..fbd935b0a 100644 --- a/utils/templates/etc/searxng/settings.yml +++ b/utils/templates/etc/searxng/settings.yml @@ -36,16 +36,22 @@ enabled_plugins: - 'Self Informations' - 'Tracker URL remover' - 'Ahmia blacklist' - # - 'Hostname replace' # see hostname_replace configuration below + # - 'Hostnames plugin' # see 'hostnames' configuration below # - 'Open Access DOI rewrite' # plugins: # - only_show_green_results -# hostname_replace: -# -# # twitter --> nitter -#
'(www\.)?twitter\.com$': 'nitter.net' +# hostnames: +# replace: +# '(.*\.)?youtube\.com$': 'invidious.example.com' +# '(.*\.)?youtu\.be$': 'invidious.example.com' +# remove: +# - '(.*\.)?facebook\.com$' +# low_priority: +# - '(.*\.)?google\.com$' +# high_priority: +# - '(.*\.)?wikipedia\.org$' engines: