From 4eaa0dd27580453a309185dd635ef6146ead63d4 Mon Sep 17 00:00:00 2001 From: Bnyro Date: Tue, 2 Jul 2024 20:43:23 +0200 Subject: [PATCH] [fix] gentoo: use mediawiki engine --- searx/engines/gentoo.py | 125 ------------------------------------- searx/engines/mediawiki.py | 13 ++-- searx/settings.yml | 8 ++- 3 files changed, 13 insertions(+), 133 deletions(-) delete mode 100644 searx/engines/gentoo.py diff --git a/searx/engines/gentoo.py b/searx/engines/gentoo.py deleted file mode 100644 index 4a4e085ba..000000000 --- a/searx/engines/gentoo.py +++ /dev/null @@ -1,125 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Gentoo Wiki -""" - -from urllib.parse import urlencode, urljoin -from lxml import html -from searx.utils import extract_text - -# about -about = { - "website": 'https://wiki.gentoo.org/', - "wikidata_id": 'Q1050637', - "official_api_documentation": 'https://wiki.gentoo.org/api.php', - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['it', 'software wikis'] -paging = True -base_url = 'https://wiki.gentoo.org' - -# xpath queries -xpath_results = '//ul[@class="mw-search-results"]/li' -xpath_link = './/div[@class="mw-search-result-heading"]/a' -xpath_content = './/div[@class="searchresult"]' - - -# cut 'en' from 'en-US', 'de' from 'de-CH', and so on -def locale_to_lang_code(locale): - if locale.find('-') >= 0: - locale = locale.split('-')[0] - return locale - - -# wikis for some languages were moved off from the main site, we need to make -# requests to correct URLs to be able to get results in those languages -lang_urls = { - 'en': {'base': 'https://wiki.gentoo.org', 'search': '/index.php?title=Special:Search&offset={offset}&{query}'}, - 'others': { - 'base': 'https://wiki.gentoo.org', - 'search': '/index.php?title=Special:Search&offset={offset}&{query}\ - &profile=translation&languagefilter={language}', - }, -} - - -# get base & search URLs for selected language -def get_lang_urls(language): - if language != 'en': - return lang_urls['others'] - return lang_urls['en'] - - -# Language names to build search requests for -# those languages which are hosted on the main site. -main_langs = { - 'ar': 'العربية', - 'bg': 'Български', - 'cs': 'Česky', - 'da': 'Dansk', - 'el': 'Ελληνικά', - 'es': 'Español', - 'he': 'עברית', - 'hr': 'Hrvatski', - 'hu': 'Magyar', - 'it': 'Italiano', - 'ko': '한국어', - 'lt': 'Lietuviškai', - 'nl': 'Nederlands', - 'pl': 'Polski', - 'pt': 'Português', - 'ru': 'Русский', - 'sl': 'Slovenský', - 'th': 'ไทย', - 'uk': 'Українська', - 'zh': '简体中文', -} - - -# do search-request -def request(query, params): - # translate the locale (e.g. 'en-US') to language code ('en') - language = locale_to_lang_code(params['language']) - - # if our language is hosted on the main site, we need to add its name - # to the query in order to narrow the results to that language - if language in main_langs: - query += ' (' + main_langs[language] + ')' - - # prepare the request parameters - query = urlencode({'search': query}) - offset = (params['pageno'] - 1) * 20 - - # get request URLs for our language of choice - urls = get_lang_urls(language) - search_url = urls['base'] + urls['search'] - - params['url'] = search_url.format(query=query, offset=offset, language=language) - - return params - - -# get response from search-request -def response(resp): - # get the base URL for the language in which request was made - language = locale_to_lang_code(resp.search_params['language']) - url = get_lang_urls(language)['base'] - - results = [] - - dom = html.fromstring(resp.text) - - # parse results - for result in dom.xpath(xpath_results): - link = result.xpath(xpath_link)[0] - href = urljoin(url, link.attrib.get('href')) - title = extract_text(link) - content = extract_text(result.xpath(xpath_content)) - - results.append({'url': href, 'title': title, 'content': content}) - - return results diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index 76317402e..81d0c37aa 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -100,6 +100,12 @@ base_url: str = 'https://{language}.wikipedia.org/' ISO 639-1 language code (en, de, fr ..) of the search language. """ +api_path: str = 'w/api.php' +"""The path the PHP api is listening on. + +The default path should work fine usually. +""" + timestamp_format = '%Y-%m-%dT%H:%M:%SZ' """The longhand version of MediaWiki time strings.""" @@ -113,12 +119,7 @@ def request(query, params): else: params['language'] = params['language'].split('-')[0] - if base_url.endswith('/'): - api_url = base_url + 'w/api.php?' - else: - api_url = base_url + '/w/api.php?' - api_url = api_url.format(language=params['language']) - + api_url = f"{base_url.rstrip('/')}/{api_path}?".format(language=params['language']) offset = (params['pageno'] - 1) * number_of_results args = { diff --git a/searx/settings.yml b/searx/settings.yml index 9e23a2615..748e5ac0e 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -797,9 +797,13 @@ engines: shortcut: gen - name: gentoo - engine: gentoo + engine: mediawiki shortcut: ge - timeout: 10.0 + categories: ["it", "software wikis"] + base_url: "https://wiki.gentoo.org/" + api_path: "api.php" + search_type: text + timeout: 10 - name: gitlab engine: json_engine