From e5637fe7b98d7fb06cbbe0e0f24deb12a33187ba Mon Sep 17 00:00:00 2001
From: Paolo Basso <12545838+paolobasso99@users.noreply.github.com>
Date: Sun, 25 Jun 2023 17:12:17 +0200
Subject: [PATCH 1/5] [feat] engine: implementation of Anna's Archive

Anna's Archive [1] is a free non-profit online shadow library metasearch
engine providing access to a variety of book resources (also via IPFS),
created by a team of anonymous archivists [2].

[1] https://annas-archive.org/
[2] https://annas-software.org/AnnaArchivist/annas-archive
---
 searx/engines/annas-archive.py | 63 ++++++++++++++++++++++++++++++++++
 searx/settings.yml             |  7 ++++
 2 files changed, 70 insertions(+)
 create mode 100644 searx/engines/annas-archive.py

diff --git a/searx/engines/annas-archive.py b/searx/engines/annas-archive.py
new file mode 100644
index 000000000..56d1ca77a
--- /dev/null
+++ b/searx/engines/annas-archive.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Anna's Archive
+
+"""
+from typing import List, Dict, Any, Optional
+from urllib.parse import quote
+from lxml import html
+
+from searx.utils import extract_text, eval_xpath
+
+# about
+about: Dict[str, Any] = {
+    "website": "https://annas-archive.org/",
+    "wikidata_id": "Q115288326",
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+# engine dependent config
+categories: List[str] = ["files"]
+paging: bool = False
+
+# search-url
+base_url: str = "https://annas-archive.org"
+
+# xpath queries
+xpath_results: str = '//main//a[starts-with(@href,"/md5")]'
+xpath_url: str = ".//@href"
+xpath_title: str = ".//h3/text()[1]"
+xpath_authors: str = './/div[contains(@class, "italic")]'
+xpath_publisher: str = './/div[contains(@class, "text-sm")]'
+xpath_file_info: str = './/div[contains(@class, "text-xs")]'
+
+
+def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
+    search_url: str = base_url + "/search?q={search_query}"
+    params["url"] = search_url.format(search_query=quote(query))
+    return params
+
+
+def response(resp) -> List[Dict[str, Optional[str]]]:
+    results: List[Dict[str, Optional[str]]] = []
+    dom = html.fromstring(resp.text)
+
+    for item in dom.xpath(xpath_results):
+        result: Dict[str, Optional[str]] = {}
+
+        result["url"] = base_url + item.xpath(xpath_url)[0]
+
+        result["title"] = extract_text(eval_xpath(item, xpath_title))
+
+        result["content"] = "{publisher}. {authors}. {file_info}".format(
+            authors=extract_text(eval_xpath(item, xpath_authors)),
+            publisher=extract_text(eval_xpath(item, xpath_publisher)),
+            file_info=extract_text(eval_xpath(item, xpath_file_info)),
+        )
+
+        results.append(result)
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index b6bb0a0e3..561ec41a9 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -297,6 +297,13 @@ engines:
     shortcut: 9g
     disabled: true
 
+  - name: anna's archive
+    engine: annas-archive
+    paging: False
+    categories: files
+    disabled: true
+    shortcut: aa
+
   - name: apk mirror
     engine: apkmirror
     timeout: 4.0

From 7adb9090e5dbc25b0d120772beca01dc4eb0791e Mon Sep 17 00:00:00 2001
From: Paolo Basso <12545838+paolobasso99@users.noreply.github.com>
Date: Sun, 25 Jun 2023 17:24:28 +0200
Subject: [PATCH 2/5] [mod] engine: Anna's Archive - add language support

---
 searx/engines/{annas-archive.py => annas_archive.py} | 9 +++++++--
 searx/settings.yml                                    | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)
 rename searx/engines/{annas-archive.py => annas_archive.py} (87%)

diff --git a/searx/engines/annas-archive.py b/searx/engines/annas_archive.py
similarity index 87%
rename from searx/engines/annas-archive.py
rename to searx/engines/annas_archive.py
index 56d1ca77a..1d5aa41ee 100644
--- a/searx/engines/annas-archive.py
+++ b/searx/engines/annas_archive.py
@@ -36,8 +36,13 @@ xpath_file_info: str = './/div[contains(@class, "text-xs")]'
 
 
 def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
-    search_url: str = base_url + "/search?q={search_query}"
-    params["url"] = search_url.format(search_query=quote(query))
+    search_url: str = base_url + "/search?q={search_query}&lang={lang}"
+    lang: str = ""
+    if params["language"] != "all":
+        lang = params["language"]
+
+    params["url"] = search_url.format(search_query=quote(query), lang=lang)
+    print(params)
     return params
 
 
diff --git a/searx/settings.yml b/searx/settings.yml
index 561ec41a9..8877fba54 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -298,8 +298,8 @@ engines:
     disabled: true
 
   - name: anna's archive
-    engine: annas-archive
-    paging: False
+    engine: annas_archive
+    paging: false
     categories: files
     disabled: true
     shortcut: aa

From eafc2906f1ec6be52e89f5bd364093c5f1e66856 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Tue, 27 Jun 2023 16:17:17 +0200
Subject: [PATCH 3/5] [mod] engine: Anna's Archive - fetch search arguments
 from search form

Signed-off-by: Markus Heiser
---
 searx/data/engine_traits.json  | 132 ++++++++++++++++++++++++++++++++-
 searx/engines/annas_archive.py |  53 ++++++++++++-
 searx/settings.yml             |   2 +-
 3 files changed, 183 insertions(+), 4 deletions(-)

diff --git a/searx/data/engine_traits.json b/searx/data/engine_traits.json
index aef8bae0b..072c9a5c4 100644
--- a/searx/data/engine_traits.json
+++ b/searx/data/engine_traits.json
@@ -1,4 +1,134 @@
 {
+  "annas archive": {
+    "all_locale": "",
+    "custom": {
+      "content": [
+        "",
+        "journal_article",
+        "book_any",
+        "book_fiction",
+        "book_unknown",
+        "book_nonfiction",
+        "book_comic",
+        "magazine",
+        "standards_document"
+      ],
+      "ext": [
+        "",
+        "pdf",
+        "epub",
+        "cbr",
+        "fb2",
+        "mobi",
+        "cbz",
+        "djvu",
+        "azw3",
+        "fb2.zip",
+        "txt",
+        "rar",
+        "zip",
+        "doc",
+        "lit",
+        "rtf",
+        "htm",
+        "html",
+        "lrf",
+        "mht",
+        "docx"
+      ],
+      "sort": [
+        "",
+        "newest",
+        "oldest",
+        "largest",
+        "smallest"
+      ]
+    },
+    "data_type": "traits_v1",
+    "languages": {
+      "af": "af",
+      "ar": "ar",
+      "az": "az",
+      "be": "be",
+      "bg": "bg",
+      "bn": "bn",
+      "bo": "bo",
+      "bs": "bs",
+      "ca": "ca",
+      "cs": "cs",
+      "da": "da",
+      "de": "de",
+      "el": "el",
+      "en": "en",
+      "eo": "eo",
+      "es": "es",
+      "et": "et",
+      "eu": "eu",
+      "fa": "fa",
+      "fi": "fi",
+      "fil": "tl",
+      "fr": "fr",
+      "gl": "gl",
+      "gu": "gu",
+      "he": "he",
+      "hi": "hi",
+      "hr": "hr",
+      "hu": "hu",
+      "hy": "hy",
+      "id": "id",
+      "is": "is",
+      "it": "it",
+      "ja": "ja",
+      "ka": "ka",
+      "kk": "kk",
+      "kn": "kn",
+      "ko": "ko",
+      "ku": "ku",
+      "ky": "ky",
+      "lo": "lo",
+      "lt": "lt",
+      "lv": "lv",
+      "mk": "mk",
+      "ml": "ml",
+      "mn": "mn",
+      "mr": "mr",
+      "ms": "ms",
+      "my": "my",
+      "nb": "nb",
+      "ne": "ne",
+      "nl": "nl",
+      "no": "no",
+      "pa": "pa",
+      "pl": "pl",
+      "ps": "ps",
+      "pt": "pt",
+      "ro": "ro",
+      "ru": "ru",
+      "sa": "sa",
+      "sd": "sd",
+      "si": "si",
+      "sk": "sk",
+      "sl": "sl",
+      "so": "so",
+      "sq": "sq",
+      "sr": "sr",
+      "sv": "sv",
+      "sw": "sw",
+      "ta": "ta",
+      "te": "te",
+      "tg": "tg",
+      "tr": "tr",
+      "tt": "tt",
+      "ug": "ug",
+      "uk": "uk",
+      "ur": "ur",
+      "uz": "uz",
+      "vi": "vi",
+      "yi": "yi",
+      "zh": "zh"
+    },
+    "regions": {}
+  },
   "arch linux wiki": {
     "all_locale": null,
     "custom": {
@@ -4127,4 +4257,4 @@
     },
     "regions": {}
   }
-}
\ No newline at end of file
+}
diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py
index 1d5aa41ee..c845d67c6 100644
--- a/searx/engines/annas_archive.py
+++ b/searx/engines/annas_archive.py
@@ -7,7 +7,8 @@ from typing import List, Dict, Any, Optional
 from urllib.parse import quote
 from lxml import html
 
-from searx.utils import extract_text, eval_xpath
+from searx.utils import extract_text, eval_xpath, eval_xpath_list
+from searx.enginelib.traits import EngineTraits
 
 # about
 about: Dict[str, Any] = {
@@ -42,7 +43,6 @@ def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
         lang = params["language"]
 
     params["url"] = search_url.format(search_query=quote(query), lang=lang)
-    print(params)
     return params
 
 
@@ -66,3 +66,52 @@ def response(resp) -> List[Dict[str, Optional[str]]]:
         results.append(result)
 
     return results
+
+
+def fetch_traits(engine_traits: EngineTraits):
+    """Fetch languages and other search arguments from Anna's search form."""
+    # pylint: disable=import-outside-toplevel
+
+    import babel
+    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
+    from searx.locales import language_tag
+
+    engine_traits.all_locale = ''
+    engine_traits.custom['content'] = []
+    engine_traits.custom['ext'] = []
+    engine_traits.custom['sort'] = []
+
+    resp = get(base_url + '/search')
+    if not resp.ok:  # type: ignore
+        raise RuntimeError("Response from Anna's search page is not OK.")
+    dom = html.fromstring(resp.text)  # type: ignore
+
+    # supported language codes
+
+    lang_map = {}
+    for x in eval_xpath_list(dom, "//form//select[@name='lang']//option"):
+        eng_lang = x.get("value")
+        if eng_lang in ('', '_empty', 'nl-BE', 'und'):
+            continue
+        try:
+            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
+        except babel.UnknownLocaleError:
+            # silently ignore unknown languages
+            # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
+            continue
+        sxng_lang = language_tag(locale)
+        conflict = engine_traits.languages.get(sxng_lang)
+        if conflict:
+            if conflict != eng_lang:
+                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
+            continue
+        engine_traits.languages[sxng_lang] = eng_lang
+
+    for x in eval_xpath_list(dom, "//form//select[@name='content']//option"):
+        engine_traits.custom['content'].append(x.get("value"))
+
+    for x in eval_xpath_list(dom, "//form//select[@name='ext']//option"):
+        engine_traits.custom['ext'].append(x.get("value"))
+
+    for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
+        engine_traits.custom['sort'].append(x.get("value"))
diff --git a/searx/settings.yml b/searx/settings.yml
index 8877fba54..e42373a82 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -297,7 +297,7 @@ engines:
     shortcut: 9g
     disabled: true
 
-  - name: anna's archive
+  - name: annas archive
     engine: annas_archive
     paging: false
     categories: files

From e2df6b77a3985f1c4b4dc0372332209076abf308 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Tue, 27 Jun 2023 18:51:27 +0200
Subject: [PATCH 4/5] [mod] engine: Anna's Archive - additional settings
 (content, sort, ext)

Signed-off-by: Markus Heiser
---
 docs/src/searx.engines.annas_archive.rst |  2 +
 searx/engines/annas_archive.py           | 95 ++++++++++++++++++++++--
 searx/settings.yml                       | 10 ++-
 3 files changed, 98 insertions(+), 9 deletions(-)
 create mode 100644 docs/src/searx.engines.annas_archive.rst

diff --git a/docs/src/searx.engines.annas_archive.rst b/docs/src/searx.engines.annas_archive.rst
new file mode 100644
index 000000000..377f42082
--- /dev/null
+++ b/docs/src/searx.engines.annas_archive.rst
@@ -0,0 +1,2 @@
+.. automodule:: searx.engines.annas_archive
+  :members:
diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py
index c845d67c6..cebc8d45c 100644
--- a/searx/engines/annas_archive.py
+++ b/searx/engines/annas_archive.py
@@ -1,14 +1,59 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Anna's Archive
+""".. _annas_archive engine:
+
+==============
+Anna's Archive
+==============
+
+.. _Anna's Archive: https://annas-archive.org/
+.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive
+
+`Anna's Archive`_ is a free non-profit online shadow library metasearch engine
+providing access to a variety of book resources (also via IPFS), created by a
+team of anonymous archivists (AnnaArchivist_).
+
+.. contents:: Contents
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+
+Configuration
+=============
+
+The engine has the following additional settings:
+
+- :py:obj:`aa_content`
+- :py:obj:`aa_ext`
+- :py:obj:`aa_sort`
+
+With these options a SearXNG maintainer is able to configure **additional**
+engines for specific searches in Anna's Archive.  For example an engine to
+search for the *newest* articles and journals (PDF) via the shortcut ``!aaa <query>``.
+
+.. code:: yaml
+
+  - name: annas articles
+    engine: annas_archive
+    shortcut: aaa
+    aa_content: 'journal_article'
+    aa_ext: 'pdf'
+    aa_sort: 'newest'
+
+
+Implementations
+===============
 
 """
+
 from typing import List, Dict, Any, Optional
 from urllib.parse import quote
 from lxml import html
 
 from searx.utils import extract_text, eval_xpath, eval_xpath_list
 from searx.enginelib.traits import EngineTraits
+from searx.data import ENGINE_TRAITS
 
 # about
 about: Dict[str, Any] = {
@@ -26,6 +71,31 @@ paging: bool = False
 # search-url
 base_url: str = "https://annas-archive.org"
+aa_content: str = ""
+"""Anna's search form field **Content** / possible values::
+
+    journal_article, book_any, book_fiction, book_unknown, book_nonfiction,
+    book_comic, magazine, standards_document
+
+To not filter use an empty string (default).
+"""
+aa_sort: str = ''
+"""Sort Anna's results, possible values::
+
+    newest, oldest, largest, smallest
+
+To sort by *most relevant* use an empty string (default)."""
+
+aa_ext: str = ''
+"""Filter Anna's results by a file ending.  Common filters for example are
+``pdf`` and ``epub``.
+
+.. note::
+
+   Anna's Archive is a beta release: filtering results by file extension does
+   not really work on Anna's Archive.
+
+"""
 
 # xpath queries
 xpath_results: str = '//main//a[starts-with(@href,"/md5")]'
 xpath_url: str = ".//@href"
@@ -36,13 +106,24 @@ xpath_publisher: str = './/div[contains(@class, "text-sm")]'
 xpath_file_info: str = './/div[contains(@class, "text-xs")]'
 
 
-def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
-    search_url: str = base_url + "/search?q={search_query}&lang={lang}"
-    lang: str = ""
-    if params["language"] != "all":
-        lang = params["language"]
+def init(engine_settings=None):  # pylint: disable=unused-argument
+    """Check of engine's settings."""
+    traits = EngineTraits(**ENGINE_TRAITS['annas archive'])
+
+    if aa_content and aa_content not in traits.custom['content']:
+        raise ValueError(f'invalid setting content: {aa_content}')
+
+    if aa_sort and aa_sort not in traits.custom['sort']:
+        raise ValueError(f'invalid setting sort: {aa_sort}')
 
-    params["url"] = search_url.format(search_query=quote(query), lang=lang)
+    if aa_ext and aa_ext not in traits.custom['ext']:
+        raise ValueError(f'invalid setting ext: {aa_ext}')
+
+
+def request(query, params: Dict[str, Any]) -> Dict[str, Any]:
+    q = quote(query)
+    lang = traits.get_language(params["language"], traits.all_locale)  # type: ignore
+    params["url"] = base_url + f"/search?lang={lang or ''}&content={aa_content}&ext={aa_ext}&sort={aa_sort}&q={q}"
     return params
 
 
diff --git a/searx/settings.yml b/searx/settings.yml
index e42373a82..ccf897cc3 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -299,11 +299,17 @@ engines:
 
   - name: annas archive
     engine: annas_archive
-    paging: false
-    categories: files
     disabled: true
     shortcut: aa
 
+  # - name: annas articles
+  #   engine: annas_archive
+  #   shortcut: aaa
+  #   # https://docs.searxng.org/src/searx.engines.annas_archive.html
+  #   aa_content: 'journal_article'  # book_any .. magazine, standards_document
+  #   aa_ext: 'pdf'  # pdf, epub, ..
+  #   aa_sort: 'newest'  # newest, oldest, largest, smallest
+
   - name: apk mirror
     engine: apkmirror
     timeout: 4.0

From 87e7926ae96bc394427859c3688037c0d1710230 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Wed, 28 Jun 2023 09:16:49 +0200
Subject: [PATCH 5/5] [fix] engine: Anna's Archive - grep results from
 '.js-scroll-hidden' elements

The rendering of the WEB page is very strange; except the first position
all other positions of Anna's result page are enclosed in SGML comments.
These comments are *uncommented* by some JS code, see query of the class
'.js-scroll-hidden' in Anna's HTML template [1].

[1] https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html

Signed-off-by: Markus Heiser
---
 searx/engines/annas_archive.py | 40 ++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py
index cebc8d45c..db9bd1719 100644
--- a/searx/engines/annas_archive.py
+++ b/searx/engines/annas_archive.py
@@ -97,14 +97,6 @@ aa_ext: str = ''
 
 """
 
-# xpath queries
-xpath_results: str = '//main//a[starts-with(@href,"/md5")]'
-xpath_url: str = ".//@href"
-xpath_title: str = ".//h3/text()[1]"
-xpath_authors: str = './/div[contains(@class, "italic")]'
-xpath_publisher: str = './/div[contains(@class, "text-sm")]'
-xpath_file_info: str = './/div[contains(@class, "text-xs")]'
-
 
 def init(engine_settings=None):  # pylint: disable=unused-argument
     """Check of engine's settings."""
@@ -131,22 +123,32 @@ def response(resp) -> List[Dict[str, Optional[str]]]:
     results: List[Dict[str, Optional[str]]] = []
     dom = html.fromstring(resp.text)
 
-    for item in dom.xpath(xpath_results):
-        result: Dict[str, Optional[str]] = {}
+    for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
+        results.append(_get_result(item))
 
-        result["url"] = base_url + item.xpath(xpath_url)[0]
+    # The rendering of the WEB page is very strange; except the first position
+    # all other positions of Anna's result page are enclosed in SGML comments.
+    # These comments are *uncommented* by some JS code, see query of class
+    # '.js-scroll-hidden' in Anna's HTML template:
+    # https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
 
-        result["title"] = extract_text(eval_xpath(item, xpath_title))
+    for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
+        item = html.fromstring(item.xpath('./comment()')[0].text)
+        results.append(_get_result(item))
 
-        result["content"] = "{publisher}. {authors}. {file_info}".format(
-            authors=extract_text(eval_xpath(item, xpath_authors)),
-            publisher=extract_text(eval_xpath(item, xpath_publisher)),
-            file_info=extract_text(eval_xpath(item, xpath_file_info)),
-        )
+    return results
 
-        results.append(result)
 
-    return results
+def _get_result(item):
+    return {
+        'template': 'paper.html',
+        'url': base_url + item.xpath('./@href')[0],
+        'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
+        'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
+        'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
+        'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
+        'img_src': item.xpath('.//img/@src')[0],
+    }
 
 
 def fetch_traits(engine_traits: EngineTraits):
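
Note (not part of the patches): the comment-unwrapping used in response() can
be illustrated with a minimal, self-contained lxml sketch.  The markup below
is a simplified, hypothetical stand-in for Anna's real result HTML; only the
'./comment()' approach itself is taken from the patch above.

    from lxml import html

    # Hypothetical markup: the real page wraps every result except the first
    # in an HTML/SGML comment inside a 'js-scroll-hidden' container.
    page = html.fromstring(
        '<div class="results">'
        '<div class="js-scroll-hidden">'
        '<!-- <div><a href="/md5/abc123">Example title</a></div> -->'
        '</div>'
        '</div>'
    )

    for hidden in page.xpath('//div[contains(@class, "js-scroll-hidden")]'):
        # './comment()' selects the comment node; its .text holds the
        # commented-out markup, which is re-parsed as a normal HTML fragment.
        inner = html.fromstring(hidden.xpath('./comment()')[0].text)
        print(inner.xpath('.//a/@href'))   # ['/md5/abc123']
        print(inner.xpath('.//a/text()'))  # ['Example title']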