From ff527e268170852563830bf5b29a65515a98d2bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Sat, 13 Mar 2021 20:27:47 +0100 Subject: [PATCH 1/6] Add Solr engine --- Makefile | 1 + searx/engines/solr.py | 74 +++++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 11 +++++++ 3 files changed, 86 insertions(+) create mode 100644 searx/engines/solr.py diff --git a/Makefile b/Makefile index a4e9110ed..745ff5b91 100644 --- a/Makefile +++ b/Makefile @@ -192,6 +192,7 @@ PYLINT_FILES=\ searx/engines/google_images.py \ searx/engines/mediathekviewweb.py \ searx/engines/solidtorrents.py \ + searx/engines/solr.py \ searx/engines/google_scholar.py \ searx/engines/yahoo_news.py \ searx/engines/apkmirror.py \ diff --git a/searx/engines/solr.py b/searx/engines/solr.py new file mode 100644 index 000000000..0bfcbab36 --- /dev/null +++ b/searx/engines/solr.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Solr +""" + +# pylint: disable=global-statement, missing-function-docstring + +from json import loads +from urllib.parse import urlencode +from searx.exceptions import SearxEngineAPIException + + +base_url = 'http://localhost:8983' +collection = '' +rows = 10 +sort = '' # sorting: asc or desc +field_list = 'name' # list of field names to display on the UI +default_fields = '' # default field to query +query_fields = '' # query fields +_search_url = '' +paging = True + + +def init(_): + if collection == '': + raise ValueError('collection cannot be empty') + + global _search_url + _search_url = base_url + '/solr/' + collection + '/select?{params}' + + +def request(query, params): + query_params = {'q': query, 'rows': rows} + if field_list != '': + query_params['fl'] = field_list + if query_fields != '': + query_params['qf'] = query_fields + if default_fields != '': + query_params['df'] = default_fields + if sort != '': + query_params['sort'] = sort + + if 'pageno' in params: + query_params['start'] = rows * (params['pageno'] - 1) + + params['url'] = _search_url.format(params=urlencode(query_params)) + + return params + + +def response(resp): + resp_json = __get_response(resp) + + results = [] + for result in resp_json['response']['docs']: + r = {key: str(value) for key, value in result.items()} + if len(r) == 0: + continue + r['template'] = 'key-value.html' + results.append(r) + + return results + + +def __get_response(resp): + try: + resp_json = loads(resp.text) + except Exception as e: + raise SearxEngineAPIException("failed to parse response") from e + + if 'error' in resp_json: + raise SearxEngineAPIException(resp_json['error']['msg']) + + return resp_json diff --git a/searx/settings.yml b/searx/settings.yml index 85ba4b2fe..da84e82b5 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -943,6 +943,17 @@ engines: # api_client_id : ******* # api_client_secret : ******* +# - name : solr +# engine : solr +# shortcut : slr +# base_url : http://localhost:8983 +# collection : collection_name +# sort : '' # sorting: asc or desc +# field_list : '' # comma separated list of field names to display on the UI +# default_fields : '' # default field to query +# query_fields : '' # query fields +# enable_http : True + - name : startpage engine : startpage shortcut : sp From 3703ebb22a6ddd9cc2279afd377fa275f35d417c Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 14 Mar 2021 11:49:18 +0100 Subject: [PATCH 2/6] [drop] Acgsou engine - www.acgsou.com no longer exists - https://www.acgsou.com/ acgsou.com is redirected to 36dm.club - @rinpatch do not plan 
on maintaining the engine [1] [1] https://github.com/searx/searx/pull/1283#issuecomment-798783585 Signed-off-by: Markus Heiser --- searx/engines/acgsou.py | 74 ----------------------------------------- searx/settings.yml | 6 ---- 2 files changed, 80 deletions(-) delete mode 100644 searx/engines/acgsou.py diff --git a/searx/engines/acgsou.py b/searx/engines/acgsou.py deleted file mode 100644 index ea9793f10..000000000 --- a/searx/engines/acgsou.py +++ /dev/null @@ -1,74 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -""" - Acgsou (Japanese Animation/Music/Comics Bittorrent tracker) -""" - -from urllib.parse import urlencode -from lxml import html -from searx.utils import extract_text, get_torrent_size, eval_xpath_list, eval_xpath_getindex - -# about -about = { - "website": 'https://www.acgsou.com/', - "wikidata_id": None, - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": 'HTML', -} - -# engine dependent config -categories = ['files', 'images', 'videos', 'music'] -paging = True - -# search-url -base_url = 'https://www.acgsou.com/' -search_url = base_url + 'search.php?{query}&page={offset}' -# xpath queries -xpath_results = '//table[contains(@class, "list_style table_fixed")]//tr[not(th)]' -xpath_category = './/td[2]/a[1]' -xpath_title = './/td[3]/a[last()]' -xpath_torrent_links = './/td[3]/a' -xpath_filesize = './/td[4]/text()' - - -def request(query, params): - query = urlencode({'keyword': query}) - params['url'] = search_url.format(query=query, offset=params['pageno']) - return params - - -def response(resp): - results = [] - dom = html.fromstring(resp.text) - for result in eval_xpath_list(dom, xpath_results): - # defaults - filesize = 0 - magnet_link = "magnet:?xt=urn:btih:{}&tr=https://tracker.acgsou.com:2710/announce" - - category = extract_text(eval_xpath_getindex(result, xpath_category, 0, default=[])) - page_a = eval_xpath_getindex(result, xpath_title, 0) - title = extract_text(page_a) - href = base_url + page_a.attrib.get('href') - - magnet_link = magnet_link.format(page_a.attrib.get('href')[5:-5]) - - filesize_info = eval_xpath_getindex(result, xpath_filesize, 0, default=None) - if filesize_info: - try: - filesize = filesize_info[:-2] - filesize_multiplier = filesize_info[-2:] - filesize = get_torrent_size(filesize, filesize_multiplier) - except: - pass - # I didn't add download/seed/leech count since as I figured out they are generated randomly everytime - content = 'Category: "{category}".' 
- content = content.format(category=category) - - results.append({'url': href, - 'title': title, - 'content': content, - 'filesize': filesize, - 'magnetlink': magnet_link, - 'template': 'torrent.html'}) - return results diff --git a/searx/settings.yml b/searx/settings.yml index 85ba4b2fe..4c5b3c0e5 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -737,12 +737,6 @@ engines: shortcut : nt disabled : True - - name : acgsou - engine : acgsou - shortcut : acg - disabled : True - timeout: 5.0 - - name : openairedatasets engine : json_engine paging : True From 6e1f1085efc670dd548e931834090dce6d9764b3 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 14 Mar 2021 15:13:57 +0100 Subject: [PATCH 3/6] [fix] remove unused import from yahoo-news engine Signed-off-by: Markus Heiser --- searx/engines/yahoo_news.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index 5f6734cb3..49b3d1bf8 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -22,13 +22,6 @@ from searx.utils import ( from searx.engines.yahoo import parse_url -# pylint: disable=unused-import -from searx.engines.yahoo import ( - _fetch_supported_languages, - supported_languages_url, -) -# pylint: enable=unused-import - logger = logger.getChild('yahoo_news engine') # about From f4a0a4d756626ed2b97f233e7a9ee6ac4d1deb8a Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sun, 14 Mar 2021 20:16:16 -0700 Subject: [PATCH 4/6] fix HTTP error in onion engines regression from https://github.com/searx/searx/pull/2641 most onion websites only serve HTTP, so it must be enabled --- searx/settings.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/searx/settings.yml b/searx/settings.yml index 85ba4b2fe..8f6a875b4 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -82,6 +82,8 @@ outgoing: # communication with search engines # https: # - http://proxy1:8080 # - http://proxy2:8080 +# using_tor_proxy : True +# extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy # uncomment below section only if you have more than one network interface # which can be the source of outgoing search requests # source_ips: @@ -159,6 +161,7 @@ engines: - name : ahmia engine : ahmia categories : onions + enable_http : True shortcut : ah - name : arch linux wiki @@ -730,6 +733,8 @@ engines: # Requires Tor - name : not evil engine : not_evil + categories : onions + enable_http : True shortcut : ne - name : nyaa @@ -979,6 +984,7 @@ engines: title_xpath : ./td[2]/b content_xpath : ./td[2]/small categories : onions + enable_http : True shortcut : tch # maybe in a fun category From f97b4ff7b6607f4da66bc0f67b14b29317011cd2 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Mon, 15 Mar 2021 17:21:46 +0100 Subject: [PATCH 5/6] [fix] update youtube_noapi paging --- searx/engines/youtube_noapi.py | 79 +++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 90b93f0a4..5b9e3e3f4 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -4,7 +4,7 @@ """ from functools import reduce -from json import loads +from json import loads, dumps from urllib.parse import quote_plus # about @@ -20,12 +20,15 @@ about = { # engine dependent config categories = ['videos', 'music'] paging = True +language_support = False time_range_support = True # search-url base_url = 'https://www.youtube.com/results' 
search_url = base_url + '?search_query={query}&page={page}' time_range_url = '&sp=EgII{time_range}%253D%253D' +# the key seems to be constant +next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' time_range_dict = {'day': 'Ag', 'week': 'Aw', 'month': 'BA', @@ -40,21 +43,73 @@ base_youtube_url = 'https://www.youtube.com/watch?v=' # do search-request def request(query, params): - params['url'] = search_url.format(query=quote_plus(query), - page=params['pageno']) - if params['time_range'] in time_range_dict: - params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']]) + if not params['engine_data'].get('next_page_token'): + params['url'] = search_url.format(query=quote_plus(query), page=params['pageno']) + if params['time_range'] in time_range_dict: + params['url'] += time_range_url.format(time_range=time_range_dict[params['time_range']]) + else: + print(params['engine_data']['next_page_token']) + params['url'] = next_page_url + params['method'] = 'POST' + params['data'] = dumps({ + 'context': {"client": {"clientName": "WEB", "clientVersion": "2.20210310.12.01"}}, + 'continuation': params['engine_data']['next_page_token'], + }) + params['headers']['Content-Type'] = 'application/json' return params # get response from search-request def response(resp): + if resp.search_params.get('engine_data'): + return parse_next_page_response(resp.text) + return parse_first_page_response(resp.text) + + +def parse_next_page_response(response_text): results = [] + result_json = loads(response_text) + with open("/tmp/x", "w") as f: + f.write(response_text) + for section in (result_json['onResponseReceivedCommands'][0] + .get('appendContinuationItemsAction')['continuationItems'][0] + .get('itemSectionRenderer')['contents']): + if 'videoRenderer' not in section: + continue + section = section['videoRenderer'] + content = "-" + if 'descriptionSnippet' in section: + content = ' '.join(x['text'] for x in section['descriptionSnippet']['runs']) + results.append({ + 'url': base_youtube_url + section['videoId'], + 'title': ' '.join(x['text'] for x in section['title']['runs']), + 'content': content, + 'author': section['ownerText']['runs'][0]['text'], + 'length': section['lengthText']['simpleText'], + 'template': 'videos.html', + 'embedded': embedded_url.format(videoid=section['videoId']), + 'thumbnail': section['thumbnail']['thumbnails'][-1]['url'], + }) + try: + token = result_json['onResponseReceivedCommands'][0]\ + .get('appendContinuationItemsAction')['continuationItems'][1]\ + .get('continuationItemRenderer')['continuationEndpoint']\ + .get('continuationCommand')['token'] + results.append({ + "engine_data": token, + "key": "next_page_token", + }) + except: + pass - results_data = resp.text[resp.text.find('ytInitialData'):] + return results + + +def parse_first_page_response(response_text): + results = [] + results_data = response_text[response_text.find('ytInitialData'):] results_data = results_data[results_data.find('{'):results_data.find(';')] - results_json = loads(results_data) if results_data else {} sections = results_json.get('contents', {})\ .get('twoColumnSearchResultsRenderer', {})\ @@ -63,6 +118,16 @@ def response(resp): .get('contents', []) for section in sections: + if "continuationItemRenderer" in section: + next_page_token = section["continuationItemRenderer"]\ + .get("continuationEndpoint", {})\ + .get("continuationCommand", {})\ + .get("token", "") + if next_page_token: + results.append({ + "engine_data": 
next_page_token, + "key": "next_page_token", + }) for video_container in section.get('itemSectionRenderer', {}).get('contents', []): video = video_container.get('videoRenderer', {}) videoid = video.get('videoId') From 8158d8654a045cd15c9ae94facf79b89473ba092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Mon, 15 Mar 2021 20:21:28 +0100 Subject: [PATCH 6/6] fix Microsoft Academic engine --- searx/engines/microsoft_academic.py | 57 +++++++++++++---------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py index 14de4ac9a..82a5d3550 100644 --- a/searx/engines/microsoft_academic.py +++ b/searx/engines/microsoft_academic.py @@ -3,10 +3,7 @@ Microsoft Academic (Science) """ -from datetime import datetime -from json import loads -from uuid import uuid4 -from urllib.parse import urlencode +from json import dumps, loads from searx.utils import html_to_text # about @@ -21,26 +18,25 @@ about = { categories = ['images'] paging = True -result_url = 'https://academic.microsoft.com/api/search/GetEntityResults?{query}' +search_url = 'https://academic.microsoft.com/api/search' +_paper_url = 'https://academic.microsoft.com/paper/{id}/reference' def request(query, params): - correlation_id = uuid4() - msacademic = uuid4() - time_now = datetime.now() - - params['url'] = result_url.format(query=urlencode({'correlationId': correlation_id})) - params['cookies']['msacademic'] = str(msacademic) - params['cookies']['ai_user'] = 'vhd0H|{now}'.format(now=str(time_now)) + params['url'] = search_url params['method'] = 'POST' - params['data'] = { - 'Query': '@{query}@'.format(query=query), - 'Limit': 10, - 'Offset': params['pageno'] - 1, - 'Filters': '', - 'OrderBy': '', - 'SortAscending': False, - } + params['headers']['content-type'] = 'application/json; charset=utf-8' + params['data'] = dumps({ + 'query': query, + 'queryExpression': '', + 'filters': [], + 'orderBy': 0, + 'skip': (params['pageno'] - 1) * 10, + 'sortAscending': True, + 'take': 10, + 'includeCitationContexts': False, + 'profileId': '', + }) return params @@ -51,10 +47,13 @@ def response(resp): if not response_data: return results - for result in response_data['results']: - url = _get_url(result) - title = result['e']['dn'] - content = _get_content(result) + for result in response_data['pr']: + if 'dn' not in result['paper']: + continue + + title = result['paper']['dn'] + content = _get_content(result['paper']) + url = _paper_url.format(id=result['paper']['id']) results.append({ 'url': url, 'title': html_to_text(title), @@ -64,15 +63,9 @@ def response(resp): return results -def _get_url(result): - if 's' in result['e']: - return result['e']['s'][0]['u'] - return 'https://academic.microsoft.com/#/detail/{pid}'.format(pid=result['id']) - - def _get_content(result): - if 'd' in result['e']: - content = result['e']['d'] + if 'd' in result: + content = result['d'] if len(content) > 300: return content[:300] + '...' return content
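
For reference, a standalone sketch (not part of the patch series) of the request/response shape the updated microsoft_academic engine uses after this last patch. The endpoint URL, the JSON payload fields, and the 'pr'/'paper'/'dn'/'id' response keys are taken from the hunks above; the query string, the use of the requests library, and the timeout value are illustrative assumptions, and the service may no longer answer at this endpoint.

    # Sketch only: exercise the same POST payload the patched engine builds.
    import json
    import requests

    search_url = 'https://academic.microsoft.com/api/search'
    paper_url = 'https://academic.microsoft.com/paper/{id}/reference'

    payload = {
        'query': 'graph neural networks',   # hypothetical query, not from the patch
        'queryExpression': '',
        'filters': [],
        'orderBy': 0,
        'skip': 0,                          # the engine uses (pageno - 1) * 10
        'sortAscending': True,
        'take': 10,
        'includeCitationContexts': False,
        'profileId': '',
    }

    resp = requests.post(
        search_url,
        data=json.dumps(payload),
        headers={'content-type': 'application/json; charset=utf-8'},
        timeout=10.0,                       # assumed; the engine relies on searx defaults
    )

    # Mirror the patched response(): iterate 'pr', skip papers without a
    # display name ('dn'), and build the result URL from the paper id.
    for entry in resp.json().get('pr', []):
        paper = entry.get('paper', {})
        if 'dn' in paper:
            print(paper['dn'], '->', paper_url.format(id=paper['id']))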