From f1f5e69c425389a5cb7e7a437b3a39c0d7513022 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 5 Jan 2022 13:00:52 +0100 Subject: [PATCH] [fix] startpage engine - avoid captcha Startpage has introduced new anti-scraping measures that make SearXNG instances run into captchas: 1. some arguments have been removed and a new `sc` argument has been added. 2. search path changed from `do/search` to `sp/search` 3. POST request is no longer needed Closes: https://github.com/searxng/searxng/issues/692 Signed-off-by: Markus Heiser --- searx/engines/startpage.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 97891921c..1fd259dad 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -3,6 +3,8 @@ Startpage (Web) """ +from urllib.parse import urlencode + from lxml import html from dateutil import parser from datetime import datetime, timedelta @@ -33,7 +35,7 @@ supported_languages_url = 'https://www.startpage.com/do/settings' # search-url base_url = 'https://startpage.com/' -search_url = base_url + 'do/search' +search_url = base_url + 'sp/search?' 
# specific xpath variables # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] @@ -46,14 +48,12 @@ content_xpath = './/p[@class="w-gl__description"]' # do search-request def request(query, params): - params['url'] = search_url - params['method'] = 'POST' - params['data'] = { + args = { 'query': query, 'page': params['pageno'], 'cat': 'web', - 'cmd': 'process_search', - 'engine0': 'v1all', + # 'abp': "-1", + 'sc': 'Mj4jZy61QETj20', } # set language if specified @@ -61,9 +61,10 @@ def request(query, params): lang_code = match_language(params['language'], supported_languages, fallback=None) if lang_code: language_name = supported_languages[lang_code]['alias'] - params['data']['language'] = language_name - params['data']['lui'] = language_name + args['language'] = language_name + args['lui'] = language_name + params['url'] = search_url + urlencode(args) return params