Merge pull request #1560 from return42/http-accept-language

[mod] add 'Accept-Language' HTTP header to online processors
Markus Heiser 2 years ago committed by GitHub
commit 7c9c112484

@@ -440,6 +440,7 @@ engine is shown. Most of the options have a default value or even are optional.
   engine: example
   shortcut: demo
   base_url: 'https://{language}.example.com/'
+  send_accept_language_header: false
   categories: general
   timeout: 3.0
   api_key: 'apikey'
@@ -488,6 +489,13 @@ engine is shown. Most of the options have a default value or even are optional.
   use multiple sites using only one engine, or updating the site URL without
   touching at the code.
 
+``send_accept_language_header`` :
+  Several engines that support languages (or regions) deal with the HTTP header
+  ``Accept-Language`` to build a response that fits to the locale.  When this
+  option is activated, the language (locale) that is selected by the user is used
+  to build and send a ``Accept-Language`` header in the request to the origin
+  search engine.
+
 ``categories`` : optional
   Define in which categories this engine will be active.  Most of the time, it is
   defined in the code of the engine, but in a few cases it is useful, like when
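For illustration only (editor's sketch, not part of the merged docs): with this option enabled, the value of the ``Accept-Language`` header is derived from the locale the user selected, using the format added to the online processor further down in this diff. The helper name below is made up for the example.

def accept_language_for(language, territory=None):
    # Mirrors the format string added to OnlineProcessor.get_params() below.
    if territory:
        return "%s-%s,%s;q=0.9,*;q=0.5" % (language, territory, language)
    return language

print(accept_language_for('fr', 'CH'))  # fr-CH,fr;q=0.9,*;q=0.5
print(accept_language_for('de'))        # de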

@@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = {
     "enable_http": False,
     "using_tor_proxy": False,
     "display_error_messages": True,
+    "send_accept_language_header": False,
     "tokens": [],
     "about": {},
 }

@@ -25,6 +25,7 @@ categories = ['general', 'web']
 paging = True
 time_range_support = False
 safesearch = False
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 language_aliases = {}
@@ -68,7 +69,6 @@ def request(query, params):
     logger.debug("headers.Referer --> %s", referer)
 
     params['url'] = base_url + search_path
-    params['headers']['Accept-Language'] = "en-US,en;q=0.5"
     params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
     return params

@@ -31,6 +31,7 @@ categories = ['images', 'web']
 paging = True
 safesearch = True
 time_range_support = True
+send_accept_language_header = True
 supported_languages_url = 'https://www.bing.com/account/general'
 number_of_results = 28

@@ -34,6 +34,7 @@ about = {
 categories = ['news']
 paging = True
 time_range_support = True
+send_accept_language_header = True
 
 # search-url
 base_url = 'https://www.bing.com/'

@@ -30,6 +30,7 @@ categories = ['videos', 'web']
 paging = True
 safesearch = True
 time_range_support = True
+send_accept_language_header = True
 number_of_results = 28
 
 base_url = 'https://www.bing.com/'
@@ -70,10 +71,6 @@ def request(query, params):
     if params['time_range'] in time_range_dict:
         params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
 
-    # bing videos did not like "older" versions < 70.0.1 when selectin other
-    # languages then 'en' .. very strange ?!?!
-    params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
-
     return params

@@ -20,6 +20,7 @@ from json import loads
 from urllib.parse import urlencode
 
 engine_type = 'online'
+send_accept_language_header = True
 categories = ['general']
 disabled = True
 timeout = 2.0

@@ -31,6 +31,7 @@ categories = ['general', 'web']
 paging = True
 supported_languages_url = 'https://duckduckgo.com/util/u588.js'
 time_range_support = True
+send_accept_language_header = True
 
 language_aliases = {
     'ar-SA': 'ar-XA',

@@ -27,6 +27,8 @@ about = {
     "results": 'JSON',
 }
 
+send_accept_language_header = True
+
 URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
 
 WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
@@ -62,7 +64,6 @@ def request(query, params):
     params['url'] = URL.format(query=urlencode({'q': query}))
     language = match_language(params['language'], supported_languages, language_aliases)
     language = language.split('-')[0]
-    params['headers']['Accept-Language'] = language
     return params

@@ -30,6 +30,7 @@ about = {
 categories = ['images', 'web']
 paging = True
 safesearch = True
+send_accept_language_header = True
 
 # search-url
 images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'

@@ -45,6 +45,7 @@ categories = ['general', 'web']
 paging = True
 time_range_support = True
 safesearch = True
+send_accept_language_header = True
 use_mobile_ui = False
 
 supported_languages_url = 'https://www.google.com/preferences?#languages'
@@ -241,16 +242,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
         # language.
         ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
 
-    # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
-    ret_val['headers']['Accept-Language'] = ','.join(
-        [
-            lang_country,
-            language + ';q=0.8,',
-            'en;q=0.6',
-            '*;q=0.5',
-        ]
-    )
-
     return ret_val

@@ -51,6 +51,7 @@ paging = False
 use_locale_domain = True
 time_range_support = True
 safesearch = True
+send_accept_language_header = True
 
 filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
@@ -125,7 +126,6 @@ def request(query, params):
     """Google-Video search request"""
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     query_url = (
         'https://'

@@ -70,13 +70,13 @@ time_range_support = True
 #
 # safesearch : results are identitical for safesearch=0 and safesearch=2
 safesearch = False
+send_accept_language_header = True
 
 
 def request(query, params):
     """Google-News search request"""
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     # google news has only one domain
     lang_info['subdomain'] = 'news.google.com'

@@ -22,6 +22,8 @@ about = {
 }
 
 categories = ["files", "apps"]
+send_accept_language_header = True
+
 search_url = "https://play.google.com/store/search?{query}&c=apps"

@@ -52,6 +52,7 @@ language_support = True
 use_locale_domain = True
 time_range_support = True
 safesearch = False
+send_accept_language_header = True
 
 
 def time_range_url(params):
@@ -75,7 +76,6 @@ def request(query, params):
     offset = (params['pageno'] - 1) * 10
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     # subdomain is: scholar.google.xy
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

@@ -60,6 +60,7 @@ language_support = True
 use_locale_domain = True
 time_range_support = True
 safesearch = True
+send_accept_language_header = True
 
 RE_CACHE = {}
@@ -111,7 +112,6 @@ def request(query, params):
     """Google-Video search request"""
 
     lang_info = get_lang_info(params, supported_languages, language_aliases, False)
-    logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
 
     query_url = (
         'https://'

@@ -30,6 +30,7 @@ about = {
 categories = ['map']
 paging = False
 language_support = True
+send_accept_language_header = True
 
 # search-url
 base_url = 'https://nominatim.openstreetmap.org/'
@@ -142,9 +143,8 @@ def request(query, params):
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['route'] = route_re.match(query)
     params['headers']['User-Agent'] = searx_useragent()
-    accept_language = 'en' if params['language'] == 'all' else params['language']
-    params['headers']['Accept-Language'] = accept_language
+    if 'Accept-Language' not in params['headers']:
+        params['headers']['Accept-Language'] = 'en'
     return params

@@ -19,6 +19,9 @@ about = {
     "results": 'JSON',
 }
 
+send_accept_language_header = True
+
 # search-url
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@@ -41,9 +44,6 @@ def request(query, params):
     language = url_lang(params['language'])
     params['url'] = search_url.format(title=quote(query), language=language)
 
-    if params['language'].lower() in language_variants.get(language, []):
-        params['headers']['Accept-Language'] = params['language'].lower()
-
     params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
     params['soft_max_redirects'] = 2

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
 import typing
+import babel
 
 
 class EngineRef:
@@ -29,6 +30,7 @@ class SearchQuery:
         'query',
         'engineref_list',
         'lang',
+        'locale',
         'safesearch',
         'pageno',
         'time_range',
@@ -59,6 +61,13 @@ class SearchQuery:
         self.external_bang = external_bang
         self.engine_data = engine_data or {}
 
+        self.locale = None
+        if self.lang:
+            try:
+                self.locale = babel.Locale.parse(self.lang, sep='-')
+            except babel.core.UnknownLocaleError:
+                pass
+
     @property
     def categories(self):
         return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
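A quick sketch (editor's addition, not from the PR) of what this parsing yields, and why the ``UnknownLocaleError`` fallback leaves ``SearchQuery.locale`` as ``None``:

import babel

loc = babel.Locale.parse('en-US', sep='-')
print(loc.language, loc.territory)        # en US

try:
    babel.Locale.parse('xx-XX', sep='-')  # well-formed tag, but no known locale data
except babel.core.UnknownLocaleError:
    print('unknown locale -> SearchQuery.locale stays None')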

@@ -138,6 +138,13 @@ class EngineProcessor(ABC):
         return False
 
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if request is not supported.
+
+        Not supported conditions (``None`` is returned):
+
+        - A page-number > 1 when engine does not support paging.
+        - A time range when the engine does not support time range.
+        """
         # if paging is not supported, skip
         if search_query.pageno > 1 and not self.engine.paging:
             return None

@@ -60,6 +60,17 @@ class OnlineProcessor(EngineProcessor):
         # add an user agent
         params['headers']['User-Agent'] = gen_useragent()
 
+        # add Accept-Language header
+        if self.engine.send_accept_language_header and search_query.locale:
+            ac_lang = search_query.locale.language
+            if search_query.locale.territory:
+                ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
+                    search_query.locale.language,
+                    search_query.locale.territory,
+                    search_query.locale.language,
+                )
+            params['headers']['Accept-Language'] = ac_lang
+
         return params
 
     def _send_http_request(self, params):
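To see the header this code produces, here is an editor's sketch that runs the same logic with hypothetical stand-ins for the engine and the search query (``SimpleNamespace`` only mimics the two attributes the code reads; it is not what SearXNG uses):

from types import SimpleNamespace
import babel

engine = SimpleNamespace(send_accept_language_header=True)                   # stand-in engine
search_query = SimpleNamespace(locale=babel.Locale.parse('en-US', sep='-'))  # stand-in query
params = {'headers': {}}

if engine.send_accept_language_header and search_query.locale:
    ac_lang = search_query.locale.language
    if search_query.locale.territory:
        ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
            search_query.locale.language,
            search_query.locale.territory,
            search_query.locale.language,
        )
    params['headers']['Accept-Language'] = ac_lang

print(params['headers'])  # {'Accept-Language': 'en-US,en;q=0.9,*;q=0.5'}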

@@ -38,6 +38,9 @@ class OnlineCurrencyProcessor(OnlineProcessor):
     engine_type = 'online_currency'
 
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if search query does not match
+        to :py:obj:`parser_re`."""
+
         params = super().get_params(search_query, engine_category)
         if params is None:
             return None

@@ -18,6 +18,8 @@ class OnlineDictionaryProcessor(OnlineProcessor):
     engine_type = 'online_dictionary'
 
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if search query does not match
+        to :py:obj:`parser_re`."""
         params = super().get_params(search_query, engine_category)
         if params is None:
             return None

@@ -20,6 +20,9 @@ class OnlineUrlSearchProcessor(OnlineProcessor):
     engine_type = 'online_url_search'
 
     def get_params(self, search_query, engine_category):
+        """Returns a set of *request params* or ``None`` if search query does not match
+        to at least one of :py:obj:`re_search_urls`.
+        """
         params = super().get_params(search_query, engine_category)
         if params is None:
             return None

@@ -748,6 +748,7 @@ engines:
   - name: google play movies
     engine: xpath
+    send_accept_language_header: true
     search_url: https://play.google.com/store/search?q={query}&c=movies
     results_xpath: '//div[@class="ImZGtf mpg5gc"]'
     title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'
