[mod] add 'Accept-Language' HTTP header to online processors

Most engines that support languages (and regions) use the Accept-Language header
sent by the web browser to build a response that fits the language (and region).

- add new engine option: send_accept_language_header

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
dependabot/pip/master/sphinx-6.1.3
Markus Heiser 2 years ago
parent a2badb4fe4
commit 8df1f0c47e

@ -440,6 +440,7 @@ engine is shown. Most of the options have a default value or even are optional.
engine: example engine: example
shortcut: demo shortcut: demo
base_url: 'https://{language}.example.com/' base_url: 'https://{language}.example.com/'
send_accept_language_header: false
categories: general categories: general
timeout: 3.0 timeout: 3.0
api_key: 'apikey' api_key: 'apikey'
@ -488,6 +489,13 @@ engine is shown. Most of the options have a default value or even are optional.
use multiple sites using only one engine, or updating the site URL without use multiple sites using only one engine, or updating the site URL without
touching at the code. touching at the code.
``send_accept_language_header`` :
Several engines that support languages (or regions) rely on the HTTP header
``Accept-Language`` to build a response that fits the locale. When this
option is activated, the language (locale) selected by the user is used
to build and send an ``Accept-Language`` header in the request to the origin
search engine.
``categories`` : optional ``categories`` : optional
Define in which categories this engine will be active. Most of the time, it is Define in which categories this engine will be active. Most of the time, it is
defined in the code of the engine, but in a few cases it is useful, like when defined in the code of the engine, but in a few cases it is useful, like when

@ -44,6 +44,7 @@ ENGINE_DEFAULT_ARGS = {
"enable_http": False, "enable_http": False,
"using_tor_proxy": False, "using_tor_proxy": False,
"display_error_messages": True, "display_error_messages": True,
"send_accept_language_header": False,
"tokens": [], "tokens": [],
"about": {}, "about": {},
} }

@ -25,6 +25,7 @@ categories = ['general', 'web']
paging = True paging = True
time_range_support = False time_range_support = False
safesearch = False safesearch = False
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general' supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {} language_aliases = {}
@ -68,7 +69,6 @@ def request(query, params):
logger.debug("headers.Referer --> %s", referer) logger.debug("headers.Referer --> %s", referer)
params['url'] = base_url + search_path params['url'] = base_url + search_path
params['headers']['Accept-Language'] = "en-US,en;q=0.5"
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
return params return params

@ -31,6 +31,7 @@ categories = ['images', 'web']
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general' supported_languages_url = 'https://www.bing.com/account/general'
number_of_results = 28 number_of_results = 28

@ -34,6 +34,7 @@ about = {
categories = ['news'] categories = ['news']
paging = True paging = True
time_range_support = True time_range_support = True
send_accept_language_header = True
# search-url # search-url
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'

@ -30,6 +30,7 @@ categories = ['videos', 'web']
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
send_accept_language_header = True
number_of_results = 28 number_of_results = 28
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'
@ -70,10 +71,6 @@ def request(query, params):
if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict:
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']]) params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
# bing videos did not like "older" versions < 70.0.1 when selectin other
# languages then 'en' .. very strange ?!?!
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0.1) Gecko/20100101 Firefox/73.0.1'
return params return params

@ -20,6 +20,7 @@ from json import loads
from urllib.parse import urlencode from urllib.parse import urlencode
engine_type = 'online' engine_type = 'online'
send_accept_language_header = True
categories = ['general'] categories = ['general']
disabled = True disabled = True
timeout = 2.0 timeout = 2.0

@ -31,6 +31,7 @@ categories = ['general', 'web']
paging = True paging = True
supported_languages_url = 'https://duckduckgo.com/util/u588.js' supported_languages_url = 'https://duckduckgo.com/util/u588.js'
time_range_support = True time_range_support = True
send_accept_language_header = True
language_aliases = { language_aliases = {
'ar-SA': 'ar-XA', 'ar-SA': 'ar-XA',

@ -27,6 +27,8 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1' URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/'] WIKIDATA_PREFIX = ['http://www.wikidata.org/entity/', 'https://www.wikidata.org/entity/']
@ -62,7 +64,6 @@ def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query})) params['url'] = URL.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases) language = match_language(params['language'], supported_languages, language_aliases)
language = language.split('-')[0] language = language.split('-')[0]
params['headers']['Accept-Language'] = language
return params return params

@ -30,6 +30,7 @@ about = {
categories = ['images', 'web'] categories = ['images', 'web']
paging = True paging = True
safesearch = True safesearch = True
send_accept_language_header = True
# search-url # search-url
images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}' images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'

@ -45,6 +45,7 @@ categories = ['general', 'web']
paging = True paging = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
use_mobile_ui = False use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages' supported_languages_url = 'https://www.google.com/preferences?#languages'
@ -241,16 +242,6 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
# language. # language.
ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language) ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
# Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5
ret_val['headers']['Accept-Language'] = ','.join(
[
lang_country,
language + ';q=0.8,',
'en;q=0.6',
'*;q=0.5',
]
)
return ret_val return ret_val

@ -51,6 +51,7 @@ paging = False
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'} filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
@ -125,7 +126,6 @@ def request(query, params):
"""Google-Video search request""" """Google-Video search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
query_url = ( query_url = (
'https://' 'https://'

@ -70,13 +70,13 @@ time_range_support = True
# #
# safesearch : results are identitical for safesearch=0 and safesearch=2 # safesearch : results are identitical for safesearch=0 and safesearch=2
safesearch = False safesearch = False
send_accept_language_header = True
def request(query, params): def request(query, params):
"""Google-News search request""" """Google-News search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
# google news has only one domain # google news has only one domain
lang_info['subdomain'] = 'news.google.com' lang_info['subdomain'] = 'news.google.com'

@ -22,6 +22,8 @@ about = {
} }
categories = ["files", "apps"] categories = ["files", "apps"]
send_accept_language_header = True
search_url = "https://play.google.com/store/search?{query}&c=apps" search_url = "https://play.google.com/store/search?{query}&c=apps"

@ -52,6 +52,7 @@ language_support = True
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = False safesearch = False
send_accept_language_header = True
def time_range_url(params): def time_range_url(params):
@ -75,7 +76,6 @@ def request(query, params):
offset = (params['pageno'] - 1) * 10 offset = (params['pageno'] - 1) * 10
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
# subdomain is: scholar.google.xy # subdomain is: scholar.google.xy
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")

@ -60,6 +60,7 @@ language_support = True
use_locale_domain = True use_locale_domain = True
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
RE_CACHE = {} RE_CACHE = {}
@ -111,7 +112,6 @@ def request(query, params):
"""Google-Video search request""" """Google-Video search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False) lang_info = get_lang_info(params, supported_languages, language_aliases, False)
logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language'])
query_url = ( query_url = (
'https://' 'https://'

@ -30,6 +30,7 @@ about = {
categories = ['map'] categories = ['map']
paging = False paging = False
language_support = True language_support = True
send_accept_language_header = True
# search-url # search-url
base_url = 'https://nominatim.openstreetmap.org/' base_url = 'https://nominatim.openstreetmap.org/'
@ -142,9 +143,8 @@ def request(query, params):
params['url'] = base_url + search_string.format(query=urlencode({'q': query})) params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
params['route'] = route_re.match(query) params['route'] = route_re.match(query)
params['headers']['User-Agent'] = searx_useragent() params['headers']['User-Agent'] = searx_useragent()
if 'Accept-Language' not in params['headers']:
accept_language = 'en' if params['language'] == 'all' else params['language'] params['headers']['Accept-Language'] = 'en'
params['headers']['Accept-Language'] = accept_language
return params return params

@ -19,6 +19,9 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
send_accept_language_header = True
# search-url # search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
@ -41,9 +44,6 @@ def request(query, params):
language = url_lang(params['language']) language = url_lang(params['language'])
params['url'] = search_url.format(title=quote(query), language=language) params['url'] = search_url.format(title=quote(query), language=language)
if params['language'].lower() in language_variants.get(language, []):
params['headers']['Accept-Language'] = params['language'].lower()
params['headers']['User-Agent'] = searx_useragent() params['headers']['User-Agent'] = searx_useragent()
params['raise_for_httperror'] = False params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2 params['soft_max_redirects'] = 2

@ -1,6 +1,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
import typing import typing
import babel
class EngineRef: class EngineRef:
@ -29,6 +30,7 @@ class SearchQuery:
'query', 'query',
'engineref_list', 'engineref_list',
'lang', 'lang',
'locale',
'safesearch', 'safesearch',
'pageno', 'pageno',
'time_range', 'time_range',
@ -59,6 +61,13 @@ class SearchQuery:
self.external_bang = external_bang self.external_bang = external_bang
self.engine_data = engine_data or {} self.engine_data = engine_data or {}
self.locale = None
if self.lang:
try:
self.locale = babel.Locale.parse(self.lang, sep='-')
except babel.core.UnknownLocaleError:
pass
@property @property
def categories(self): def categories(self):
return list(set(map(lambda engineref: engineref.category, self.engineref_list))) return list(set(map(lambda engineref: engineref.category, self.engineref_list)))

@ -60,6 +60,17 @@ class OnlineProcessor(EngineProcessor):
# add an user agent # add an user agent
params['headers']['User-Agent'] = gen_useragent() params['headers']['User-Agent'] = gen_useragent()
# add Accept-Language header
if self.engine.send_accept_language_header and search_query.locale:
ac_lang = search_query.locale.language
if search_query.locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
search_query.locale.language,
search_query.locale.territory,
search_query.locale.language,
)
params['headers']['Accept-Language'] = ac_lang
return params return params
def _send_http_request(self, params): def _send_http_request(self, params):

@ -748,6 +748,7 @@ engines:
- name: google play movies - name: google play movies
engine: xpath engine: xpath
send_accept_language_header: true
search_url: https://play.google.com/store/search?q={query}&c=movies search_url: https://play.google.com/store/search?q={query}&c=movies
results_xpath: '//div[@class="ImZGtf mpg5gc"]' results_xpath: '//div[@class="ImZGtf mpg5gc"]'
title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a' title_xpath: './/div[@class="RZEgze"]//div[@class="kCSSQe"]//a'

Loading…
Cancel
Save