Merge pull request #2027 from dalf/fix_2018

Add "auto" as a language.
This commit is contained in:
Alexandre Flament 2023-02-20 12:17:38 +01:00 committed by GitHub
commit d669da81fb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 162 additions and 115 deletions

View File

@ -1,8 +0,0 @@
.. _autodetect search language:
======================
Search language plugin
======================
.. automodule:: searx.plugins.autodetect_search_language
:members:

View File

@ -1,97 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Plugin to detect the search language from the search query.
The language detection is done by using the fastText_ library (`python
fasttext`_). fastText_ distributes the `language identification model`_, for
reference:
- `FastText.zip: Compressing text classification models`_
- `Bag of Tricks for Efficient Text Classification`_
The `language identification model`_ supports the language codes (ISO-639-3)::
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr
ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa
fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io
is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv
mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn
no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd
sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep
vi vls vo wa war wuu xal xmf yi yo yue zh
The `language identification model`_ is harmonized with the SearXNG's language
(locale) model. General conditions of SearXNG's locale model are:
a. SearXNG's locale of a query is passed to the
:py:obj:`searx.locales.get_engine_locale` to get a language and/or region
code that is used by an engine.
b. SearXNG and most of the engines do not support all the languages from the
language model and there might also be a discrepancy in the ISO-639-3 and
ISO-639-2 handling (:py:obj:`searx.locales.get_engine_locale`).
Furthermore, in SearXNG the locales like ``zh-TW`` (``zh-CN``) are mapped to
``zh_Hant`` (``zh_Hans``).
Conclusion: This plugin only auto-detects the languages a user can select in
the language menu (:py:obj:`supported_langs`).
SearXNG's locale of a query comes from (*highest wins*):
1. The ``Accept-Language`` header from user's HTTP client.
2. The user selects a locale in the preferences.
3. The user selects a locale from the menu in the query form (e.g. ``:zh-TW``).
4. This plugin is activated in the preferences and the locale (only the language
code / no region code) comes from the fastText's language detection.
Conclusion: There is a conflict between the language selected by the user and
the language from language detection of this plugin. For example, the user
explicitly selects the German locale via the search syntax to search for a term
that is identified as an English term (try ``:de-DE thermomix``, for example).
.. hint::
To SearXNG maintainers; please take into account: under some circumstances
the auto-detection of the language of this plugin could be detrimental to
users' expectations. It's not recommended to activate this plugin by
default. It should always be the user's decision whether to activate this
plugin or not.
.. _fastText: https://fasttext.cc/
.. _python fasttext: https://pypi.org/project/fasttext/
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
"""
from flask_babel import gettext
import babel
from searx.utils import detect_language
from searx.languages import language_codes
name = gettext('Autodetect search language')
description = gettext('Automatically detect the query search language and switch to it.')
preference_section = 'general'
default_on = False
supported_langs = set()
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
def pre_search(request, search):  # pylint: disable=unused-argument
    """Switch the search query to the language detected from its text.

    The query text is passed to ``detect_language``.  When the detected
    language is one of :py:obj:`supported_langs`, ``search.search_query.lang``
    is set to it and ``search.search_query.locale`` is set to the matching
    babel locale.  If babel does not know the locale, the previous ``locale``
    value is left untouched.

    :param request: unused
    :param search: object whose ``search_query`` is inspected and updated
    :returns: always ``True``
    """
    detected = detect_language(search.search_query.query, min_probability=0)
    if detected not in supported_langs:
        # nothing detected, or a language the user could not have selected
        return True

    search.search_query.lang = detected
    try:
        search.search_query.locale = babel.Locale.parse(detected)
    except babel.core.UnknownLocaleError:
        # best effort: keep whatever locale the query already carried
        pass
    return True
def init(app, settings):  # pylint: disable=unused-argument
    """Populate :py:obj:`supported_langs` from ``language_codes``.

    For every entry of ``language_codes`` the first element is taken and
    reduced to the part before the first ``-`` (``en-US`` becomes ``en``).

    :param app: unused
    :param settings: unused
    :returns: always ``True``
    """
    supported_langs.update(entry[0].split('-')[0] for entry in language_codes)
    return True

View File

@ -154,7 +154,7 @@ class SearchLanguageSetting(EnumStringSetting):
"""Available choices may change, so user's value may not be in choices anymore"""
def _validate_selection(self, selection):
    """Reject a language selection that is not syntactically acceptable.

    The empty string and the special value ``'auto'`` are always accepted;
    any other value has to match ``VALID_LANGUAGE_CODE``.

    :param selection: the language value to check
    :raises ValidationException: if ``selection`` is not accepted
    """
    if selection in ('', 'auto'):
        return
    if not VALID_LANGUAGE_CODE.match(selection):
        raise ValidationException('Invalid language code: "{0}"'.format(selection))
def parse(self, data: str):

View File

@ -104,7 +104,7 @@ class LanguageParser(QueryPartParser):
break
# user may set a valid, yet not selectable language
if VALID_LANGUAGE_CODE.match(value):
if VALID_LANGUAGE_CODE.match(value) or value == 'auto':
lang_parts = value.split('-')
if len(lang_parts) > 1:
value = lang_parts[0].lower() + '-' + lang_parts[1].upper()

View File

@ -3,10 +3,12 @@
# pylint: disable=missing-module-docstring, too-few-public-methods
import threading
from copy import copy
from timeit import default_timer
from uuid import uuid4
import flask
import babel
from searx import settings
from searx.answerers import ask
@ -20,6 +22,7 @@ from searx.network import initialize as initialize_network, check_network_config
from searx.metrics import initialize as initialize_metrics, counter_inc, histogram_observe_time
from searx.search.processors import PROCESSORS, initialize as initialize_processors
from searx.search.checker import initialize as initialize_checker
from searx.utils import detect_language
logger = logger.getChild('search')
@ -37,18 +40,57 @@ def initialize(settings_engines=None, enable_checker=False, check_network=False,
initialize_checker()
def replace_auto_language(search_query: SearchQuery):
    """Resolve the pseudo language ``"auto"`` of ``search_query`` in place.

    Nothing happens unless ``search_query.lang`` is ``"auto"``.  Otherwise:

    * ``search_query.lang`` becomes the language detected from
      ``search_query.query``, or ``"all"`` when no language is detected.
    * ``search_query.locale`` becomes the babel locale of the detected
      language, or ``None`` when nothing was detected or babel does not know
      the locale.

    Detection uses :py:obj:`searx.utils.detect_language` with
    ``only_search_languages=True`` to keep only languages supported by the
    engines.
    """
    if search_query.lang != 'auto':
        return

    language = detect_language(search_query.query, threshold=0.0, only_search_languages=True)
    if language is None:
        # fallback to 'all' if no language has been detected
        search_query.lang, search_query.locale = 'all', None
        return

    search_query.lang = language
    try:
        locale = babel.Locale.parse(language)
    except babel.core.UnknownLocaleError:
        locale = None
    search_query.locale = locale
class Search:
"""Search information container"""
__slots__ = "search_query", "result_container", "start_time", "actual_timeout"
def __init__(self, search_query: SearchQuery):
    """Initialize the Search.

    ``search_query`` is copied, so the caller's object keeps the values it
    was created with.  The copy is then passed to
    :py:obj:`Search.update_search_query`, which may modify it.

    :param search_query: the query to run; only a copy of it is stored
    """
    # init vars
    super().__init__()
    self.result_container = ResultContainer()
    self.start_time = None
    self.actual_timeout = None
    # The caller's object is never stored directly; only the copy is kept
    # and updated.
    self.search_query = copy(search_query)
    self.update_search_query(self.search_query)
def update_search_query(self, search_query: SearchQuery):
    """Adjust ``search_query`` in place before it is used.

    Delegates to ``replace_auto_language`` so that a query whose language is
    ``"auto"`` gets a concrete language.

    :param search_query: the query object to update
    """
    replace_auto_language(search_query)
def search_external_bang(self):
"""

View File

@ -109,3 +109,16 @@ class SearchQuery:
self.external_bang,
)
)
def __copy__(self):
    """Support :py:func:`copy.copy`.

    Builds a new ``SearchQuery`` from this instance's attributes, passed
    positionally.  The attribute values themselves are not copied, so the
    new object shares them with this one.
    """
    constructor_args = (
        self.query,
        self.engineref_list,
        self.lang,
        self.safesearch,
        self.pageno,
        self.time_range,
        self.timeout_limit,
        self.external_bang,
        self.engine_data,
    )
    return SearchQuery(*constructor_args)

View File

@ -18,7 +18,7 @@ searx_dir = abspath(dirname(__file__))
logger = logging.getLogger('searx')
OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss']
LANGUAGE_CODES = ['all'] + list(l[0] for l in languages)
LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages)
SIMPLE_STYLE = ('auto', 'light', 'dark')
CATEGORIES_AS_TABS = {
'general': {},

View File

@ -1,5 +1,9 @@
<select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>
{{- _('Auto-detect') -}}
{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%}
</option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>
{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}

View File

@ -116,12 +116,15 @@
<p class="value">{{- '' -}}
<select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option>
{%- endfor -%}
</select>{{- '' -}}
</p>
<div class="description" id="desc_language">{{ _('What language do you prefer for search?') }}</div>
<div class="description" id="desc_language">
{{- _('What language do you prefer for search?') }} {{ _('Choose Auto-detect to let SearXNG detect the language of your query.') -}}
</div>
</fieldset>
{% endif %}
{% if 'autocomplete' not in locked_preferences %}

View File

@ -53,6 +53,9 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
"""fasttext model to predict language of a search term"""
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes])
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
class _NotSetClass: # pylint: disable=too-few-public-methods
"""Internal class for this module, do not create instance of this class.
@ -637,11 +640,72 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText":
return _FASTTEXT_MODEL
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
    """Detect the language of the ``text`` parameter.

    :param str text: The string whose language is to be detected.

    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.

    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages.  See :py:obj:`searx.languages`.

    :rtype: str, None
    :returns:
        The detected language code or ``None``.  See below.

    :raises ValueError: If ``text`` is not a string.

    The language detection is done by using `a fork`_ of the fastText_ library
    (`python fasttext`_).  fastText_ distributes the `language identification
    model`_, for reference:

    - `FastText.zip: Compressing text classification models`_
    - `Bag of Tricks for Efficient Text Classification`_

    The `language identification model`_ supports the language codes
    (ISO-639-3)::

        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

    By using ``only_search_languages=True`` the `language identification model`_
    is harmonized with the SearXNG's language (locale) model.  General
    conditions of SearXNG's locale model are:

    a. SearXNG's locale of a query is passed to the
       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
       code that is used by an engine.

    b. Most of SearXNG's engines do not support all the languages from `language
       identification model`_ and there is also a discrepancy in the ISO-639-3
       (fastText) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the
       locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
       (``zh_Hans``) while the `language identification model`_ reduces both to
       ``zh``.

    .. _a fork: https://github.com/searxng/fasttext-predict
    .. _fastText: https://fasttext.cc/
    .. _python fasttext: https://pypi.org/project/fasttext/
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651

    """
    if not isinstance(text, str):
        raise ValueError('text must a str')
    # The text is predicted as a single line, so newlines are replaced by
    # spaces first.
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    # Accept only a (labels, probabilities) pair with at least one entry each.
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
        # labels look like "__label__<code>"; keep only the code
        language = r[0][0].split('__label__')[1]
        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
            return None
        return language
    return None

View File

@ -63,7 +63,7 @@ def parse_lang(preferences: Preferences, form: Dict[str, str], raw_text_query: R
query_lang = preferences.get_value('language')
# check language
if not VALID_LANGUAGE_CODE.match(query_lang):
if not VALID_LANGUAGE_CODE.match(query_lang) and query_lang != 'auto':
raise SearxParameterException('language', query_lang)
return query_lang

View File

@ -810,6 +810,9 @@ def search():
)
)
# search_query.lang contains the user choice (all, auto, en, ...)
# when the user choice is "auto", search.search_query.lang contains the detected language
# otherwise it is equal to search_query.lang
return render(
# fmt: off
'results.html',
@ -834,6 +837,11 @@ def search():
settings['search']['languages'],
fallback=request.preferences.get_value("language")
),
search_language = match_language(
search.search_query.lang,
settings['search']['languages'],
fallback=request.preferences.get_value("language")
),
timeout_limit = request.form.get('timeout_limit', None)
# fmt: on
)

View File

@ -91,6 +91,17 @@ class TestLanguageParser(SearxTestCase):
self.assertIn('all', query.languages)
self.assertFalse(query.specific)
def test_auto_language_code(self):
    """``:auto`` is parsed as a language part and recorded in ``languages``."""
    language = 'auto'
    query_text = 'una consulta'
    full_query = f':{language} {query_text}'

    parsed = RawTextQuery(full_query, [])

    self.assertEqual(parsed.getFullQuery(), full_query)
    self.assertEqual(len(parsed.query_parts), 1)
    self.assertIn('auto', parsed.languages)
    self.assertFalse(parsed.specific)
def test_invalid_language_code(self):
language = 'not_a_language'
query_text = 'the query'

View File

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
from copy import copy
import searx.search
from searx.search import SearchQuery, EngineRef
from searx import settings
@ -34,6 +36,11 @@ class SearchQueryTestCase(SearxTestCase):
self.assertEqual(s, s)
self.assertNotEqual(s, t)
def test_copy(self):
    """``copy.copy`` of a ``SearchQuery`` compares equal to the original."""
    original = SearchQuery('test', [EngineRef('bing', 'general')], 'all', 0, 1, None, None, None)
    duplicate = copy(original)
    self.assertEqual(original, duplicate)
class SearchTestCase(SearxTestCase):
@classmethod