@ -5,10 +5,54 @@ are shared by other engines:
- : ref : ` wikidata engine `
The list of supported languages is fetched from the article linked by
: py : obj : ` wikipedia_article_depth ` . Unlike traditional search engines , wikipedia
does not support one Wikipedia for all the languages , but there is one Wikipedia
for every language ( : py : obj : ` fetch_traits ` ) .
The list of supported languages is : py : obj : ` fetched < fetch_wikimedia_traits > ` from
the article linked by : py : obj : ` list_of_wikipedias ` .
Unlike traditional search engines , wikipedia does not support one Wikipedia for
all languages , but there is one Wikipedia for each supported language . Some of
these Wikipedias have a LanguageConverter_ enabled
( : py : obj : ` rest_v1_summary_url ` ) .
A LanguageConverter_ ( LC ) is a system based on language variants that
automatically converts the content of a page into a different variant . A variant
is mostly the same language in a different script .
- ` Wikipedias in multiple writing systems ` _
- ` Automatic conversion between traditional and simplified Chinese characters ` _
PR - 2554 _ :
The Wikipedia link returned by the API is still the same in all cases
( ` https : / / zh . wikipedia . org / wiki / 出租車 ` _ ) but if your browser ' s
` ` Accept - Language ` ` is set to any of ` ` zh ` ` , ` ` zh - CN ` ` , ` ` zh - TW ` ` , ` ` zh - HK ` `
or . . Wikipedia ' s LC automatically returns the desired script in their
web - page .
- You can test the API here : https : / / reqbin . com / gesg2kvx
. . _https : / / zh . wikipedia . org / wiki / 出租車 :
https : / / zh . wikipedia . org / wiki / % E5 % 87 % BA % E7 % A7 % 9 F % E8 % BB % 8 A
To support Wikipedia ' s LanguageConverter_, a SearXNG request to Wikipedia uses
: py : obj : ` get_wiki_params ` and : py : obj : ` wiki_lc_locale_variants ' in the
: py : obj : ` fetch_wikimedia_traits ` function .
To test in SearXNG , query for ` ` ! wp 出租車 ` ` with each of the available Chinese
options :
- ` ` ! wp 出租車 : zh ` ` should show 出租車
- ` ` ! wp 出租車 : zh - CN ` ` should show 出租车
- ` ` ! wp 出租車 : zh - TW ` ` should show 計程車
- ` ` ! wp 出租車 : zh - HK ` ` should show 的士
- ` ` ! wp 出租車 : zh - SG ` ` should show 德士
. . _LanguageConverter :
https : / / www . mediawiki . org / wiki / Writing_systems #LanguageConverter
. . _Wikipedias in multiple writing systems :
https : / / meta . wikimedia . org / wiki / Wikipedias_in_multiple_writing_systems
. . _Automatic conversion between traditional and simplified Chinese characters :
https : / / en . wikipedia . org / wiki / Chinese_Wikipedia #Automatic_conversion_between_traditional_and_simplified_Chinese_characters
. . _PR - 2554 : https : / / github . com / searx / searx / pull / 2554
"""
import urllib . parse
@ -16,8 +60,9 @@ import babel
from lxml import html
from searx import utils
from searx import network
from searx . locales import language_tag
from searx import locales
from searx . enginelib . traits import EngineTraits
traits : EngineTraits
@ -33,6 +78,12 @@ about = {
}
send_accept_language_header = True
""" The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled . """
list_of_wikipedias = ' https://meta.wikimedia.org/wiki/List_of_Wikipedias '
""" `List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""
wikipedia_article_depth = ' https://meta.wikimedia.org/wiki/Wikipedia_article_depth '
""" The *editing depth* of Wikipedia is one of several possible rough indicators
@ -41,29 +92,68 @@ are updated. The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized .
"""
# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = ' https:// {wiki_netloc} /api/rest_v1/page/summary/ {title} '
""" `wikipedia rest_v1 summary API`_: The summary response includes an extract of
the first paragraph of the page in plain text and HTML as well as the type of
page . This is useful for page previews ( fka . Hovercards , aka . Popups ) on the web
and link previews in the apps .
"""
` wikipedia rest_v1 summary API ` _ :
The summary response includes an extract of the first paragraph of the page in
plain text and HTML as well as the type of page . This is useful for page
previews ( fka . Hovercards , aka . Popups ) on the web and link previews in the
apps .
HTTP ` ` Accept - Language ` ` header ( : py : obj : ` send_accept_language_header ` ) :
The desired language variant code for wikis where LanguageConverter_ is
enabled .
. . _wikipedia rest_v1 summary API : https : / / en . wikipedia . org / api / rest_v1 / #/Page%20content/get_page_summary__title_
. . _wikipedia rest_v1 summary API :
https : / / en . wikipedia . org / api / rest_v1 / #/Page%20content/get_page_summary__title_
"""
wiki_lc_locale_variants = {
" zh " : (
" zh-CN " ,
" zh-HK " ,
" zh-MO " ,
" zh-MY " ,
" zh-SG " ,
" zh-TW " ,
) ,
" zh-classical " : ( " zh-classical " , ) ,
}
""" Mapping rule of the LanguageConverter_ to map a language and its variants to
a Locale ( used in the HTTP ` ` Accept - Language ` ` header ) . For example see ` LC
Chinese ` _ .
. . _LC Chinese :
https : / / meta . wikimedia . org / wiki / Wikipedias_in_multiple_writing_systems #Chinese
"""
wikipedia_script_variants = {
" zh " : (
" zh_Hant " ,
" zh_Hans " ,
)
}
def get_wiki_params ( sxng_locale , eng_traits ) :
""" Returns the Wikipedia language tag and the netloc that fits to the
` ` sxng_locale ` ` . To support LanguageConverter_ this function rates a locale
( region ) higher than a language ( compare : py : obj : ` wiki_lc_locale_variants ` ) .
"""
eng_tag = eng_traits . get_region ( sxng_locale , eng_traits . get_language ( sxng_locale , ' en ' ) )
wiki_netloc = eng_traits . custom [ ' wiki_netloc ' ] . get ( eng_tag , ' en.wikipedia.org ' )
return eng_tag , wiki_netloc
def request ( query , params ) :
""" Assemble a request (`wikipedia rest_v1 summary API`_). """
if query . islower ( ) :
query = query . title ( )
engine_language = traits . get_language ( params [ ' searxng_locale ' ] , ' en ' )
wiki_netloc = traits . custom [ ' wiki_netloc ' ] . get ( engine_language , ' https://en.wikipedia.org/wiki/ ' )
_eng_tag , wiki_netloc = get_wiki_params ( params [ ' searxng_locale ' ] , traits )
title = urllib . parse . quote ( query )
# '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
# '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
params [ ' url ' ] = rest_v1_summary_url . format ( wiki_netloc = wiki_netloc , title = title )
params [ ' raise_for_httperror ' ] = False
@ -93,7 +183,7 @@ def response(resp):
network . raise_for_httperror ( resp )
api_result = resp . json ( )
title = api_result[ ' title ' ]
title = utils. html_to_text ( api_result . get ( ' titles ' , { } ) . get ( ' display ' ) or api_result . get ( ' title ' ) )
wikipedia_link = api_result [ ' content_urls ' ] [ ' desktop ' ] [ ' page ' ]
results . append ( { ' url ' : wikipedia_link , ' title ' : title , ' content ' : api_result . get ( ' description ' , ' ' ) } )
@ -116,44 +206,38 @@ def response(resp):
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).
lang_map = {
' be-tarask ' : ' bel ' ,
' ak ' : ' aka ' ,
' als ' : ' gsw ' ,
' bat-smg ' : ' sgs ' ,
' cbk-zam ' : ' cbk ' ,
' fiu-vro ' : ' vro ' ,
' map-bms ' : ' map ' ,
' nrm ' : ' nrf ' ,
' roa-rup ' : ' rup ' ,
' nds-nl ' : ' nds ' ,
#'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
' zh-min-nan ' : ' nan ' ,
' zh-yue ' : ' yue ' ,
' an ' : ' arg ' ,
' zh-classical ' : ' zh-Hant ' , # babel maps classical to zh-Hans (for whatever reason)
}
unknown_langs = [
' an ' , # Aragonese
' ba ' , # Bashkir
' bar ' , # Bavarian
' bcl ' , # Central Bicolano
' be-tarask ' , # Belarusian variant / Belarusian is already covered by 'be'
' bpy ' , # Bishnupriya Manipuri is unknown by babel
' hif ' , # Fiji Hindi
' ilo ' , # Ilokano
' li ' , # Limburgish
' sco ' , # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
' sh ' , # Serbo-Croatian
' simple ' , # simple english is not know as a natural language different to english (babel)
' vo ' , # Volapük
' wa ' , # Walloon
]
lang_map = locales . LOCALE_BEST_MATCH . copy ( )
lang_map . update (
{
' be-tarask ' : ' bel ' ,
' ak ' : ' aka ' ,
' als ' : ' gsw ' ,
' bat-smg ' : ' sgs ' ,
' cbk-zam ' : ' cbk ' ,
' fiu-vro ' : ' vro ' ,
' map-bms ' : ' map ' ,
' no ' : ' nb-NO ' ,
' nrm ' : ' nrf ' ,
' roa-rup ' : ' rup ' ,
' nds-nl ' : ' nds ' ,
#'simple: – invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
' zh-min-nan ' : ' nan ' ,
' zh-yue ' : ' yue ' ,
' an ' : ' arg ' ,
}
)
def fetch_traits ( engine_traits : EngineTraits ) :
""" Fetch languages from Wikipedia.
fetch_wikimedia_traits ( engine_traits )
print ( " WIKIPEDIA_LANGUAGES: %s " % len ( engine_traits . custom [ ' WIKIPEDIA_LANGUAGES ' ] ) )
def fetch_wikimedia_traits ( engine_traits : EngineTraits ) :
""" Fetch languages from Wikipedia. Not all languages from the
: py : obj : ` list_of_wikipedias ` are supported by SearXNG locales , only those
known from : py : obj : ` searx . locales . LOCALE_NAMES ` or those with a minimal
: py : obj : ` editing depth < wikipedia_article_depth > ` .
The location of the Wikipedia address of a language is mapped in a
: py : obj : ` custom field < searx . enginelib . traits . EngineTraits . custom > `
@ -169,15 +253,21 @@ def fetch_traits(engine_traits: EngineTraits):
" zh " : " zh.wikipedia.org " ,
" zh-classical " : " zh-classical.wikipedia.org "
}
"""
# pylint: disable=too-many-branches
engine_traits . custom [ ' wiki_netloc ' ] = { }
engine_traits . custom [ ' WIKIPEDIA_LANGUAGES ' ] = [ ]
# insert alias to map from a script or region to a wikipedia variant
# insert alias to map from a region like zh-CN to a language zh_Hans
engine_traits . languages [ ' zh_Hans ' ] = ' zh '
for eng_tag , sxng_tag_list in wikipedia_script_variants . items ( ) :
for sxng_tag in sxng_tag_list :
engine_traits . languages [ sxng_tag ] = eng_tag
for eng_tag , sxng_tag_list in wiki_lc_locale_variants . items ( ) :
for sxng_tag in sxng_tag_list :
engine_traits . regions [ sxng_tag ] = eng_tag
resp = network . get ( wikipedia_article_depth )
resp = network . get ( list_of_wikipedias )
if not resp . ok :
print ( " ERROR: response from Wikipedia is not OK. " )
@ -189,30 +279,31 @@ def fetch_traits(engine_traits: EngineTraits):
continue
cols = [ c . text_content ( ) . strip ( ) for c in cols ]
depth = float ( cols [ 3 ] . replace ( ' - ' , ' 0 ' ) . replace ( ' , ' , ' ' ) )
depth = float ( cols [ 11 ] . replace ( ' - ' , ' 0 ' ) . replace ( ' , ' , ' ' ) )
articles = int ( cols [ 4 ] . replace ( ' , ' , ' ' ) . replace ( ' , ' , ' ' ) )
if articles < 10000 :
# exclude languages with too few articles
continue
if int ( depth ) < 20 :
# Rough indicator of a Wikipedia’ s quality, showing how frequently
# its articles are updated.
continue
eng_tag = cols [ 2 ]
wiki_url = row . xpath ( ' ./td[3]/a/@href ' ) [ 0 ]
eng_tag = cols [ 3 ]
wiki_url = row . xpath ( ' ./td[4]/a/@href ' ) [ 0 ]
wiki_url = urllib . parse . urlparse ( wiki_url )
if eng_tag in unknown_langs :
continue
try :
sxng_tag = l anguage_tag( babel . Locale . parse ( lang_map . get ( eng_tag , eng_tag ) , sep = ' - ' ) )
sxng_tag = locales . language_tag ( babel . Locale . parse ( lang_map . get ( eng_tag , eng_tag ) , sep = ' - ' ) )
except babel . UnknownLocaleError :
print ( " ERROR: %s [ %s ] is unknown by babel " % ( cols [ 0 ] , eng_tag ) )
# print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
continue
finally :
engine_traits . custom [ ' WIKIPEDIA_LANGUAGES ' ] . append ( eng_tag )
if sxng_tag not in locales . LOCALE_NAMES :
if articles < 10000 :
# exclude languages with too few articles
continue
if int ( depth ) < 20 :
# Rough indicator of a Wikipedia’ s quality, showing how
# frequently its articles are updated.
continue
conflict = engine_traits . languages . get ( sxng_tag )
if conflict :
@ -222,3 +313,5 @@ def fetch_traits(engine_traits: EngineTraits):
engine_traits . languages [ sxng_tag ] = eng_tag
engine_traits . custom [ ' wiki_netloc ' ] [ eng_tag ] = wiki_url . netloc
engine_traits . custom [ ' WIKIPEDIA_LANGUAGES ' ] . sort ( )