@ -1,12 +1,24 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
""" Yahoo (Web)
""" Yahoo Search (Web)
Languages are supported by mapping the language to a domain . If domain is not
found in : py : obj : ` lang2domain ` URL ` ` < lang > . search . yahoo . com ` ` is used .
"""
from urllib . parse import unquote , urlencode
from urllib . parse import (
unquote ,
urlencode ,
)
from lxml import html
from searx . utils import extract_text , extract_url , match_language , eval_xpath
from searx . utils import (
eval_xpath_getindex ,
eval_xpath_list ,
extract_text ,
match_language ,
)
# about
about = {
@ -22,35 +34,78 @@ about = {
# engine traits
categories = ['general']
paging = True
time_range_support = True

# search-url (old URL scheme, used by _get_url)
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'

# Supported languages are read from Yahoo's preference page.
supported_languages_url = 'https://search.yahoo.com/preferences/languages'

# specific xpath variables -- the ' Sr ' / ' AlsoTry ' padding is deliberate:
# it matches whole class tokens via concat/normalize-space.
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[contains(@class, "compText")]'
suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"

# time range -> (age, btf) URL arguments
time_range_dict = {
    'day': ('1d', 'd'),
    'week': ('1w', 'w'),
    'month': ('1m', 'm'),
}

# map searx language codes to Yahoo's language codes
language_aliases = {
    'zh-HK': 'zh_chs',
    'zh-CN': 'zh_chs',  # dead since 2015 / routed to hk.search.yahoo.com
    'zh-TW': 'zh_cht',
}

# Map language to domain
lang2domain = {
    'zh_chs': 'hk.search.yahoo.com',
    'zh_cht': 'tw.search.yahoo.com',
    'en': 'search.yahoo.com',
    'bg': 'search.yahoo.com',
    'cs': 'search.yahoo.com',
    'da': 'search.yahoo.com',
    'el': 'search.yahoo.com',
    'et': 'search.yahoo.com',
    'he': 'search.yahoo.com',
    'hr': 'search.yahoo.com',
    'ja': 'search.yahoo.com',
    'ko': 'search.yahoo.com',
    'sk': 'search.yahoo.com',
    'sl': 'search.yahoo.com',
}
def _get_language(params):
    """Map the selected searx language to Yahoo's language code.

    An explicit entry in :py:obj:`language_aliases` wins; otherwise the
    code is resolved via ``match_language`` and reduced to its primary
    subtag (part before the ``-``).
    """
    requested = params['language']
    lang = language_aliases.get(requested)
    if lang is None:
        lang = match_language(requested, supported_languages, language_aliases)
    lang = lang.split('-')[0]
    logger.debug("params['language']: %s --> %s", requested, lang)
    return lang
def request(query, params):
    """build request"""
    # Yahoo shows 7 organic results per page.
    page_offset = (params['pageno'] - 1) * 7 + 1
    lang = _get_language(params)
    age, btf = time_range_dict.get(params['time_range'], ('', ''))

    query_args = {
        'p': query,
        'ei': 'UTF-8',
        'fl': 1,
        'vl': 'lang_' + lang,
        'btf': btf,
        'fr2': 'time',
        'age': age,
        'b': page_offset,
        'xargs': 0,
    }
    # fall back to <lang>.search.yahoo.com when no explicit domain is mapped
    domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang)
    params['url'] = 'https://%s/search?%s' % (domain, urlencode(query_args))
    return params
def parse_url ( url_string ) :
""" remove yahoo-specific tracking-url """
endings = [ ' /RS ' , ' /RK ' ]
endpositions = [ ]
start = url_string . find ( ' http ' , url_string . find ( ' /RU= ' ) + 1 )
@ -66,73 +121,30 @@ def parse_url(url_string):
end = min ( endpositions )
return unquote ( url_string [ start : end ] )
def _get_url(query, offset, language, time_range):
    """Return the full search URL, optionally restricted to *time_range*."""
    encoded_query = urlencode({'p': query})
    if time_range not in time_range_dict:
        return base_url + search_url.format(
            offset=offset,
            query=encoded_query,
            lang=language,
        )
    age, btf = time_range_dict[time_range]
    return base_url + search_url_with_time.format(
        offset=offset,
        query=encoded_query,
        lang=language,
        age=age,
        btf=btf,
    )
def _get_language(params):
    """Resolve the Yahoo language code for the selected UI language."""
    if params['language'] == 'all':
        return 'en'
    code = match_language(params['language'], supported_languages, language_aliases)
    # keep alias targets intact; otherwise reduce to the primary subtag
    if code not in language_aliases.values():
        code = code.split('-')[0]
    return code.replace('-', '_').lower()
# do search-request
def request ( query , params ) :
if params [ ' time_range ' ] and params [ ' time_range ' ] not in time_range_dict :
return params
offset = ( params [ ' pageno ' ] - 1 ) * 10 + 1
language = _get_language ( params )
params [ ' url ' ] = _get_url ( query , offset , language , params [ ' time_range ' ] )
return params
# get response from search-request
def response ( resp ) :
results = [ ]
""" parse response """
results = [ ]
dom = html . fromstring ( resp . text )
try :
results_num = int (
eval_xpath (
dom ,
' //div[@class= " compPagination " ]/span[last()]/text() '
) [ 0 ] . split ( ) [ 0 ] . replace ( ' , ' , ' ' )
)
results . append ( { ' number_of_results ' : results_num } )
except : # pylint: disable=bare-except
pass
# parse results
for result in eval_xpath ( dom , results_xpath ) :
try :
url = parse_url ( extract_url ( eval_xpath ( result , url_xpath ) , search_url ) )
title = extract_text ( eval_xpath ( result , title_xpath ) [ 0 ] )
for result in eval_xpath_list ( dom , ' //div[contains(@class, " algo-sr " )] ' ) :
url = eval_xpath_getindex ( result , ' .//h3/a/@href ' , 0 , default = None )
if url is None :
continue
url = parse_url ( url )
except : # pylint: disable=bare-except
title = eval_xpath_getindex ( result , ' .//h3/a ' , 0 , default = None )
if title is None :
continue
offset = len ( extract_text ( title . xpath ( ' span ' ) ) )
title = extract_text ( title ) [ offset : ]
content = extract_text ( eval_xpath ( result , content_xpath ) [ 0 ] )
content = eval_xpath_getindex (
result , ' .//div[contains(@class, " compText " )] ' , 0 , default = ' '
)
if content :
content = extract_text ( content )
# append result
results . append ( {
@ -141,17 +153,10 @@ def response(resp):
' content ' : content
} )
# if no suggestion found, return results
suggestions = eval_xpath ( dom , suggestion_xpath )
if not suggestions :
return results
# parse suggestion
for suggestion in suggestions :
for suggestion in eval_xpath_list ( dom , ' //div[contains(@class, " AlsoTry " )] ' ) :
# append suggestion
results . append ( { ' suggestion ' : extract_text ( suggestion ) } )
# return results
return results
@ -159,13 +164,9 @@ def response(resp):
def _fetch_supported_languages(resp):
    """Parse the language codes from Yahoo's preferences page.

    Each language option carries a value of the form ``lang_<code>``;
    the ``lang_`` prefix is stripped and the remainder collected.

    NOTE(review): this span contained two interleaved implementations
    (an older ``yschlang`` parser and this one); resolved to the parser
    matching :py:obj:`supported_languages_url` (preferences page).
    """
    supported_languages = []
    dom = html.fromstring(resp.text)
    offset = len('lang_')
    for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
        supported_languages.append(val[offset:])
    return supported_languages