@ -1,16 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
""" Yahoo (News)
Yahoo News is " English only " and do not offer localized nor language queries .
"""
Yahoo ( News )
"""
# pylint: disable=invalid-name, missing-function-docstring
import re
from datetime import datetime , timedelta
from urllib . parse import urlencode
from lxml import html
from searx . engines . yahoo import parse_url , language_aliases
from searx . engines . yahoo import _fetch_supported_languages , supported_languages_url # NOQA # pylint: disable=unused-import
from datetime import datetime , timedelta
from dateutil import parser
from searx . utils import extract_text , extract_url , match_language
from lxml import html
from searx import logger
from searx . utils import (
eval_xpath_list ,
eval_xpath_getindex ,
extract_text ,
)
from searx . engines . yahoo import parse_url
# pylint: disable=unused-import
from searx . engines . yahoo import (
_fetch_supported_languages ,
supported_languages_url ,
)
# pylint: enable=unused-import
logger = logger . getChild ( ' yahoo_news engine ' )
# about
about = {
@ -22,90 +41,78 @@ about = {
" results " : ' HTML ' ,
}
# engine dependent config
categories = [ ' news ' ]
language_support = False
time_range_support = False
safesearch = False
paging = True
categories = [ ' news ' ]
# search-url
search_url = ' https://news.search.yahoo.com/search? {query} &b= {offset} & {lang} =uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_ {lang} ' # noqa
# specific xpath variables
results_xpath = ' //ol[contains(@class, " searchCenterMiddle " )]//li '
url_xpath = ' .//h3/a/@href '
title_xpath = ' .//h3/a '
content_xpath = ' .//div[@class= " compText " ] '
publishedDate_xpath = ' .//span[contains(@class, " tri " )] '
suggestion_xpath = ' //div[contains(@class, " VerALSOTRY " )]//a '
search_url = (
' https://news.search.yahoo.com/search '
' ? {query} &b= {offset} '
)
AGO_RE = re . compile ( r ' ([0-9]+) \ s*(year|month|week|day|minute|hour) ' )
AGO_TIMEDELTA = {
' minute ' : timedelta ( minutes = 1 ) ,
' hour ' : timedelta ( hours = 1 ) ,
' day ' : timedelta ( days = 1 ) ,
' week ' : timedelta ( days = 7 ) ,
' month ' : timedelta ( days = 30 ) ,
' year ' : timedelta ( days = 365 ) ,
}
# do search-request
def request ( query , params ) :
offset = ( params [ ' pageno ' ] - 1 ) * 10 + 1
if params [ ' language ' ] == ' all ' :
language = ' en '
else :
language = match_language ( params [ ' language ' ] , supported_languages , language_aliases ) . split ( ' - ' ) [ 0 ]
params [ ' url ' ] = search_url . format ( offset = offset ,
query = urlencode ( { ' p ' : query } ) ,
lang = language )
# TODO required?
params [ ' cookies ' ] [ ' sB ' ] = ' " v=1&vm=p&fl=1&vl=lang_ {lang} &sh=1&pn=10&rw=new ' \
. format ( lang = language )
params [ ' url ' ] = search_url . format (
offset = offset ,
query = urlencode ( { ' p ' : query } )
)
logger . debug ( " query_url --> %s " , params [ ' url ' ] )
return params
def sanitize_url ( url ) :
if " .yahoo.com/ " in url :
return re . sub ( " \\ ; \\ _ylt \\ =.+$ " , " " , url )
else :
return url
# get response from search-request
def response ( resp ) :
results = [ ]
dom = html . fromstring ( resp . text )
# parse results
for result in dom . xpath ( results_xpath ) :
urls = result . xpath ( url_xpath )
if len ( urls ) != 1 :
for result in eval_xpath_list ( dom , ' //ol[contains(@class, " searchCenterMiddle " )]//li ' ) :
url = eval_xpath_getindex ( result , ' .//h4/a/@href ' , 0 , None )
if url is None :
continue
url = sanitize_url ( parse_url ( extract_url ( urls , search_url ) ) )
title = extract_text ( result . xpath ( title_xpath ) [ 0 ] )
content = extract_text ( result . xpath ( content_xpath ) [ 0 ] )
# parse publishedDate
publishedDate = extract_text ( result . xpath ( publishedDate_xpath ) [ 0 ] )
# still useful ?
if re . match ( " ^[0-9]+ minute(s|) ago$ " , publishedDate ) :
publishedDate = datetime . now ( ) - timedelta ( minutes = int ( re . match ( r ' \ d+ ' , publishedDate ) . group ( ) ) )
elif re . match ( " ^[0-9]+ days? ago$ " , publishedDate ) :
publishedDate = datetime . now ( ) - timedelta ( days = int ( re . match ( r ' \ d+ ' , publishedDate ) . group ( ) ) )
elif re . match ( " ^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$ " , publishedDate ) :
timeNumbers = re . findall ( r ' \ d+ ' , publishedDate )
publishedDate = datetime . now ( ) \
- timedelta ( hours = int ( timeNumbers [ 0 ] ) ) \
- timedelta ( minutes = int ( timeNumbers [ 1 ] ) )
url = parse_url ( url )
title = extract_text ( result . xpath ( ' .//h4/a ' ) )
content = extract_text ( result . xpath ( ' .//p ' ) )
img_src = eval_xpath_getindex ( result , ' .//img/@data-src ' , 0 , None )
item = {
' url ' : url ,
' title ' : title ,
' content ' : content ,
' img_src ' : img_src
}
pub_date = extract_text ( result . xpath ( ' .//span[contains(@class, " s-time " )] ' ) )
ago = AGO_RE . search ( pub_date )
if ago :
number = int ( ago . group ( 1 ) )
delta = AGO_TIMEDELTA [ ago . group ( 2 ) ]
pub_date = datetime . now ( ) - delta * number
else :
try :
pub lishedDate = parser . parse ( publishedD ate)
except :
pub lishedDate = datetime . now ( )
pub _date = parser . parse ( pub_d ate)
except parser . ParserError :
pub _date = None
if publishedDate . year == 1900 :
publishedDate = publishedDate . replace ( year = datetime . now ( ) . year )
if pub_date is not None :
item [ ' publishedDate ' ] = pub_date
results . append ( item )
# append result
results . append ( { ' url ' : url ,
' title ' : title ,
' content ' : content ,
' publishedDate ' : publishedDate } )
for suggestion in eval_xpath_list ( dom , ' //div[contains(@class, " AlsoTry " )]//td ' ) :
results . append ( { ' suggestion ' : extract_text ( suggestion ) } )
# return results
return results