# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Google ( News )
""" Google (News)
For detailed description of the * REST - full * API see : ` Query Parameter
Definitions ` _ . Not all parameters can be appied , e . g . num_ ( the number of
search results to return ) is ignored .
. . _Query Parameter Definitions :
https : / / developers . google . com / custom - search / docs / xml_results #WebSearch_Query_Parameter_Definitions
. . _num : https : / / developers . google . com / custom - search / docs / xml_results #numsp
"""
# pylint: disable=invalid-name, missing-function-docstring
import binascii
import re
from urllib.parse import urlencode
from base64 import b64decode
from lxml import html
from searx import logger
from searx.utils import (
    eval_xpath,
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)
# pylint: disable=unused-import
from searx.engines.google import (
    supported_languages_url,
    _fetch_supported_languages,
    detect_google_sorry,
)
# pylint: enable=unused-import
from searx.engines.google import (
    get_lang_country,
    filter_mapping,
)
# about
about = {
" results " : ' HTML ' ,
}
logger = logger.getChild('google news')
# compared to other google engines, google-news has different time range
# support.  The time range is included in the search term.
time_range_dict = {
    'day': 'when:1d',
    'week': 'when:7d',
    'month': 'when:1m',
    'year': 'when:1y',
}
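
# E.g. with time_range 'week', a query 'searx' is submitted to Google as the
# search term 'searx when:7d' (see request() below).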

# engine dependent config

categories = ['news']
paging = False
language_support = True
use_locale_domain = True
time_range_support = True
safesearch = True  # not really, but it is not generated by google


def request(query, params):
    """Google-News search request"""

    language, country, lang_country = get_lang_country(
        # pylint: disable=undefined-variable
        params, supported_languages, language_aliases
    )
    subdomain = 'news.google.com'

    if params['time_range']:  # in time_range_dict:
        query += ' ' + time_range_dict[params['time_range']]

    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
        'q': query,
        'hl': language,
        'lr': "lang_" + language,
        'ie': "utf8",
        'oe': "utf8",
        'ceid': "%s:%s" % (country, language),
        'gl': country,
    })
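    # Illustrative example of a resulting URL (dict insertion order is
    # preserved by urlencode() on Python >= 3.7):
    #
    #   https://news.google.com/search?q=searx+when%3A7d&hl=en&lr=lang_en&ie=utf8&oe=utf8&ceid=US%3Aen&gl=US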

    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    params['url'] = query_url
    logger.debug("query_url --> %s", query_url)

    # en-US,en;q=0.8,en;q=0.5
    params['headers']['Accept-Language'] = (
        lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5'
    )
    logger.debug("HTTP header Accept-Language --> %s",
                 params['headers']['Accept-Language'])
    params['headers']['Accept'] = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
    #params['google_subdomain'] = subdomain

    return params
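
# A minimal sketch of how searx is assumed to call request() (illustrative
# only; the real params dict is prepared by the searx core):
#
#     params = {'language': 'en-US', 'time_range': 'week',
#               'safesearch': 0, 'headers': {}}
#     request('searx', params)
#     params['url']  # --> 'https://news.google.com/search?q=searx+when%3A7d&...'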


def response(resp):
    """Get response from google's search request"""
    results = []

    detect_google_sorry(resp)

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the
        # article.  The href attribute of the <a> is a google internal link
        # we can't use.  The real link is hidden in the jslog attribute:
        #
        #   <a ...
        #      jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
        #      href="./articles/CAIiENu3nGS...?hl=en-US&gl=US&ceid=US%3Aen"
        #      ... />
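        #
        # For this first form, re.findall('http[^;]*', jslog) below yields
        # ['https://www.cnn.com/.../index.html'] (illustrative values).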

        jslog = eval_xpath_getindex(result, './article/a/@jslog', 0)
        url = re.findall('http[^;]*', jslog)
        if url:
            url = url[0]
        else:
            # The real URL is base64 encoded in the json attribute:
            # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
            jslog = jslog.split(";")[1].split(':')[1].strip()
            try:
                # b64decode() expects input whose length is a multiple of 4,
                # so pad the token with '=' characters accordingly.
                padding = (4 - (len(jslog) % 4)) * "="
                jslog = b64decode(jslog + padding)
            except binascii.Error:
                # URL can't be read, skip this result
                continue

            # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
            url = re.findall('http[^;"]*', str(jslog))[0]

        # the first <h3> tag in the <article> contains the title of the link
        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # the first <div> tag in the <article> contains the content of the link
        content = extract_text(eval_xpath(result, './article/div[1]'))

        # the second <div> tag contains origin publisher and the publishing date
        pub_date = extract_text(eval_xpath(result, './article/div[2]//time'))
        pub_origin = extract_text(eval_xpath(result, './article/div[2]//a'))

        pub_info = []
        if pub_origin:
            pub_info.append(pub_origin)
        if pub_date:
            # The pub_date is mostly a string like 'yesterday', not a real
            # timezone date or time.  Therefore we can't use publishedDate.
            pub_info.append(pub_date)
        pub_info = ', '.join(pub_info)
        if pub_info:
            content = pub_info + ': ' + content
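        # e.g. (illustrative): content = 'CNN, Yesterday: A teaser of the article ...'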

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URLs are long but not personalized (double checked via tor).
        img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({
            'url': url,
            'title': title,
            'content': content,
            'img_src': img_src,
        })

    # return results
    return results