@ -14,18 +14,76 @@ from lxml import html
from searx . poolrequests import get
from searx . engines . xpath import extract_text , extract_url
# engine dependent config
categories = ['general']
paging = True
language_support = True
# when True, request() picks the Google domain that matches the country part
# of the requested locale (see country_to_hostname); otherwise it always uses
# default_hostname
use_locale_domain = True
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
default_hostname = 'www.google.com'
# maps the upper-cased country part of a locale to the Google domain to query;
# countries that are not listed fall back to default_hostname
country_to_hostname = {
    'BG': 'www.google.bg',  # Bulgaria
    'CZ': 'www.google.cz',  # Czech Republic
    'DE': 'www.google.de',  # Germany
    'DK': 'www.google.dk',  # Denmark
    'AT': 'www.google.at',  # Austria
    'CH': 'www.google.ch',  # Switzerland
    'GR': 'www.google.gr',  # Greece
    'AU': 'www.google.com.au',  # Australia
    'CA': 'www.google.ca',  # Canada
    'GB': 'www.google.co.uk',  # United Kingdom
    'ID': 'www.google.co.id',  # Indonesia
    'IE': 'www.google.ie',  # Ireland
    'IN': 'www.google.co.in',  # India
    'MY': 'www.google.com.my',  # Malaysia
    'NZ': 'www.google.co.nz',  # New Zealand
    'PH': 'www.google.com.ph',  # Philippines
    'SG': 'www.google.com.sg',  # Singapore
    # 'US': 'www.google.us',  # United States, redirects to .com
    'ZA': 'www.google.co.za',  # South Africa
    'AR': 'www.google.com.ar',  # Argentina
    'CL': 'www.google.cl',  # Chile
    'ES': 'www.google.es',  # Spain
    'MX': 'www.google.com.mx',  # Mexico
    'EE': 'www.google.ee',  # Estonia
    'FI': 'www.google.fi',  # Finland
    'BE': 'www.google.be',  # Belgium
    'FR': 'www.google.fr',  # France
    'IL': 'www.google.co.il',  # Israel
    'HR': 'www.google.hr',  # Croatia
    'HU': 'www.google.hu',  # Hungary
    'IT': 'www.google.it',  # Italy
    'JP': 'www.google.co.jp',  # Japan
    'KR': 'www.google.co.kr',  # South Korea
    'LT': 'www.google.lt',  # Lithuania
    'LV': 'www.google.lv',  # Latvia
    'NO': 'www.google.no',  # Norway
    'NL': 'www.google.nl',  # Netherlands
    'PL': 'www.google.pl',  # Poland
    'BR': 'www.google.com.br',  # Brazil
    'PT': 'www.google.pt',  # Portugal
    'RO': 'www.google.ro',  # Romania
    'RU': 'www.google.ru',  # Russia
    'SK': 'www.google.sk',  # Slovakia
    'SL': 'www.google.si',  # Slovenia (legacy key: SL -> si)
    'SI': 'www.google.si',  # Slovenia (ISO 3166-1 code)
    'SE': 'www.google.se',  # Sweden
    'TH': 'www.google.co.th',  # Thailand
    'TR': 'www.google.com.tr',  # Turkey
    'UA': 'www.google.com.ua',  # Ukraine
    # 'CN': 'www.google.cn',  # China, only reachable from China?
    'HK': 'www.google.com.hk',  # Hong Kong
    'TW': 'www.google.com.tw',  # Taiwan
}
# search-url
# legacy module-level default; request() chooses the hostname per query
google_hostname = 'www.google.com'
search_path = '/search'
maps_path = '/maps/'
redirect_path = '/url'
images_path = '/images'
# template with three placeholders: {hostname} (Google domain),
# {query} (url-encoded "q=..." pair) and {offset} (index of the first result)
search_url = ('https://{hostname}' +
              search_path +
              '?{query}&start={offset}&gbv=1')
@ -34,6 +92,7 @@ results_xpath = '//li[@class="g"]'
# xpath expressions evaluated relative to one result node
url_xpath = './/h3/a/@href'
title_xpath = './/h3'
content_xpath = './/span[@class="st"]'
# optional extra line of a result; response() prepends it to the content
content_misc_xpath = './/div[@class="f slp"]'
# evaluated against the whole document
suggestion_xpath = '//p[@class="_Bmc"]'

# links inside an images result, used by parse_images()
images_xpath = './/div/a'
@ -41,6 +100,7 @@ image_url_xpath = './@href'
image_img_src_xpath = './img/@src'

# PREF cookie value handed out by get_google_pref_cookie()
pref_cookie = ''
# NID cookie cache filled by get_google_nid_cookie(): hostname -> cookie value
nid_cookie = {}
# see https://support.google.com/websearch/answer/873?hl=en
@ -52,8 +112,16 @@ def get_google_pref_cookie():
return pref_cookie
def get_google_nid_cookie(google_hostname):
    """Return the NID cookie for *google_hostname*, fetching it on first use.

    The first call for a hostname requests ``https://<google_hostname>`` and
    stores the ``NID`` cookie of the response in the module-level
    ``nid_cookie`` dict; later calls for the same hostname return the stored
    value without a new request.

    Returns ``None`` (which is stored as well) when the response carries no
    ``NID`` cookie.
    """
    # no ``global`` statement needed: the dict is mutated, never rebound
    if google_hostname not in nid_cookie:
        resp = get('https://' + google_hostname)
        nid_cookie[google_hostname] = resp.cookies.get("NID", None)
    return nid_cookie[google_hostname]
# remove google-specific tracking-url
def parse_url ( url_string ) :
def parse_url ( url_string , google_hostname ):
parsed_url = urlparse ( url_string )
if ( parsed_url . netloc in [ google_hostname , ' ' ]
and parsed_url . path == redirect_path ) :
@ -63,21 +131,45 @@ def parse_url(url_string):
return url_string
def extract_text_from_dom(result, xpath):
    """Return extract_text() of the first node selected by *xpath*.

    *xpath* is evaluated on *result*; ``None`` is returned when it selects
    nothing.
    """
    matches = result.xpath(xpath)
    if len(matches) == 0:
        return None
    return extract_text(matches[0])
# do search-request
def request(query, params):
    """Fill *params* with the URL, headers and cookies for a Google query.

    Reads ``params['pageno']`` (1-based page number) and
    ``params['language']`` (``'all'`` or a locale such as ``'de_DE'``).

    Writes ``params['url']``, the ``Accept-Language`` and ``Accept`` entries
    of ``params['headers']``, the ``NID`` (and, for the default hostname,
    ``PREF``) entries of ``params['cookies']`` and
    ``params['google_hostname']``, which response() reads back.

    Returns the same *params* dict.
    """
    # ten results per page
    offset = (params['pageno'] - 1) * 10

    if params['language'] == 'all':
        language = 'en'
        country = 'US'
    else:
        language_array = params['language'].lower().split('_')
        if len(language_array) == 2:
            country = language_array[1]
            # e.g. "de,de-de": the bare language plus its regional variant
            language = language_array[0] + ',' + language_array[0] + '-' + country
        else:
            # no usable country part: send the bare language only instead
            # of a dangling "xx,xx-" value
            country = ''
            language = language_array[0]

    if use_locale_domain:
        google_hostname = country_to_hostname.get(country.upper(), default_hostname)
    else:
        google_hostname = default_hostname

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'q': query}),
                                      hostname=google_hostname)

    params['headers']['Accept-Language'] = language
    # NOTE(review): set for every language; the `language.startswith('en')`
    # guard is taken to be superseded by the hostname check below - confirm
    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    if google_hostname == default_hostname:
        params['cookies']['PREF'] = get_google_pref_cookie()
    params['cookies']['NID'] = get_google_nid_cookie(google_hostname)

    # remembered so that response() knows which hostname was queried
    params['google_hostname'] = google_hostname

    return params
@ -86,17 +178,30 @@ def request(query, params):
def response ( resp ) :
results = [ ]
# detect google sorry
resp_url = urlparse ( resp . url )
if resp_url . netloc == ' sorry.google.com ' or resp_url . path == ' /sorry/IndexRedirect ' :
raise RuntimeWarning ( ' sorry.google.com ' )
# which hostname ?
google_hostname = resp . search_params . get ( ' google_hostname ' )
google_url = " https:// " + google_hostname
# convert the text to dom
dom = html . fromstring ( resp . text )
# parse results
for result in dom . xpath ( results_xpath ) :
title = extract_text ( result . xpath ( title_xpath ) [ 0 ] )
try :
url = parse_url ( extract_url ( result . xpath ( url_xpath ) , search_url ) )
parsed_url = urlparse ( url )
url = parse_url ( extract_url ( result . xpath ( url_xpath ) , google_url) , google_hostname )
parsed_url = urlparse ( url , google_hostname )
if ( parsed_url . netloc == google_hostname
and parsed_url . path == search_path ) :
# remove the link to google news
and ( parsed_url . path == search_path
or parsed_url . path . startswith ( maps_path ) ) ) :
# remove the link to google news and google maps
# FIXME: sometimes the URL is https://maps.google.*/maps
# no consequence: such a result raises an exception later on, which is ignored
continue
# images result
@ -104,16 +209,21 @@ def response(resp):
and parsed_url . path == images_path ) :
# only thumbnail image provided,
# so skipping image results
# results = results + parse_images(result )
# results = results + parse_images(result , google_hostname )
pass
else :
# normal result
content = extract_text ( result . xpath ( content_xpath ) [ 0 ] )
content = extract_text_from_dom ( result , content_xpath )
if content is None :
continue
content_misc = extract_text_from_dom ( result , content_misc_xpath )
if content_misc is not None :
content = content_misc + " <br /> " + content
# append result
results . append ( { ' url ' : url ,
' title ' : title ,
' content ' : content } )
except :
except Exception :
continue
# parse suggestion
@ -125,10 +235,10 @@ def response(resp):
return results
def parse_images ( result ):
def parse_images ( result , google_hostname ):
results = [ ]
for image in result . xpath ( images_xpath ) :
url = parse_url ( extract_text ( image . xpath ( image_url_xpath ) [ 0 ] ) )
url = parse_url ( extract_text ( image . xpath ( image_url_xpath ) [ 0 ] ) , google_hostname )
img_src = extract_text ( image . xpath ( image_img_src_xpath ) [ 0 ] )
# append result