From dd7b53d3690e472fbc3f38c3d02ed377dc841a08 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 18 Jun 2021 11:34:11 +0200 Subject: [PATCH 1/2] [fix] google-news engine - KeyError: 'hl' in request Since we added - 1c67b6aec [enh] google engine: supports "default language" there is a KeyError: 'hl' in request, error pattern:: ERROR:searx.searx.search.processor.online:engine google news : exception : 'hl' Traceback (most recent call last): File "searx/search/processors/online.py", line 144, in search search_results = self._search_basic(query, params) File "searx/search/processors/online.py", line 118, in _search_basic self.engine.request(query, params) File "searx/engines/google_news.py", line 97, in request if lang_info['hl'] == 'en': KeyError: 'hl' Closes: https://github.com/searxng/searxng/issues/154 Signed-off-by: Markus Heiser --- searx/engines/google_news.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 485d602bc..38cbbd0ea 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -94,8 +94,8 @@ def request(query, params): ceid = "%s:%s" % (lang_info['country'], lang_info['language']) # google news redirects en to en-US - if lang_info['hl'] == 'en': - lang_info['hl'] = 'en-US' + if lang_info['params']['hl'] == 'en': + lang_info['params']['hl'] = 'en-US' # Very special to google-news compared to other google engines, the time # range is included in the search term. From 9328c66e93afacc91564a475280c5167fb0216be Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 18 Jun 2021 11:49:20 +0200 Subject: [PATCH 2/2] [fix] google news - send CONSENT Cookie to not be redirected In the EU there exists a "General Data Protection Regulation" [1] aka GDPR (BTW: very user friendly!) which requires consent to tracking. 
To get the consent from the user, google-news requests are redirected to confirm and get a CONSENT Cookie from https://consent.google.de/s?continue=... This patch adds a CONSENT Cookie to the google-news request to avoid redirection. The behavior of the CONSENT cookies over all google engines seems similar but the pattern is not yet fully clear to me, here are some random samples from my analysis .. Using common google search from different domains:: google.com: CONSENT=YES+cb.{{date}}-14-p0.de+FX+816 google.de: CONSENT=YES+cb.{{date}}-14-p0.de+FX+333 google.fr: CONSENT=YES+srp.gws-{{date}}-0-RC2.fr+FX+826 When searching about videos (google-videos):: google.es: CONSENT=YES+srp.gws-{{date}}-0-RC2.es+FX+076 google.de: CONSENT=YES+srp.gws-{{date}}-0-RC2.de+FX+171 Google news has only one domain for all languages:: news.google.com: CONSENT=YES+cb.{{date}}-14-p0.de+FX+816 Using google-scholar search from different domains:: scholar.google.de: CONSENT=YES+cb.{{date}}-14-p0.de+FX+333 scholar.google.fr: does not use such a cookie / did not ask the user scholar.google.es: does not use such a cookie / did not ask the user Interim summary: Pattern is unclear and I won't apply the CONSENT cookie to all google engines. More experience is needed before we generalize the CONSENT cookies over all google engines. Related: - e9a6ab401 [fix] youtube - send CONSENT Cookie to not be redirected - https://github.com/benbusby/whoogle-search/issues/311 - https://github.com/benbusby/whoogle-search/issues/243 [1] https://en.wikipedia.org/wiki/General_Data_Protection_Regulation Signed-off-by: Markus Heiser --- searx/engines/google_news.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 38cbbd0ea..c1c97b700 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -19,6 +19,7 @@ Definitions`_. 
Not all parameters can be appied: # pylint: disable=invalid-name, missing-function-docstring import binascii +from datetime import datetime import re from urllib.parse import urlencode from base64 import b64decode @@ -115,6 +116,7 @@ def request(query, params): params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ) + params['headers']['Cookie'] = "CONSENT=YES+cb.%s-14-p0.en+F+941;" % datetime.now().strftime("%Y%m%d") return params