[fix] engine google-News: fix decoding of URLs

Google-News returns internal links in which the original URL is encoded as a
URL-safe base64 string (RFC 4648 section 5, the "base64url" alphabet).

Closes: https://github.com/searxng/searxng/issues/1959
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2023-04-01 19:33:13 +02:00
parent 7592d85982
commit 8de8070ed9

View File

@ -27,10 +27,8 @@ The google news API ignores some parameters from the common :ref:`google API`:
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import binascii
import re
from urllib.parse import urlencode from urllib.parse import urlencode
from base64 import b64decode import base64
from lxml import html from lxml import html
import babel import babel
@ -144,34 +142,17 @@ def response(resp):
for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
# The first <a> tag in the <article> contains the link to the # The first <a> tag in the <article> contains the link to the article
# article The href attribute of the <a> is a google internal link, # The href attribute of the <a> tag is a google internal link, we have
# we can't use. The real link is hidden in the jslog attribute: # to decode
#
# <a ...
# jslog="95014; 4:https://www.cnn.com/.../index.html; track:click"
# href="./articles/CAIiENu3nGS...?hl=en-US&amp;gl=US&amp;ceid=US%3Aen"
# ... />
jslog = eval_xpath_getindex(result, './article/a/@jslog', 0) href = eval_xpath_getindex(result, './article/a/@href', 0)
url = re.findall('http[^;]*', jslog) href = href.split('?')[0]
if url: href = href.split('/')[-1]
url = url[0] href = base64.urlsafe_b64decode(href + '====')
else: href = href[4:].split(b'\xd2')[0]
# The real URL is base64 encoded in the json attribute: href = href.decode()
# jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click"
jslog = jslog.split(";")[1].split(':')[1].strip()
try:
padding = (4 - (len(jslog) % 4)) * "="
jslog = b64decode(jslog + padding)
except binascii.Error:
# URL can't be read, skip this result
continue
# now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]'
url = re.findall('http[^;"]*', str(jslog))[0]
# the first <h3> tag in the <article> contains the title of the link
title = extract_text(eval_xpath(result, './article/h3[1]')) title = extract_text(eval_xpath(result, './article/h3[1]'))
# The pub_date is mostly a string like 'yesertday', not a real # The pub_date is mostly a string like 'yesertday', not a real
@ -189,7 +170,7 @@ def response(resp):
results.append( results.append(
{ {
'url': url, 'url': href,
'title': title, 'title': title,
'content': content, 'content': content,
'img_src': img_src, 'img_src': img_src,