[fix] rewrite Yahoo-News engine

Many things have changed since the last review of this engine.  This patch
fixes the XPath selectors, adds suggestion results, and is a complete review /
rewrite of the engine.

Signed-off-by: Markus Heiser <markus@darmarit.de>
Markus Heiser 3 years ago
parent 0d8b369b5b
commit d2faea423a

Makefile
@@ -196,6 +196,7 @@ PYLINT_FILES=\
 	searx/engines/google_images.py \
 	searx/engines/mediathekviewweb.py \
 	searx/engines/google_scholar.py \
+	searx/engines/yahoo_news.py \
 	searx_extra/update/update_external_bangs.py

 test.pylint: pyenvinstall

searx/engines/yahoo_news.py
@@ -1,16 +1,35 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Yahoo (News)
-"""
+"""Yahoo (News)
+
+Yahoo News is "English only" and do not offer localized nor language queries.
+
+"""
+
+# pylint: disable=invalid-name, missing-function-docstring
+
 import re
-from datetime import datetime, timedelta
 from urllib.parse import urlencode
-from lxml import html
-from searx.engines.yahoo import parse_url, language_aliases
-from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
+from datetime import datetime, timedelta
 from dateutil import parser
-from searx.utils import extract_text, extract_url, match_language
+from lxml import html
+
+from searx import logger
+from searx.utils import (
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+from searx.engines.yahoo import parse_url
+
+# pylint: disable=unused-import
+from searx.engines.yahoo import (
+    _fetch_supported_languages,
+    supported_languages_url,
+)
+# pylint: enable=unused-import
+
+logger = logger.getChild('yahoo_news engine')

 # about
 about = {
@@ -22,90 +41,78 @@ about = {
     "results": 'HTML',
 }

-# engine dependent config
-categories = ['news']
+language_support = False
+time_range_support = False
+safesearch = False
 paging = True
+categories = ['news']

 # search-url
-search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa
-
-# specific xpath variables
-results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
-url_xpath = './/h3/a/@href'
-title_xpath = './/h3/a'
-content_xpath = './/div[@class="compText"]'
-publishedDate_xpath = './/span[contains(@class,"tri")]'
-suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
+search_url = (
+    'https://news.search.yahoo.com/search'
+    '?{query}&b={offset}'
+)
+
+AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
+AGO_TIMEDELTA = {
+    'minute': timedelta(minutes=1),
+    'hour': timedelta(hours=1),
+    'day': timedelta(days=1),
+    'week': timedelta(days=7),
+    'month': timedelta(days=30),
+    'year': timedelta(days=365),
+}

-
-# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1

-    if params['language'] == 'all':
-        language = 'en'
-    else:
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
-
-    params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}),
-                                      lang=language)
-
-    # TODO required?
-    params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
-        .format(lang=language)
-
+    params['url'] = search_url.format(
+        offset = offset,
+        query = urlencode({'p': query})
+    )
+    logger.debug("query_url --> %s", params['url'])
     return params


-def sanitize_url(url):
-    if ".yahoo.com/" in url:
-        return re.sub("\\;\\_ylt\\=.+$", "", url)
-    else:
-        return url
-
-
-# get response from search-request
 def response(resp):
     results = []
-
     dom = html.fromstring(resp.text)

     # parse results
-    for result in dom.xpath(results_xpath):
-        urls = result.xpath(url_xpath)
-        if len(urls) != 1:
+    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
+
+        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
+        if url is None:
             continue
-        url = sanitize_url(parse_url(extract_url(urls, search_url)))
-        title = extract_text(result.xpath(title_xpath)[0])
-        content = extract_text(result.xpath(content_xpath)[0])
+        url = parse_url(url)
+        title = extract_text(result.xpath('.//h4/a'))
+        content = extract_text(result.xpath('.//p'))
+        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)

-        # parse publishedDate
-        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
+        item = {
+            'url': url,
+            'title': title,
+            'content': content,
+            'img_src' : img_src
+        }

-        # still useful ?
-        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
-            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
-        elif re.match("^[0-9]+ days? ago$", publishedDate):
-            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(hours=int(timeNumbers[0]))\
-                - timedelta(minutes=int(timeNumbers[1]))
+        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
+        ago = AGO_RE.search(pub_date)
+        if ago:
+            number = int(ago.group(1))
+            delta = AGO_TIMEDELTA[ago.group(2)]
+            pub_date = datetime.now() - delta * number
         else:
             try:
-                publishedDate = parser.parse(publishedDate)
-            except:
-                publishedDate = datetime.now()
+                pub_date = parser.parse(pub_date)
+            except parser.ParserError:
+                pub_date = None

-        if publishedDate.year == 1900:
-            publishedDate = publishedDate.replace(year=datetime.now().year)
+        if pub_date is not None:
+            item['publishedDate'] = pub_date
+        results.append(item)

-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'publishedDate': publishedDate})
+    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
+        results.append({'suggestion': extract_text(suggestion)})

-    # return results
     return results
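
How the new relative-date handling works: instead of one hand-written regex per
phrase ("N minutes ago", "N hours, M minutes ago", ...), a single AGO_RE pulls
out the number and the unit, and AGO_TIMEDELTA maps the unit to a timedelta
that is multiplied by the number.  A minimal sketch, runnable outside searx --
the sample strings are only assumptions about what Yahoo's "s-time" span may
contain:

    import re
    from datetime import datetime, timedelta
    from dateutil import parser

    AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
    AGO_TIMEDELTA = {
        'minute': timedelta(minutes=1),
        'hour': timedelta(hours=1),
        'day': timedelta(days=1),
        'week': timedelta(days=7),
        'month': timedelta(days=30),
        'year': timedelta(days=365),
    }

    def parse_pub_date(text):
        # '2 hours ago' --> now - 2 * timedelta(hours=1)
        ago = AGO_RE.search(text)
        if ago:
            return datetime.now() - AGO_TIMEDELTA[ago.group(2)] * int(ago.group(1))
        try:
            # absolute dates like 'April 3, 2021'
            return parser.parse(text)
        except parser.ParserError:
            # unparsable --> the result is kept, just without 'publishedDate'
            return None

    for sample in ('2 hours ago', '3 weeks ago', 'April 3, 2021', '???'):
        print(sample, '-->', parse_pub_date(sample))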

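The rewrite also drops the bare result.xpath(...)[0] indexing of the old code,
which raised IndexError as soon as a result lacked a link, image or date.
eval_xpath_getindex() with a default of None lets the engine skip such a result
or simply omit the field.  A minimal stand-in for that helper -- an assumption
about its behavior for the default case, the real implementation lives in
searx.utils:

    from lxml import html

    def xpath_getindex(element, xpath, index, default=None):
        """Evaluate xpath on element; return the item at index, else default."""
        items = element.xpath(xpath)
        if len(items) > index:
            return items[index]
        return default

    li = html.fromstring('<li><h4><a href="https://example.org/n">title</a></h4></li>')
    print(xpath_getindex(li, './/h4/a/@href', 0))     # 'https://example.org/n'
    print(xpath_getindex(li, './/img/@data-src', 0))  # None -- no crash, no img_src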