From 6e06618e0c44b52d5d322a7f5d35f1a8a7a7d247 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 26 Nov 2021 17:11:20 +0100 Subject: [PATCH] [fix] google-videos engine: ignore news articles In the video search, google also sometimes includes news. E.g. in the DE language when you search for `!gov paris`, google adds an article from a German newspaper (FAZ), I assume these are sponsored links (not tagged advertisement?) Those links do not have an image / this patch ignores *video links* without an image ID. Signed-off-by: Markus Heiser --- searx/engines/google_videos.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index abf046f4..77b0ab26 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -154,25 +154,23 @@ def response(resp): # parse results for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): - # google *sections* + # ignore google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring ") continue - title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) - url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) - - # + # ingnore articles without an image id / e.g. news articles img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) if img_id is None: - logger.error("no img_id for: %s" % result) + logger.error("no img_id found in item %s (news article?)", len(results) + 1) continue img_src = vidthumb_imgdata.get(img_id, None) if not img_src: - logger.error("no vidthumb imgdata for: %s" % img_id) img_src = thumbs_src.get(img_id, "") + title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) + url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) length = extract_text(eval_xpath( result, './/div[contains(@class, "P7xzyf")]/span/span')) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)