From 16f8ec894a40af8edf30c297afb945ceb32081ca Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 7 Jul 2020 21:59:15 +0200 Subject: [PATCH] [fix] revise google images engine this commit is picked from #1985 --- searx/engines/google_images.py | 246 ++++++++++++++++++++++++--------- 1 file changed, 177 insertions(+), 69 deletions(-) diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 636913114..75264eb9c 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -1,97 +1,205 @@ -""" - Google (Images) +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Google (Images) + +:website: https://images.google.com (redirected to subdomain www.) +:provide-api: yes (https://developers.google.com/custom-search/) +:using-api: not the official, since it needs registration to another service +:results: HTML +:stable: no +:template: images.html +:parse: url, title, content, source, thumbnail_src, img_src + +For detailed description of the *REST-full* API see: `Query Parameter +Definitions`_. + +.. admonition:: Content-Security-Policy (CSP) - @website https://www.google.com - @provide-api yes (https://developers.google.com/custom-search/) + This engine needs to allow images from the `data URLs`_ (prefixed with the + ``data:`` scheme):: + + Header set Content-Security-Policy "img-src 'self' data: ;" + +.. 
_Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions - @using-api no - @results HTML chunks with JSON inside - @stable no - @parse url, title, img_src """ -from datetime import date, timedelta -from json import loads from lxml import html -from searx.url_utils import urlencode +from flask_babel import gettext +from searx import logger +from searx.url_utils import urlencode, urlparse +from searx.utils import eval_xpath +from searx.engines.xpath import extract_text + +# pylint: disable=unused-import +from searx.engines.google import ( + supported_languages_url + , _fetch_supported_languages +) +# pylint: enable=unused-import + +from searx.engines.google import ( + get_lang_country + , google_domains + , time_range_dict +) + +logger = logger.getChild('google images') # engine dependent config + categories = ['images'] -paging = True -safesearch = True +paging = False +language_support = True +use_locale_domain = True time_range_support = True -number_of_results = 100 - -search_url = 'https://www.google.com/search'\ - '?{query}'\ - '&tbm=isch'\ - '&yv=2'\ - '&{search_options}' -time_range_attr = "qdr:{range}" -time_range_custom_attr = "cdr:1,cd_min:{start},cd_max{end}" -time_range_dict = {'day': 'd', - 'week': 'w', - 'month': 'm'} +safesearch = True +filter_mapping = { + 0 : 'images', + 1 : 'active', + 2 : 'active' +} + +def scrap_out_thumbs(dom): + """Scrap out thumbnail data from