From 880555e2637a3517fefdda77edb1dbe75ee7f066 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Wed, 27 Oct 2021 03:04:52 +0200 Subject: [PATCH 1/5] [enh] engine - add Tineye reverse image search Other optional parameter .. `&sort=crawl_date` can be appended to search_string to sort results by date. `&domain=example.org` can be appended to search_string to get results from just one domain. Public instances could get timed out relatively quickly, for 3600s. -- Merged from @allendema's commit [1] and slightly modified / see [2]. Related-to: [1] https://github.com/allendema/searx/commit/455b2b4460cd830ac1f7e62e824040e2fe648de9 Related-to: [2] https://github.com/searx/searx/pull/3040 Signed-off-by: Markus Heiser --- searx/engines/tineye.py | 83 +++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 6 +++ 2 files changed, 89 insertions(+) create mode 100644 searx/engines/tineye.py diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py new file mode 100644 index 000000000..e8e45fac4 --- /dev/null +++ b/searx/engines/tineye.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Tineye - Reverse search images + +""" + +from json import loads +from urllib.parse import urlencode +from datetime import datetime + +about = { + "website": 'https://tineye.com', + "wikidata_id": 'Q2382535', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['images'] +paging = True +safesearch = False +base_url = 'https://tineye.com' +search_string = '/result_json/?page={page}&{query}' + + +def request(query, params): + # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py + params['url'] = base_url + search_string.format(query=urlencode({'url': query}), page=params['pageno']) + + params['headers'].update( + { + 'Connection': 'keep-alive', + 'Accept-Encoding': 'gzip, defalte, br', + 'Host': 'tineye.com', + 'DNT': '1', + 'TE': 'trailers', + } + ) + return 
params + + +def response(resp): + results = [] + + # Define wanted results + json_data = loads(resp.text) + number_of_results = json_data['num_matches'] + + for i in json_data['matches']: + image_format = i['format'] + width = i['width'] + height = i['height'] + thumbnail_src = i['image_url'] + backlink = i['domains'][0]['backlinks'][0] + url = backlink['backlink'] + source = backlink['url'] + title = backlink['image_name'] + img_src = backlink['url'] + + # Get and convert published date + api_date = backlink['crawl_date'][:-3] + publishedDate = datetime.fromisoformat(api_date) + + # Append results + results.append( + { + 'template': 'images.html', + 'url': url, + 'thumbnail_src': thumbnail_src, + 'source': source, + 'title': title, + 'img_src': img_src, + 'format': image_format, + 'widht': width, + 'height': height, + 'publishedDate': publishedDate, + } + ) + + # Append number of results + results.append({'number_of_results': number_of_results}) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index ad38d543d..60b8b8278 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -483,6 +483,12 @@ engines: timeout: 3.0 disabled: true + - name: tineye + engine: tineye + shortcut: tin + timeout: 9.0 + disabled: true + - name: etymonline engine: xpath paging: true From b7f74fbe42f54ebd60aeeed77312bcb4c4d63f76 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 25 Jan 2022 16:37:18 +0100 Subject: [PATCH 2/5] [mod] tineye - add some documentation Signed-off-by: Markus Heiser --- docs/src/searx.engines.tineye.rst | 9 +++++++++ searx/engines/tineye.py | 14 +++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 docs/src/searx.engines.tineye.rst diff --git a/docs/src/searx.engines.tineye.rst b/docs/src/searx.engines.tineye.rst new file mode 100644 index 000000000..79e24cfb8 --- /dev/null +++ b/docs/src/searx.engines.tineye.rst @@ -0,0 +1,9 @@ +.. _tineye engine: + +====== +Tineye +====== + +.. 
automodule:: searx.engines.tineye + :members: + diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py index e8e45fac4..302aea326 100644 --- a/searx/engines/tineye.py +++ b/searx/engines/tineye.py @@ -1,6 +1,17 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Tineye - Reverse search images +"""This engine implements *Tineye - reverse image search* + +Using TinEye, you can search by image or perform what we call a reverse image +search. You can do that by uploading an image or searching by URL. You can also +simply drag and drop your images to start your search. TinEye constantly crawls +the web and adds images to its index. Today, the TinEye index is over 50.2 +billion images `[tineye.com] <https://tineye.com/how>`_. + +.. hint:: + + This SearXNG engine only supports *'searching by URL'* and it does not use + the official API `[api.tineye.com] <https://api.tineye.com/python/docs/>`_. """ @@ -11,6 +22,7 @@ from datetime import datetime about = { "website": 'https://tineye.com', "wikidata_id": 'Q2382535', + "official_api_documentation": 'https://api.tineye.com/python/docs/', "use_official_api": False, "require_api_key": False, "results": 'JSON', From e92d40c854eb22f22baed9558257941057cf13fa Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 30 Jan 2022 16:05:08 +0100 Subject: [PATCH 3/5] [enh] implement an OnlineUrlSearchProcessor Signed-off-by: Markus Heiser --- searx/search/processors/__init__.py | 10 ++++- searx/search/processors/online_url_search.py | 42 ++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 searx/search/processors/online_url_search.py diff --git a/searx/search/processors/__init__.py b/searx/search/processors/__init__.py index 4e85527ba..a270b4ef5 100644 --- a/searx/search/processors/__init__.py +++ b/searx/search/processors/__init__.py @@ -11,6 +11,7 @@ __all__ = [ 'OnlineProcessor', 'OnlineDictionaryProcessor', 'OnlineCurrencyProcessor', + 'OnlineUrlSearchProcessor', 'PROCESSORS', ] @@ -24,6 +25,7 @@ from .online import OnlineProcessor from 
.offline import OfflineProcessor from .online_dictionary import OnlineDictionaryProcessor from .online_currency import OnlineCurrencyProcessor +from .online_url_search import OnlineUrlSearchProcessor from .abstract import EngineProcessor logger = logger.getChild('search.processors') @@ -33,7 +35,13 @@ PROCESSORS: Dict[str, EngineProcessor] = {} def get_processor_class(engine_type): """Return processor class according to the ``engine_type``""" - for c in [OnlineProcessor, OfflineProcessor, OnlineDictionaryProcessor, OnlineCurrencyProcessor]: + for c in [ + OnlineProcessor, + OfflineProcessor, + OnlineDictionaryProcessor, + OnlineCurrencyProcessor, + OnlineUrlSearchProcessor, + ]: if c.engine_type == engine_type: return c return None diff --git a/searx/search/processors/online_url_search.py b/searx/search/processors/online_url_search.py new file mode 100644 index 000000000..2863be28e --- /dev/null +++ b/searx/search/processors/online_url_search.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Processors for engine-type: ``online_url_search`` + +""" + +import re +from .online import OnlineProcessor + +re_search_urls = { + 'http': re.compile(r'https?:\/\/[^ ]*'), + 'ftp': re.compile(r'ftps?:\/\/[^ ]*'), + 'data:image': re.compile('data:image/[^; ]*;base64,[^ ]*'), +} + + +class OnlineUrlSearchProcessor(OnlineProcessor): + """Processor class used by ``online_url_search`` engines.""" + + engine_type = 'online_url_search' + + def get_params(self, search_query, engine_category): + params = super().get_params(search_query, engine_category) + if params is None: + return None + + url_match = False + search_urls = {} + + for k, v in re_search_urls.items(): + m = v.search(search_query.query) + v = None + if m: + url_match = True + v = m[0] + search_urls[k] = v + + if not url_match: + return None + + params['search_urls'] = search_urls + return params From a6b879f19c74cd0c15907ed9d21b9185ccea9d25 Mon Sep 17 00:00:00 2001 From: Markus Heiser 
Date: Sun, 30 Jan 2022 16:30:52 +0100 Subject: [PATCH 4/5] [mod] tineye engine: set engine_type to 'online_url_search' Signed-off-by: Markus Heiser --- searx/engines/tineye.py | 13 +++++++++++-- searx/settings.yml | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py index 302aea326..5a8c86062 100644 --- a/searx/engines/tineye.py +++ b/searx/engines/tineye.py @@ -28,7 +28,8 @@ about = { "results": 'JSON', } -categories = ['images'] +engine_type = 'online_url_search' +categories = ['general'] paging = True safesearch = False base_url = 'https://tineye.com' @@ -36,8 +37,16 @@ search_string = '/result_json/?page={page}&{query}' def request(query, params): + + if params['search_urls']['data:image']: + query = params['search_urls']['data:image'] + elif params['search_urls']['http']: + query = params['search_urls']['http'] + + query = urlencode({'url': query}) + # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py - params['url'] = base_url + search_string.format(query=urlencode({'url': query}), page=params['pageno']) + params['url'] = base_url + search_string.format(query=query, page=params['pageno']) params['headers'].update( { diff --git a/searx/settings.yml b/searx/settings.yml index 60b8b8278..ac320c457 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -487,7 +487,7 @@ engines: engine: tineye shortcut: tin timeout: 9.0 - disabled: true + disabled: false - name: etymonline engine: xpath From ebd3013a1aad1bc6def749dea07d6278f399fb69 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Sun, 30 Jan 2022 20:44:30 +0100 Subject: [PATCH 5/5] [mod] tineye engine: minor changes * remove "disable: false" in settings.yml * use the json() method from httpx.Response (faster character encoding detection) --- searx/engines/tineye.py | 3 +-- searx/settings.yml | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py index 
5a8c86062..fe5b60393 100644 --- a/searx/engines/tineye.py +++ b/searx/engines/tineye.py @@ -15,7 +15,6 @@ billion images `[tineye.com] `_. """ -from json import loads from urllib.parse import urlencode from datetime import datetime @@ -64,7 +63,7 @@ def response(resp): results = [] # Define wanted results - json_data = loads(resp.text) + json_data = resp.json() number_of_results = json_data['num_matches'] for i in json_data['matches']: diff --git a/searx/settings.yml b/searx/settings.yml index ac320c457..be068a10e 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -487,7 +487,6 @@ engines: engine: tineye shortcut: tin timeout: 9.0 - disabled: false - name: etymonline engine: xpath