From d04e471ce53c5efd224a4ed0e7b5d88fb0d3a093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Wed, 21 Mar 2018 22:30:29 +0100 Subject: [PATCH 1/2] add findx engine for general, images and videos --- searx/engines/findx.py | 119 +++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 18 +++++++ 2 files changed, 137 insertions(+) create mode 100644 searx/engines/findx.py diff --git a/searx/engines/findx.py b/searx/engines/findx.py new file mode 100644 index 000000000..30a26402a --- /dev/null +++ b/searx/engines/findx.py @@ -0,0 +1,119 @@ +""" +FindX (General, Images, Videos) + +@website https://www.findx.com +@provide-api no + +@using-api no +@results HTML +@stable no +@parse url, title, content, embedded, img_src, thumbnail_src +""" + +from dateutil import parser +from json import loads +import re + +from lxml import html + +from searx import logger +from searx.engines.xpath import extract_text +from searx.engines.youtube_noapi import base_youtube_url, embedded_url +from searx.url_utils import urlencode + + +paging = True +results_xpath = '//script[@id="initial-state"]' +search_url = 'https://www.findx.com/{category}?{q}' +type_map = { + 'none': 'web', + 'general': 'web', + 'images': 'images', + 'videos': 'videos', +} + + +def request(query, params): + category = 'general' + if 'category' in params and len(params['category']) == 1: + category = params['category'][0] + + params['url'] = search_url.format( + category=type_map[category], + q=urlencode({ + 'q': query, + 'page': params['pageno'] + }) + ) + return params + + +def response(resp): + dom = html.fromstring(resp.text) + results_raw_json = dom.xpath(results_xpath) + results_json = loads(extract_text(results_raw_json)) + + if len(results_json['web']['results']) > 0: + return _general_results(results_json['web']['results']) + + if len(results_json['images']['results']) > 0: + return _images_results(results_json['images']['results']) + + if len(results_json['video']['results']) > 0: + 
return _videos_results(results_json['video']['results']) + + return [] + + +def _general_results(general_results): + results = [] + for result in general_results: + results.append({ + 'url': result['url'], + 'title': result['title'], + 'content': result['sum'], + }) + return results + + +def _images_results(image_results): + results = [] + for result in image_results: + results.append({ + 'url': result['sourceURL'], + 'title': result['title'], + 'content': result['source'], + 'thumbnail_src': _extract_url(result['assets']['thumb']['url']), + 'img_src': _extract_url(result['assets']['file']['url']), + 'template': 'images.html', + }) + return results + + +def _videos_results(video_results): + results = [] + for result in video_results: + if not result['kind'].startswith('youtube'): + logger.warning('Unknown video kind in findx: {}'.format(result['kind'])) + continue + + description = result['snippet']['description'] + if len(description) > 300: + description = description[:300] + '...' + + results.append({ + 'url': base_youtube_url + result['id'], + 'title': result['snippet']['title'], + 'content': description, + 'thumbnail': _extract_url(result['snippet']['thumbnails']['default']['url']), + 'publishedDate': parser.parse(result['snippet']['publishedAt']), + 'embedded': embedded_url.format(videoid=result['id']), + 'template': 'videos.html', + }) + return results + + +def _extract_url(url): + matching = re.search('(/https?://[^)]+)', url) + if matching: + return matching.group(0)[1:] + return '' diff --git a/searx/settings.yml b/searx/settings.yml index 73212f2e2..70750fc96 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -218,6 +218,24 @@ engines: shortcut : fd disabled : True + - name : findx + engine : findx + shortcut : fx + categories : general + disabled : True + + - name : findx images + engine : findx + shortcut : fxi + categories : images + disabled : True + + - name : findx videos + engine : findx + shortcut : fxv + categories : videos + disabled : 
True + - name : flickr categories : images shortcut : fl From 8cc529e9a3976e48676676600379ce43f690dd8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Thu, 22 Mar 2018 11:02:24 +0100 Subject: [PATCH 2/2] forward category to engine without highlighting on the ui --- searx/engines/findx.py | 6 +----- searx/query.py | 15 +++++++++++---- searx/search.py | 9 +++++++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/searx/engines/findx.py b/searx/engines/findx.py index 30a26402a..db4a1aa5f 100644 --- a/searx/engines/findx.py +++ b/searx/engines/findx.py @@ -33,12 +33,8 @@ type_map = { def request(query, params): - category = 'general' - if 'category' in params and len(params['category']) == 1: - category = params['category'][0] - params['url'] = search_url.format( - category=type_map[category], + category=type_map[params['category']], q=urlencode({ 'q': query, 'page': params['pageno'] diff --git a/searx/query.py b/searx/query.py index 6e5f2e883..f7543e3e1 100644 --- a/searx/query.py +++ b/searx/query.py @@ -107,14 +107,21 @@ class RawTextQuery(object): # check if prefix is equal with engine shortcut if prefix in engine_shortcuts: parse_next = True - self.engines.append({'category': 'none', - 'name': engine_shortcuts[prefix]}) + engine_name = engine_shortcuts[prefix] + if engine_name in engines: + for engine_category in engines[engine_name].categories: + self.engines.append({'category': engine_category, + 'name': engine_name, + 'from_bang': True}) # check if prefix is equal with engine name elif prefix in engines: parse_next = True - self.engines.append({'category': 'none', - 'name': prefix}) + if prefix in engines: + for engine_category in engines[prefix].categories: + self.engines.append({'category': engine_category, + 'name': prefix, + 'from_bang': True}) # check if prefix is equal with categorie name elif prefix in categories: diff --git a/searx/search.py b/searx/search.py index b523c2754..945f32197 100644 --- 
a/searx/search.py +++ b/searx/search.py @@ -258,8 +258,13 @@ def get_search_query_from_webapp(preferences, form): # if engines are calculated from query, # set categories by using that informations if query_engines and raw_text_query.specific: - query_categories = list(set(engine['category'] - for engine in query_engines)) + additional_categories = set() + for engine in query_engines: + if 'from_bang' in engine and engine['from_bang']: + additional_categories.add('none') + else: + additional_categories.add(engine['category']) + query_categories = list(additional_categories) # otherwise, using defined categories to # calculate which engines should be used