forked from Archives/searxng
[enh] Add onions category with Ahmia, Not Evil and Torch
Xpath engine and results template changed to account for the fact that archive.org doesn't cache .onions, though some onion engines might have their own cache. Disabled by default. Can be enabled by setting the SOCKS proxies to wherever Tor is listening and setting using_tor_proxy as True. Requires Tor and updating packages. To avoid manually adding the timeout on each engine, you can set extra_proxy_timeout to account for Tor's (or whatever proxy used) extra time.
dependabot/pip/master/sphinx-6.1.3
parent
0a44fa8bb7
commit
c3daa08537
@ -0,0 +1,82 @@
|
||||
"""
|
||||
Ahmia (Onions)
|
||||
|
||||
@website http://msydqstlz2kzerdg.onion
|
||||
@provides-api no
|
||||
|
||||
@using-api no
|
||||
@results HTML
|
||||
@stable no
|
||||
@parse url, title, content
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode, urlparse, parse_qs
|
||||
from lxml.html import fromstring
|
||||
from searx.engines.xpath import extract_url, extract_text
|
||||
|
||||
# Engine metadata: results are filed under the 'onions' category and paged.
categories = ['onions']
paging = True
# Number of results per page; response() slices the parsed list with it.
page_size = 10

# Search endpoint; {query} receives the url-encoded query string.
search_url = 'http://msydqstlz2kzerdg.onion/search/?{query}'
# Supported time ranges, mapped to the value request() sends as the 'd'
# url parameter (presumably a number of days).
time_range_support = True
time_range_dict = {'day': 1, 'week': 7, 'month': 30}

# XPath selectors applied to the result page.
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
number_of_results_xpath = '//*[@id="totalResults"]'
|
||||
|
||||
|
||||
def request(query, params):
    """Build the Ahmia search URL for ``query`` and store it in ``params``.

    The url-encoded query is inserted into ``search_url``.  When
    ``params['time_range']`` is one of the keys of ``time_range_dict``, the
    mapped value is appended as the ``d`` url parameter.

    :param query: the search terms.
    :param params: request parameter mapping; ``params['url']`` is set here.
    :returns: the same ``params`` mapping.
    """
    params['url'] = search_url.format(query=urlencode({'q': query}))

    # A missing or non-string time range means "no time filter"; reading it
    # with .get() and checking the type avoids KeyError (absent key) and
    # TypeError (unhashable value) in the membership test below.
    time_range = params.get('time_range')
    if isinstance(time_range, str) and time_range in time_range_dict:
        params['url'] += '&' + urlencode({'d': time_range_dict[time_range]})

    return params
|
||||
|
||||
|
||||
def response(resp):
    """Parse an Ahmia result page into a list of result dicts.

    The returned list contains, in this order:

    * one ``{'url', 'title', 'content', 'is_onion'}`` dict per result of the
      requested page,
    * one ``{'correction': ...}`` dict per spelling suggestion,
    * at most one ``{'number_of_results': int}`` dict.

    :param resp: response object; ``resp.text`` is the HTML page and
        ``resp.search_params`` is read for the ``pageno`` entry.
    :returns: list of result dicts.
    """
    results = []
    dom = fromstring(resp.text)

    # trim results so there's not way too many at once: only the slice that
    # belongs to the requested page is kept ('pageno' defaults to 1).
    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
    all_results = dom.xpath(results_xpath)
    trimmed_results = all_results[first_result_index:first_result_index + page_size]

    # get results
    for result in trimmed_results:
        # remove ahmia url and extract the actual url for the result: the
        # link's 'redirect_url' query parameter is used as the result url
        # (empty string when that parameter is absent).
        raw_url = extract_url(result.xpath(url_xpath), search_url)
        cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]

        title = extract_text(result.xpath(title_xpath))
        content = extract_text(result.xpath(content_xpath))

        results.append({'url': cleaned_url,
                        'title': title,
                        'content': content,
                        'is_onion': True})

    # get spelling corrections
    for correction in dom.xpath(correction_xpath):
        results.append({'correction': extract_text(correction)})

    # get number of results
    number_of_results = dom.xpath(number_of_results_xpath)
    if number_of_results:
        try:
            results.append({'number_of_results': int(extract_text(number_of_results))})
        except (TypeError, ValueError):
            # The total is optional: text that is not an integer is ignored.
            # Only the errors int() raises for bad input are caught, so
            # unrelated failures are no longer silently swallowed.
            pass

    return results
|
@ -0,0 +1,64 @@
|
||||
"""
|
||||
not Evil (Onions)
|
||||
|
||||
@website http://hss3uro2hsxfogfq.onion
|
||||
@provides-api yes (http://hss3uro2hsxfogfq.onion/api.htm)
|
||||
|
||||
@using-api no
|
||||
@results HTML
|
||||
@stable no
|
||||
@parse url, title, content
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.engines.xpath import extract_text
|
||||
|
||||
# Engine metadata: results are filed under the 'onions' category and paged.
categories = ['onions']
paging = True
# Number of rows requested per page (sent as 'numRows' by request()).
page_size = 20

# Site root and the relative search url; request() fills the placeholders
# and joins the two.
base_url = 'http://hss3uro2hsxfogfq.onion/'
search_url = 'index.php?{query}&hostLimit=20&start={pageno}&numRows={page_size}'

# XPath selectors applied to the result page; the url/title/content
# selectors are evaluated relative to each node matched by results_xpath.
results_xpath = '//*[@id="content"]/div/p'
url_xpath = './span[1]'
title_xpath = './a[1]'
content_xpath = './text()'
|
||||
|
||||
|
||||
# do search-request
|
||||
def request(query, params):
    """Build the not Evil search URL for ``query`` and store it in ``params``.

    ``params['pageno']`` (1-based) is converted to a row offset, which is
    what the ``start`` url parameter receives through the ``{pageno}``
    placeholder of ``search_url``.

    :param query: the search terms.
    :param params: request parameter mapping; must contain ``pageno``.
    :returns: the same ``params`` mapping, with ``params['url']`` set.
    """
    first_row = (params['pageno'] - 1) * page_size
    encoded_query = urlencode({'q': query})
    relative_url = search_url.format(pageno=first_row,
                                     query=encoded_query,
                                     page_size=page_size)
    params['url'] = base_url + relative_url
    return params
|
||||
|
||||
|
||||
# get response from search-request
|
||||
def response(resp):
    """Parse a not Evil result page into a list of result dicts.

    Each node matched by ``results_xpath`` yields one
    ``{'url', 'title', 'content', 'is_onion'}`` dict.  Nodes that lack the
    url or the title element are skipped.

    :param resp: response object; its ``encoding`` is forced to utf8 before
        ``resp.text`` is parsed.
    :returns: list of result dicts.
    """
    results = []

    # needed because otherwise requests guesses wrong encoding
    resp.encoding = 'utf8'
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        url_nodes = result.xpath(url_xpath)
        title_nodes = result.xpath(title_xpath)
        # A paragraph without a url span or a title link is not a search
        # result; skip it instead of failing the whole page with IndexError.
        if not url_nodes or not title_nodes:
            continue

        url = extract_text(url_nodes[0])
        title = extract_text(title_nodes[0])
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'is_onion': True})

    return results
|
@ -1,6 +1,11 @@
|
||||
<div class="result {{ result.class }}{% for e in result.engines %} {{ e }}{% endfor %}">
|
||||
<h3 class="result_title">{% if "icon_"~result.engine~".ico" in favicons %}<img width="14" height="14" class="favicon" src="{{ url_for('static', filename='img/icons/icon_'+result.engine+'.ico') }}" alt="{{result.engine}}" />{% endif %}<a href="{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ result.title|safe }}</a></h3>
|
||||
<p class="url">{{ result.pretty_url }}‎ <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
|
||||
<p class="url">{{ result.pretty_url }}‎
|
||||
{% if result.cached_url %}
|
||||
<a class="cache_link" href="{{ result.cached_url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
|
||||
{% elif not result.is_onion %}
|
||||
<a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
|
||||
{% endif %}
|
||||
{% if result.publishedDate %}<span class="published_date">{{ result.publishedDate }}</span>{% endif %}</p>
|
||||
<p class="content">{% if result.img_src %}<img src="{{ image_proxify(result.img_src) }}" class="image" />{% endif %}{% if result.content %}{{ result.content|safe }}<br class="last"/>{% endif %}</p>
|
||||
</div>
|
||||
|
@ -0,0 +1,121 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from collections import defaultdict
|
||||
import mock
|
||||
from searx.engines import xpath
|
||||
from searx.testing import SearxTestCase
|
||||
|
||||
|
||||
class TestXpathEngine(SearxTestCase):
    """Unit tests for the ``searx.engines.xpath`` engine module.

    The engine is configured by assigning module-level attributes on
    ``xpath`` (``search_url``, ``paging``, ``categories`` and the
    ``*_xpath`` selectors) before ``xpath.request`` / ``xpath.response``
    are called.
    """

    def test_request(self):
        """``request`` fills the query, and the page number when paging, into ``search_url``."""
        xpath.search_url = 'https://url.com/{query}'
        xpath.categories = []
        xpath.paging = False
        query = 'test_query'
        dicto = defaultdict(dict)
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual('https://url.com/test_query', params['url'])

        xpath.search_url = 'https://url.com/q={query}&p={pageno}'
        xpath.paging = True
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = xpath.request(query, dicto)
        self.assertIn('url', params)
        self.assertEqual('https://url.com/q=test_query&p=1', params['url'])

    def test_response(self):
        """``response`` extracts url/title/content, cached urls and the onion flag.

        The same checks run twice: first with absolute selectors (no
        ``results_xpath``), then with ``results_xpath`` set and selectors
        relative to each result node.
        """
        # without results_xpath
        xpath.url_xpath = '//div[@class="search_result"]//a[@class="result"]/@href'
        xpath.title_xpath = '//div[@class="search_result"]//a[@class="result"]'
        xpath.content_xpath = '//div[@class="search_result"]//p[@class="content"]'

        # objects without the attributes response() reads must raise
        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        # fixture markup is kept flush-left so the string literal is unchanged
        html = u"""
<div>
<div class="search_result">
<a class="result" href="https://result1.com">Result 1</a>
<p class="content">Content 1</p>
<a class="cached" href="https://cachedresult1.com">Cache</a>
</div>
<div class="search_result">
<a class="result" href="https://result2.com">Result 2</a>
<p class="content">Content 2</p>
<a class="cached" href="https://cachedresult2.com">Cache</a>
</div>
</div>
"""
        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, without results_xpath
        xpath.cached_xpath = '//div[@class="search_result"]//a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (no results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])

        # with results_xpath
        xpath.results_xpath = '//div[@class="search_result"]'
        xpath.url_xpath = './/a[@class="result"]/@href'
        xpath.title_xpath = './/a[@class="result"]'
        xpath.content_xpath = './/p[@class="content"]'
        xpath.cached_xpath = None
        xpath.categories = []

        self.assertRaises(AttributeError, xpath.response, None)
        self.assertRaises(AttributeError, xpath.response, [])
        self.assertRaises(AttributeError, xpath.response, '')
        self.assertRaises(AttributeError, xpath.response, '[]')

        response = mock.Mock(text='<html></html>')
        self.assertEqual(xpath.response(response), [])

        response = mock.Mock(text=html)
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], 'Result 1')
        self.assertEqual(results[0]['url'], 'https://result1.com/')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[1]['title'], 'Result 2')
        self.assertEqual(results[1]['url'], 'https://result2.com/')
        self.assertEqual(results[1]['content'], 'Content 2')

        # with cached urls, with results_xpath
        xpath.cached_xpath = './/a[@class="cached"]/@href'
        results = xpath.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
        self.assertFalse(results[0].get('is_onion', False))

        # results are onion urls (with results_xpath)
        xpath.categories = ['onions']
        results = xpath.response(response)
        self.assertTrue(results[0]['is_onion'])
|
@ -0,0 +1,44 @@
|
||||
from searx.testing import SearxTestCase
|
||||
from searx import settings, engines
|
||||
|
||||
|
||||
class TestEnginesInit(SearxTestCase):
    """Tests for ``engines.initialize_engines`` and the onion/Tor settings.

    The tests toggle ``settings['outgoing']['using_tor_proxy']`` and
    ``settings['outgoing']['extra_proxy_timeout']``; ``tearDownClass``
    puts both back to ``False`` / ``0`` afterwards.
    """

    @classmethod
    def tearDownClass(cls):
        """Reset the outgoing Tor settings modified by the tests."""
        outgoing = settings['outgoing']
        outgoing['using_tor_proxy'] = False
        outgoing['extra_proxy_timeout'] = 0

    def test_initialize_engines_default(self):
        """Two plain dummy engines are both registered."""
        first = {'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1'}
        second = {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2'}

        engines.initialize_engines([first, second])

        self.assertEqual(len(engines.engines), 2)
        for name in ('engine1', 'engine2'):
            self.assertIn(name, engines.engines)

    def test_initialize_engines_exclude_onions(self):
        """Without the Tor proxy, the 'onions' engine and category are left out."""
        settings['outgoing']['using_tor_proxy'] = False
        general_engine = {'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1',
                          'categories': 'general'}
        onion_engine = {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2',
                        'categories': 'onions'}

        engines.initialize_engines([general_engine, onion_engine])

        self.assertEqual(len(engines.engines), 1)
        self.assertIn('engine1', engines.engines)
        self.assertNotIn('onions', engines.categories)

    def test_initialize_engines_include_onions(self):
        """With the Tor proxy, onion engines load and onion url/timeout apply.

        The expected timeout of 120.0 equals the engine's own 20.0 plus the
        configured ``extra_proxy_timeout`` of 100.0.
        """
        outgoing = settings['outgoing']
        outgoing['using_tor_proxy'] = True
        outgoing['extra_proxy_timeout'] = 100.0
        general_engine = {'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1',
                          'categories': 'general', 'timeout': 20.0,
                          'onion_url': 'http://engine1.onion'}
        onion_engine = {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2',
                        'categories': 'onions'}

        engines.initialize_engines([general_engine, onion_engine])

        self.assertEqual(len(engines.engines), 2)
        for name in ('engine1', 'engine2'):
            self.assertIn(name, engines.engines)
        self.assertIn('onions', engines.categories)
        self.assertIn('http://engine1.onion', engines.engines['engine1'].search_url)
        self.assertEqual(engines.engines['engine1'].timeout, 120.0)
|
Loading…
Reference in New Issue