Merge pull request #91 from return42/xpath-misc

[doc] add documentation about the XPath engine
pull/93/head
Markus Heiser 3 years ago committed by GitHub
commit 703f8c4a8b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -43,7 +43,7 @@ argument type information
categories list pages, in which the engine is working categories list pages, in which the engine is working
paging boolean support multiple pages paging boolean support multiple pages
time_range_support boolean support search time range time_range_support boolean support search time range
engine_type str ``online`` by default, other possible values are engine_type str ``online`` by default, other possible values are
``offline``, ``online_dictionnary``, ``online_currency`` ``offline``, ``online_dictionnary``, ``online_currency``
======================= =========== ======================================================== ======================= =========== ========================================================
@ -100,6 +100,8 @@ example code
paging = True paging = True
.. _engine request:
making a request making a request
================ ================
@ -198,6 +200,8 @@ example code
return params return params
.. _engine results:
returned results returned results
================ ================

@ -9,6 +9,7 @@ Developer documentation
quickstart quickstart
contribution_guide contribution_guide
engine_overview engine_overview
xpath_engine
search_api search_api
plugins plugins
translation translation

@ -0,0 +1,9 @@
.. _xpath_engine:
================
The XPath engine
================
.. automodule:: searx.engines.xpath
:members:

@ -4,7 +4,8 @@ Welcome to searxng
*Search without being tracked.* *Search without being tracked.*
.. warning:: .. hint::
This is not searx, but searxng. This is not searx, but searxng.
Searxng is a free internet metasearch engine which aggregates results from more Searxng is a free internet metasearch engine which aggregates results from more

@ -1,51 +1,106 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# pylint: disable=missing-function-docstring
"""The XPath engine is a *generic* engine with which it is possible to configure
engines in the settings.
Here is a simple example of an XPath engine configured in the
:ref:`settings engine` section; for further reading see :ref:`engines-dev`.
.. code:: yaml
- name : bitbucket
engine : xpath
paging : True
search_url : https://bitbucket.org/repo/all/{pageno}?name={query}
url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href
title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]
content_xpath : //article[@class="repo-summary"]/p
"""
from lxml import html
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
from searx import logger
logger = logger.getChild('XPath engine')
search_url = None search_url = None
"""
Search URL of the engine, replacements are:
``{query}``:
Search terms from user.
``{pageno}``:
Page number if engine supports paging :py:obj:`paging`
"""
soft_max_redirects = 0
'''Maximum redirects, soft limit. Record an error but don't stop the engine'''
results_xpath = ''
'''XPath selector for the list of result items'''
url_xpath = None url_xpath = None
'''XPath selector of result's ``url``.'''
content_xpath = None content_xpath = None
'''XPath selector of result's ``content``.'''
title_xpath = None title_xpath = None
'''XPath selector of result's ``title``.'''
thumbnail_xpath = False thumbnail_xpath = False
paging = False '''XPath selector of result's ``img_src``.'''
suggestion_xpath = '' suggestion_xpath = ''
results_xpath = '' '''XPath selector of result's ``suggestion``.'''
cached_xpath = '' cached_xpath = ''
cached_url = '' cached_url = ''
soft_max_redirects = 0
# parameters for engines with paging support paging = False
# '''Engine supports paging [True or False].'''
# number of results on each page
# (only needed if the site requires not a page number, but an offset)
page_size = 1 page_size = 1
# number of the first page (usually 0 or 1) '''Number of results on each page. Only needed if the site requires not a page
first_page_num = 1 number, but an offset.'''
first_page_num = 1
'''Number of the first page (usually 0 or 1).'''
def request(query, params): def request(query, params):
'''Build request parameters (see :ref:`engine request`).
'''
query = urlencode({'q': query})[2:] query = urlencode({'q': query})[2:]
fp = {'query': query} fargs = {'query': query}
if paging and search_url.find('{pageno}') >= 0: if paging and search_url.find('{pageno}') >= 0:
fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num fargs['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
params['url'] = search_url.format(**fp) params['url'] = search_url.format(**fargs)
params['query'] = query params['query'] = query
params['soft_max_redirects'] = soft_max_redirects params['soft_max_redirects'] = soft_max_redirects
logger.debug("query_url --> %s", params['url'])
return params return params
def response(resp): def response(resp):
'''Scrape *results* from the response (see :ref:`engine results`).
'''
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
is_onion = True if 'onions' in categories else False # pylint: disable=undefined-variable is_onion = 'onions' in categories # pylint: disable=undefined-variable
if results_xpath: if results_xpath:
for result in eval_xpath_list(dom, results_xpath): for result in eval_xpath_list(dom, results_xpath):
url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url) url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
title = extract_text(eval_xpath_list(result, title_xpath, min_len=1)) title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
content = extract_text(eval_xpath_list(result, content_xpath, min_len=1)) content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
@ -59,13 +114,16 @@ def response(resp):
# add alternative cached url if available # add alternative cached url if available
if cached_xpath: if cached_xpath:
tmp_result['cached_url'] = cached_url\ tmp_result['cached_url'] = (
cached_url
+ extract_text(eval_xpath_list(result, cached_xpath, min_len=1)) + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
)
if is_onion: if is_onion:
tmp_result['is_onion'] = True tmp_result['is_onion'] = True
results.append(tmp_result) results.append(tmp_result)
else: else:
if cached_xpath: if cached_xpath:
for url, title, content, cached in zip( for url, title, content, cached in zip(
@ -75,8 +133,12 @@ def response(resp):
map(extract_text, eval_xpath_list(dom, content_xpath)), map(extract_text, eval_xpath_list(dom, content_xpath)),
map(extract_text, eval_xpath_list(dom, cached_xpath)) map(extract_text, eval_xpath_list(dom, cached_xpath))
): ):
results.append({'url': url, 'title': title, 'content': content, results.append({
'cached_url': cached_url + cached, 'is_onion': is_onion}) 'url': url,
'title': title,
'content': content,
'cached_url': cached_url + cached, 'is_onion': is_onion
})
else: else:
for url, title, content in zip( for url, title, content in zip(
(extract_url(x, search_url) for (extract_url(x, search_url) for
@ -84,10 +146,16 @@ def response(resp):
map(extract_text, eval_xpath_list(dom, title_xpath)), map(extract_text, eval_xpath_list(dom, title_xpath)),
map(extract_text, eval_xpath_list(dom, content_xpath)) map(extract_text, eval_xpath_list(dom, content_xpath))
): ):
results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion}) results.append({
'url': url,
if not suggestion_xpath: 'title': title,
return results 'content': content,
for suggestion in eval_xpath(dom, suggestion_xpath): 'is_onion': is_onion
results.append({'suggestion': extract_text(suggestion)}) })
if suggestion_xpath:
for suggestion in eval_xpath(dom, suggestion_xpath):
results.append({'suggestion': extract_text(suggestion)})
logger.debug("found %s results", len(results))
return results return results

Loading…
Cancel
Save