From a1e8af0796d532d529eb9d90f315f79dfbd86b0d Mon Sep 17 00:00:00 2001
From: Alexandre Flament
Date: Sat, 21 May 2022 18:24:47 +0200
Subject: [PATCH] bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP
requests at once
---
 searx/engines/bing.py              |  49 +++++++----
 searx/network/__init__.py          | 128 ++++++++++++++++++++---------
 searx/network/network.py           |  25 +++++-
 tests/unit/network/test_network.py |  14 ++--
 4 files changed, 156 insertions(+), 60 deletions(-)

diff --git a/searx/engines/bing.py b/searx/engines/bing.py
index 4c037de8..3d4ac08b 100644
--- a/searx/engines/bing.py
+++ b/searx/engines/bing.py
@@ -8,7 +8,8 @@
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
-from searx.utils import eval_xpath, extract_text, match_language
+from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
+from searx.network import multi_requests, Request
 
 about = {
     "website": 'https://www.bing.com',
@@ -79,30 +80,48 @@ def response(resp):
 
     dom = html.fromstring(resp.text)
 
-    for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
-
-        # IMO //div[@class="sa_cc"] does no longer match
-        logger.debug('found //div[@class="sa_cc"] --> %s', result)
-
-        link = eval_xpath(result, './/h3/a')[0]
-        url = link.attrib.get('href')
-        title = extract_text(link)
-        content = extract_text(eval_xpath(result, './/p'))
-
-        # append result
-        results.append({'url': url, 'title': title, 'content': content})
-
     # parse results again if nothing is found yet
-    for result in eval_xpath(dom, '//li[@class="b_algo"]'):
+
+    url_to_resolve = []
+    url_to_resolve_index = []
+    for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
 
         link = eval_xpath(result, './/h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
         content = extract_text(eval_xpath(result, './/p'))
 
+        # get the real URL, either from the URL shown to the user or by following the Bing redirect
+        if url.startswith('https://www.bing.com/ck/a?'):
+            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
+            # Bing can shorten the URL either at the end or in the middle of the string
+            if (
+                url_cite.startswith('https://')
+                and '…' not in url_cite
+                and '...' not in url_cite
+                and '›' not in url_cite
+            ):
+                # no need for an additional HTTP request
+                url = url_cite
+            else:
+                # resolve the URL with an additional HTTP request
+                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
+                url_to_resolve_index.append(i)
+                url = None  # remove the result if resolving the Bing redirect raises an exception
+
         # append result
         results.append({'url': url, 'title': title, 'content': content})
 
+    # resolve all Bing redirections in parallel
+    request_list = [
+        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
+    ]
+    response_list = multi_requests(request_list)
+    for i, redirect_response in enumerate(response_list):
+        if not isinstance(redirect_response, Exception):
+            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
+
+    # get number_of_results
     try:
         result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
         if "-" in result_len_container:
diff --git a/searx/network/__init__.py b/searx/network/__init__.py
index 06c9f75a..8622e973 100644
--- a/searx/network/__init__.py
+++ b/searx/network/__init__.py
@@ -8,7 +8,8 @@ import concurrent.futures
 from queue import SimpleQueue
 from types import MethodType
 from timeit import default_timer
-from typing import Iterable, Tuple
+from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
+from contextlib import contextmanager
 
 import httpx
 import anyio
@@ -48,9 +49,23 @@ def get_context_network():
     return THREADLOCAL.__dict__.get('network') or get_network()
 
 
-def request(method, url, **kwargs):
-    """same as requests/requests/api.py request(...)"""
+@contextmanager
+def _record_http_time():
+    # pylint: disable=too-many-branches
     time_before_request = default_timer()
+    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+    try:
+        yield start_time
+    finally:
+        # update total_time.
+        # See get_time_for_thread() and reset_time_for_thread()
+        if hasattr(THREADLOCAL, 'total_time'):
+            time_after_request = default_timer()
+            THREADLOCAL.total_time += time_after_request - time_before_request
+
+
+def _get_timeout(start_time, kwargs):
+    # pylint: disable=too-many-branches
 
     # timeout (httpx)
     if 'timeout' in kwargs:
@@ -65,45 +80,84 @@
 
     # adjust actual timeout
     timeout += 0.2  # overhead
-    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
     if start_time:
         timeout -= default_timer() - start_time
 
-    # raise_for_error
-    check_for_httperror = True
-    if 'raise_for_httperror' in kwargs:
-        check_for_httperror = kwargs['raise_for_httperror']
-        del kwargs['raise_for_httperror']
+    return timeout
 
-    # requests compatibility
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    # network
-    network = get_context_network()
-
-    # do request
-    future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
-    try:
-        response = future.result(timeout)
-    except concurrent.futures.TimeoutError as e:
-        raise httpx.TimeoutException('Timeout', request=None) from e
-
-    # requests compatibility
-    # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
-    response.ok = not response.is_error
-
-    # update total_time.
-    # See get_time_for_thread() and reset_time_for_thread()
-    if hasattr(THREADLOCAL, 'total_time'):
-        time_after_request = default_timer()
-        THREADLOCAL.total_time += time_after_request - time_before_request
-
-    # raise an exception
-    if check_for_httperror:
-        raise_for_httperror(response)
-
-    return response
+
+def request(method, url, **kwargs):
+    """same as requests/requests/api.py request(...)"""
+    with _record_http_time() as start_time:
+        network = get_context_network()
+        timeout = _get_timeout(start_time, kwargs)
+        future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
+        try:
+            return future.result(timeout)
+        except concurrent.futures.TimeoutError as e:
+            raise httpx.TimeoutException('Timeout', request=None) from e
+
+
+def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
+    """Send multiple HTTP requests in parallel and wait for all of them to finish."""
+    with _record_http_time() as start_time:
+        # send the requests
+        network = get_context_network()
+        loop = get_loop()
+        future_list = []
+        for request_desc in request_list:
+            timeout = _get_timeout(start_time, request_desc.kwargs)
+            future = asyncio.run_coroutine_threadsafe(
+                network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
+            )
+            future_list.append((future, timeout))
+
+        # read the responses
+        responses = []
+        for future, timeout in future_list:
+            try:
+                responses.append(future.result(timeout))
+            except concurrent.futures.TimeoutError:
+                responses.append(httpx.TimeoutException('Timeout', request=None))
+            except Exception as e:  # pylint: disable=broad-except
+                responses.append(e)
+        return responses
+
+
+class Request(NamedTuple):
+    """Request description for the multi_requests function"""
+
+    method: str
+    url: str
+    kwargs: Dict[str, str] = {}
+
+    @staticmethod
+    def get(url, **kwargs):
+        return Request('GET', url, kwargs)
+
+    @staticmethod
+    def options(url, **kwargs):
+        return Request('OPTIONS', url, kwargs)
+
+    @staticmethod
+    def head(url, **kwargs):
+        return Request('HEAD', url, kwargs)
+
+    @staticmethod
+    def post(url, **kwargs):
+        return Request('POST', url, kwargs)
+
+    @staticmethod
+    def put(url, **kwargs):
+        return Request('PUT', url, kwargs)
+
+    @staticmethod
+    def patch(url, **kwargs):
+        return Request('PATCH', url, kwargs)
+
+    @staticmethod
+    def delete(url, **kwargs):
+        return Request('DELETE', url, kwargs)
 
 
 def get(url, **kwargs):
diff --git a/searx/network/network.py b/searx/network/network.py
index 69af3b7c..677a908b 100644
--- a/searx/network/network.py
+++ b/searx/network/network.py
@@ -13,6 +13,7 @@
 import httpx
 
 from searx import logger, searx_debug
 from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
+from .raise_for_httperror import raise_for_httperror
 
 logger = logger.getChild('network')
@@ -226,6 +227,27 @@
             kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
         return kwargs_clients
 
+    @staticmethod
+    def extract_do_raise_for_httperror(kwargs):
+        do_raise_for_httperror = True
+        if 'raise_for_httperror' in kwargs:
+            do_raise_for_httperror = kwargs['raise_for_httperror']
+            del kwargs['raise_for_httperror']
+        return do_raise_for_httperror
+
+    @staticmethod
+    def patch_response(response, do_raise_for_httperror):
+        if isinstance(response, httpx.Response):
+            # requests compatibility (response is not streamed)
+            # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
+            response.ok = not response.is_error
+
+            # raise an exception
+            if do_raise_for_httperror:
+                raise_for_httperror(response)
+
+        return response
+
     def is_valid_response(self, response):
         # pylint: disable=too-many-boolean-expressions
         if (
@@ -239,6 +261,7 @@
     async def call_client(self, stream, method, url, **kwargs):
         retries = self.retries
         was_disconnected = False
+        do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
         kwargs_clients = Network.extract_kwargs_clients(kwargs)
         while retries >= 0:  # pragma: no cover
             client = await self.get_client(**kwargs_clients)
@@ -248,7 +271,7 @@
                 else:
                     response = await client.request(method, url, **kwargs)
                 if self.is_valid_response(response) or retries <= 0:
-                    return response
+                    return Network.patch_response(response, do_raise_for_httperror)
             except httpx.RemoteProtocolError as e:
                 if not was_disconnected:
                     # the server has closed the connection:
diff --git a/tests/unit/network/test_network.py b/tests/unit/network/test_network.py
index 4253e69a..905b981c 100644
--- a/tests/unit/network/test_network.py
+++ b/tests/unit/network/test_network.py
@@ -141,28 +141,28 @@ class TestNetworkRequestRetries(SearxTestCase):
     async def test_retries_ok(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=1, retry_on_http_error=403)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
             await network.aclose()
 
     async def test_retries_fail_int(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
     async def test_retries_fail_list(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429])
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
     async def test_retries_fail_bool(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=True)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
@@ -178,7 +178,7 @@ class TestNetworkRequestRetries(SearxTestCase):
 
         with patch.object(httpx.AsyncClient, 'request', new=get_response):
             network = Network(enable_http=True, retries=2)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 200)
             self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
             await network.aclose()
@@ -190,7 +190,7 @@
 
         with patch.object(httpx.AsyncClient, 'request', new=get_response):
             network = Network(enable_http=True, retries=0)
             with self.assertRaises(httpx.RequestError):
-                await network.request('GET', 'https://example.com/')
+                await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             await network.aclose()
 
 
@@ -237,6 +237,6 @@
 
         with patch.object(httpx.AsyncClient, 'stream', new=stream):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
-            response = await network.stream('GET', 'https://example.com/')
+            response = await network.stream('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
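
Usage sketch (illustrative, not part of the patch): with this change an
engine can resolve several redirect URLs in one parallel batch. The URL
list below is a made-up placeholder; allow_redirects=False is the same
flag bing.py passes so the redirect target stays in the Location header
instead of being followed:

    from searx.network import multi_requests, Request

    # hypothetical Bing click-tracking URLs collected while parsing a page
    urls = [
        'https://www.bing.com/ck/a?...',
        'https://www.bing.com/ck/a?...',
    ]

    # one Request per URL; per-request failures come back as exception
    # objects in the result list instead of being raised
    request_list = [Request.get(u, allow_redirects=False) for u in urls]

    for response in multi_requests(request_list):
        if not isinstance(response, Exception):
            print(response.headers.get('location'))

Because multi_requests returns exceptions rather than raising them, the
response list stays aligned with the request list, which is what bing.py
relies on when it maps resolved URLs back through url_to_resolve_index.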