searxng/searx/engines/brave.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
news, videos, images).  The support of :py:obj:`paging` and :py:obj:`time range
<time_range_support>` is limited (see remarks).

Configured ``brave`` engines:

.. code:: yaml

  - name: brave
    engine: brave
    ...
    brave_category: search
    time_range_support: true
    paging: true

  - name: brave.images
    engine: brave
    ...
    brave_category: images

  - name: brave.videos
    engine: brave
    ...
    brave_category: videos

  - name: brave.news
    engine: brave
    ...
    brave_category: news


Implementations
===============

"""
# pylint: disable=fixme

from urllib.parse import (
    urlencode,
    urlparse,
    parse_qs,
)

import chompjs
from lxml import html

from searx.utils import (
    extract_text,
    eval_xpath_list,
    eval_xpath_getindex,
)

# Descriptive metadata about the Brave search site this engine scrapes.
about = {
    'website': 'https://search.brave.com/',
    'wikidata_id': 'Q22906900',
    'official_api_documentation': None,
    'use_official_api': False,
    'require_api_key': False,
    'results': 'HTML',
}

# Prefix of every request URL; the value of ``brave_category`` is appended to it.
base_url = 'https://search.brave.com/'
categories = []
brave_category = 'search'
"""Selects the Brave search vertical an engine instance queries; the value is
used as the path of the request URL.

- ``search``: common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
"""

brave_spellcheck = False
"""Brave offers some kind of spell checking: when activated, Brave tries to fix
typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  The UI
of Brave warns the user about such a correction, SearXNG has no means to show
that warning, therefore the spell checking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Paging is only available for :py:obj:`brave_category` ``search`` (UI
category All)."""

safesearch = True
# SearXNG safesearch level --> value of Brave's ``safesearch`` cookie
safesearch_map = {0: 'off', 1: 'moderate', 2: 'strict'}

time_range_support = False
"""Time-range filtering is only available for :py:obj:`brave_category`
``search`` (UI category All)."""

# SearXNG time range --> value of Brave's ``tf`` URL argument
time_range_map = {'day': 'pd', 'week': 'pw', 'month': 'pm', 'year': 'py'}


def request(query, params):
    """Prepare the outgoing HTTP request for a Brave query.

    The function fills ``params`` in place: it sets the ``Accept-Encoding``
    header, builds ``params['url']`` from :py:obj:`base_url`,
    :py:obj:`brave_category` and the URL arguments, and stores the user
    preferences (safesearch, country, UI language) in ``params['cookies']``.

    :param query: search term entered by the user
    :param params: request parameters; read are ``pageno``, ``time_range``,
        ``safesearch`` and ``searxng_locale``, modified are ``headers``,
        ``url`` and ``cookies``
    """

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    query_args = {'q': query}
    if brave_spellcheck:
        query_args['spellcheck'] = '1'

    # offset and time filter are only sent to the WEB search
    if brave_category == 'search':
        page_offset = params.get('pageno', 1) - 1
        if page_offset:
            # the offset is zero based, the first page is requested without it
            query_args['offset'] = page_offset
        brave_time_range = time_range_map.get(params['time_range'])
        if brave_time_range:
            query_args['tf'] = brave_time_range

    params["url"] = f"{base_url}{brave_category}?{urlencode(query_args)}"

    # the preferences are transported in cookies
    safesearch_cookie = safesearch_map.get(params['safesearch'], 'off')
    cookies = params['cookies']
    cookies['safesearch'] = safesearch_cookie

    # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
    #       limited and the country handling has it quirks

    eng_locale = params.get('searxng_locale')
    cookies['useLocation'] = '0'  # the useLocation is IP based, we use 'country'
    cookies['summarizer'] = '0'

    if eng_locale and eng_locale != 'all':
        # e.g. 'de-AT' --> country 'at' and ui_lang 'de'; a locale without a
        # '-' yields the same value for both cookies
        locale_parts = eng_locale.split('-')
        cookies['country'] = locale_parts[-1].lower()
        cookies['ui_lang'] = locale_parts[0].lower()
    else:
        cookies['country'] = 'all'  # country=all

    # logger.debug("cookies %s", params['cookies'])


def response(resp):
    """Turn Brave's HTTP response into a list of result items.

    The WEB search (``search``) is parsed from the HTML.  For the other
    categories the results are taken from the JavaScript object assigned to
    ``const data`` in the page.

    :param resp: HTTP response, only ``resp.text`` is read
    :returns: list of result dictionaries, an empty list for a
        :py:obj:`brave_category` without a parser
    """

    if brave_category == 'search':
        return _parse_search(resp)

    # take the first line carrying the JS assignment, drop the assignment
    # prefix and the last character of the stripped remainder
    marker = "const data = "
    data_line = next((line for line in resp.text.split("\n") if marker in line), None)
    datastr = "" if data_line is None else data_line.replace(marker, "").strip()[:-1]

    json_data = chompjs.parse_js_object(datastr)
    payload = json_data[1]['data']['body']['response']

    if brave_category == 'news':
        # the news results are nested one level deeper
        return _parse_news(payload['news'])

    parser = {'images': _parse_images, 'videos': _parse_videos}.get(brave_category)
    if parser is None:
        return []
    return parser(payload)


def _parse_search(resp):
    """Parse the HTML page of a Brave WEB search.

    :param resp: HTTP response, only ``resp.text`` is read
    :returns: list of result dictionaries: an optional ``answer`` item followed
        by one item per snippet that has a URL and a title element
    """

    result_list = []
    dom = html.fromstring(resp.text)

    # Elements are compared with ``None`` explicitly: the truth value of a
    # lxml element is False when it has no child elements, a tag containing
    # only text would otherwise be treated like a missing tag.

    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    if answer_tag is not None:
        answer = extract_text(answer_tag)
        if answer:
            result_list.append({'answer': answer})

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
        if not url or title_tag is None:
            # not a regular result snippet
            continue

        content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
        img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'img_src': img_src,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was mostoften not a
            # video, except the ones from youtube ..

            video_thumbnail = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
            iframe_src = _get_iframe_src(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                item['thumbnail'] = video_thumbnail
            elif video_thumbnail:
                # keep the thumb found above when the video tag has no image
                item['img_src'] = video_thumbnail

        result_list.append(item)

    return result_list


def _get_iframe_src(url):
    parsed_url = urlparse(url)
    if parsed_url.path == '/watch' and parsed_url.query:
        video_id = parse_qs(parsed_url.query).get('v', [])  # type: ignore
        if video_id:
            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]  # type: ignore
    return None


def _parse_news(json_resp):
    result_list = []

    for result in json_resp["results"]:
        item = {
            'url': result['url'],
            'title': result['title'],
            'content': result['description'],
        }
        if result['thumbnail'] != "null":
            item['img_src'] = result['thumbnail']['src']
        result_list.append(item)

    return result_list


def _parse_images(json_resp):
    result_list = []

    for result in json_resp["results"]:
        item = {
            'url': result['url'],
            'title': result['title'],
            'content': result['description'],
            'template': 'images.html',
            'img_format': result['properties']['format'],
            'source': result['source'],
            'img_src': result['properties']['url'],
        }
        result_list.append(item)

    return result_list


def _parse_videos(json_resp):
    """Convert the video entries of Brave's page data into result items.

    :param json_resp: mapping with a ``results`` list; of each entry ``url``,
        ``title``, ``description``, the ``duration`` of ``video`` and
        (optional) ``thumbnail`` are read
    :returns: list of result dictionaries for the ``videos.html`` template;
        ``thumbnail`` and ``iframe_src`` are only set when available
    """
    result_list = []

    for result in json_resp["results"]:

        url = result['url']
        duration = result['video']['duration']
        item = {
            'url': url,
            'title': result['title'],
            'content': result['description'],
            'template': 'videos.html',
            'length': duration,
            'duration': duration,
        }

        # An entry without a thumbnail may carry ``None`` (a parsed JS
        # ``null``) or the string "null" here; only a mapping with a ``src``
        # is usable, everything else must not be subscripted.
        thumbnail = result.get('thumbnail')
        if isinstance(thumbnail, dict) and thumbnail.get('src'):
            item['thumbnail'] = thumbnail['src']

        iframe_src = _get_iframe_src(url)
        if iframe_src:
            item['iframe_src'] = iframe_src

        result_list.append(item)

    return result_list
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`# lint: pylint`
			"""Brave supports the categories listed in :py:obj:`brave_category` (General,
			news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
			<time_range_support>` is limited (see remarks).

			Configured ``brave`` engines:

			`.. code:: yaml`

			`- name: brave`
			`engine: brave`
			`...`
			`brave_category: search`
			`time_range_support: true`
			`paging: true`

			`- name: brave.images`
			`engine: brave`
			`...`
			`brave_category: images`

			`- name: brave.videos`
			`engine: brave`
			`...`
			`brave_category: videos`

			`- name: brave.news`
			`engine: brave`
			`...`
			`brave_category: news`


			`Implementations`
			`===============`

[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00			`"""`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`# pylint: disable=fixme`

			`from urllib.parse import (`
			`urlencode,`
			`urlparse,`
			`parse_qs,`
			`)`
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00
[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00			`import chompjs`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`from lxml import html`

			`from searx.utils import (`
			`extract_text,`
			`eval_xpath_list,`
			`eval_xpath_getindex,`
			`)`
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00
			`about = {`
			`"website": 'https://search.brave.com/',`
			`"wikidata_id": 'Q22906900',`
			`"official_api_documentation": None,`
			`"use_official_api": False,`
			`"require_api_key": False,`
			`"results": 'HTML',`
			`}`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00			`base_url = "https://search.brave.com/"`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`categories = []`
			`brave_category = 'search'`
			`"""Brave supports common web-search, video search, image and video search.`

			- ``search``: Common WEB search
			- ``videos``: search for videos
			- ``images``: search for images
			- ``news``: search for news
			`"""`

			`brave_spellcheck = False`
			`"""Brave supports some kind of spell checking. When activated, Brave tries to`
			fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
			`the UI of Brave the user gets warned about this, since we can not warn the user`
			`in SearXNG, the spellchecking is disabled by default.`
			`"""`

			`send_accept_language_header = True`
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00			`paging = False`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
			`category All)."""`

			`safesearch = True`
			`safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off`

			`time_range_support = False`
			"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
			`category All)."""`

			`time_range_map = {`
			`'day': 'pd',`
			`'week': 'pw',`
			`'month': 'pm',`
			`'year': 'py',`
			`}`
[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00
			`def request(query, params):`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00
			`# Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787`
			`params['headers']['Accept-Encoding'] = 'gzip, deflate'`

[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00			`args = {`
			`'q': query,`
			`}`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`if brave_spellcheck:`
			`args['spellcheck'] = '1'`

			`if brave_category == 'search':`
			`if params.get('pageno', 1) - 1:`
			`args['offset'] = params.get('pageno', 1) - 1`
			`if time_range_map.get(params['time_range']):`
			`args['tf'] = time_range_map.get(params['time_range'])`

			`params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"`

			`# set preferences in cookie`
			`params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')`

			`# ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are`
			`# limited and the country handling has it quirks`

			`eng_locale = params.get('searxng_locale')`
			`params['cookies']['useLocation'] = '0' # the useLocation is IP based, we use 'country'`
			`params['cookies']['summarizer'] = '0'`
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`if not eng_locale or eng_locale == 'all':`
			`params['cookies']['country'] = 'all' # country=all`
			`else:`
			`params['cookies']['country'] = eng_locale.split('-')[-1].lower()`
			`params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()`

			`# logger.debug("cookies %s", params['cookies'])`
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00
[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00			`def response(resp):`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00
			`if brave_category == 'search':`
			`return _parse_search(resp)`
[feat] engine: brave - support for images 2023-08-05 17:46:04 +00:00
[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00			`datastr = ""`
			`for line in resp.text.split("\n"):`
			`if "const data = " in line:`
			`datastr = line.replace("const data = ", "").strip()[:-1]`
			`break`

			`json_data = chompjs.parse_js_object(datastr)`
[feat] engine: brave - support for news 2023-08-05 18:35:04 +00:00			`json_resp = json_data[1]['data']['body']['response']`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00
			`if brave_category == 'news':`
[feat] engine: brave - support for news 2023-08-05 18:35:04 +00:00			`json_resp = json_resp['news']`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`return _parse_news(json_resp)`

			`if brave_category == 'images':`
			`return _parse_images(json_resp)`
			`if brave_category == 'videos':`
			`return _parse_videos(json_resp)`

			`return []`


			`def _parse_search(resp):`

			`result_list = []`
			`dom = html.fromstring(resp.text)`

			`answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)`
			`if answer_tag:`
			`result_list.append({'answer': extract_text(answer_tag)})`

			`# xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'`
			`xpath_results = '//div[contains(@class, "snippet")]'`

			`for result in eval_xpath_list(dom, xpath_results):`

			`url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)`
			`title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)`
			`if not (url and title_tag):`
			`continue`

			`content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')`
			`img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')`

			`item = {`
			`'url': url,`
			`'title': extract_text(title_tag),`
			`'content': extract_text(content_tag),`
			`'img_src': img_src,`
			`}`

			`video_tag = eval_xpath_getindex(`
			`result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None`
			`)`
			`if video_tag:`

			`# In my tests a video tag in the WEB search was mostoften not a`
			`# video, except the ones from youtube ..`

			`iframe_src = _get_iframe_src(url)`
			`if iframe_src:`
			`item['iframe_src'] = iframe_src`
			`item['template'] = 'videos.html'`
			`item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')`
			`else:`
			`item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')`

			`result_list.append(item)`

			`return result_list`


			`def _get_iframe_src(url):`
			`parsed_url = urlparse(url)`
			`if parsed_url.path == '/watch' and parsed_url.query:`
			`video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore`
			`if video_id:`
			`return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore`
			`return None`


			`def _parse_news(json_resp):`
			`result_list = []`
[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00
[feat] engine: brave - support for news 2023-08-05 18:35:04 +00:00			`for result in json_resp["results"]:`
[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00			`item = {`
			`'url': result['url'],`
			`'title': result['title'],`
			`'content': result['description'],`
			`}`
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`if result['thumbnail'] != "null":`
			`item['img_src'] = result['thumbnail']['src']`
			`result_list.append(item)`

			`return result_list`


			`def _parse_images(json_resp):`
			`result_list = []`

			`for result in json_resp["results"]:`
			`item = {`
			`'url': result['url'],`
			`'title': result['title'],`
			`'content': result['description'],`
			`'template': 'images.html',`
			`'img_format': result['properties']['format'],`
			`'source': result['source'],`
			`'img_src': result['properties']['url'],`
			`}`
			`result_list.append(item)`

			`return result_list`


			`def _parse_videos(json_resp):`
			`result_list = []`

			`for result in json_resp["results"]:`

			`url = result['url']`
			`item = {`
			`'url': url,`
			`'title': result['title'],`
			`'content': result['description'],`
			`'template': 'videos.html',`
			`'length': result['video']['duration'],`
			`'duration': result['video']['duration'],`
			`}`

[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00			`if result['thumbnail'] != "null":`
			`item['thumbnail'] = result['thumbnail']['src']`

[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`iframe_src = _get_iframe_src(url)`
			`if iframe_src:`
			`item['iframe_src'] = iframe_src`
[feat] engine: brave - support for news 2023-08-05 18:35:04 +00:00
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`result_list.append(item)`
[feat] engine: brave - support for videos 2023-08-05 18:25:10 +00:00
[mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2023-08-06 17:35:56 +00:00			`return result_list`