[fix] dailymotion engine: filter by language & country

- fix the issue of fetching more than 7000 *languages*
- improve the request function and filter by language & country
- implement time_range_support & safesearch
- add more fields to the response from dailymotion (allow_embed, length)
- better clean up of HTML tags in the 'content' field.

This is more or less a complete rework based on the '/videos' API from [1].
This patch cleans up the language list in SearXNG, which has been polluted by
the 2- and 3-letter ISO-639-3 codes from the dailymotion languages, none of
which have ever been used.

[1] https://developers.dailymotion.com/tools/

Closes: https://github.com/searxng/searxng/issues/1065
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
dependabot/pip/master/sphinx-6.1.3
Markus Heiser 2 years ago
parent 27f8fa6fe0
commit 3bb62823ec

File diff suppressed because it is too large Load Diff

@ -1,12 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" """Dailymotion (Videos)
Dailymotion (Videos)
""" """
from json import loads from typing import Set
from datetime import datetime from datetime import datetime, timedelta
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import match_language, html_to_text import time
import babel
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.utils import html_to_text
# about # about
about = { about = {
@ -21,23 +26,78 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos'] categories = ['videos']
paging = True paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
# search-url safesearch = True
# see http://www.dailymotion.com/doc/api/obj-video.html safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''}
search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa
supported_languages_url = 'https://api.dailymotion.com/languages'
# search-url
# - https://developers.dailymotion.com/tools/
# - https://www.dailymotion.com/doc/api/obj-video.html
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
search_url = (
'https://api.dailymotion.com/videos?'
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
).format(
fields=','.join(result_fields),
password_protected= 'false',
private='false',
sort='relevance',
limit=number_of_results,
)
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
# The request query filters by 'languages' & 'country', therefore instead of
# fetching only languages we need to fetch locales.
supported_languages_url = 'https://api.dailymotion.com/locales'
# do search-request
def request(query, params): def request(query, params):
if params['language'] == 'all':
locale = 'en-US'
else:
locale = match_language(params['language'], supported_languages)
params['url'] = search_url.format( if not query:
query=urlencode({'search': query, 'localization': locale}), pageno=params['pageno'] return False
)
language = params['language']
if language == 'all':
language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
query_args = {
'search': query,
'languages': locale.language,
'page': params['pageno'],
}
if locale.territory:
localization = locale.language + '_' + locale.territory
if localization in supported_languages:
query_args['country'] = locale.territory
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
query_args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(query_args)
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '')
params['raise_for_httperror'] = False
return params return params
@ -46,34 +106,51 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
search_res = loads(resp.text) search_res = resp.json()
# check for an API error
if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message'))
# return empty array if there are no results raise_for_httperror(resp)
if 'list' not in search_res:
return []
# parse results # parse results
for res in search_res['list']: for res in search_res.get('list', []):
title = res['title'] title = res['title']
url = res['url'] url = res['url']
content = html_to_text(res['description']) content = html_to_text(res['description'])
thumbnail = res['thumbnail_360_url'] if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None) publishedDate = datetime.fromtimestamp(res['created_time'], None)
# http to https length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://") thumbnail = thumbnail.replace("http://", "https://")
results.append( item = {
{ 'template': 'videos.html',
'template': 'videos.html', 'url': url,
'url': url, 'title': title,
'title': title, 'content': content,
'content': content, 'publishedDate': publishedDate,
'publishedDate': publishedDate, 'length': length,
'iframe_src': "https://www.dailymotion.com/embed/video/" + res['id'], 'thumbnail': thumbnail,
'thumbnail': thumbnail, }
}
) # HINT: no matter what the value is, without an API token videos can't be
# shown embedded
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results # return results
return results return results
@ -81,18 +158,8 @@ def response(resp):
# get supported languages from their site # get supported languages from their site
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = {} response_json = resp.json()
return [
response_json = loads(resp.text) item['locale']
for item in response_json['list']
for language in response_json['list']: ]
supported_languages[language['code']] = {}
name = language['native_name']
if name:
supported_languages[language['code']]['name'] = name
english_name = language['name']
if english_name:
supported_languages[language['code']]['english_name'] = english_name
return supported_languages

@ -2,9 +2,7 @@
# list of language codes # list of language codes
# this file is generated automatically by utils/fetch_languages.py # this file is generated automatically by utils/fetch_languages.py
language_codes = ( language_codes = (
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'), ('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'),
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'), ('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'), ('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'), ('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
@ -28,20 +26,15 @@ language_codes = (
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'), ('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'), ('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'), ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'),
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'), ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
('fr', 'Français', '', 'French', '\U0001f310'), ('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'), ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'), ('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'),
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'), ('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'),
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'), ('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'), ('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
@ -63,13 +56,10 @@ language_codes = (
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'), ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'), ('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'), ('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'),
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'), ('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'),
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'), ('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'), ('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'), ('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'),
('zh', '中文', '', 'Chinese', '\U0001f310'), ('zh', '中文', '', 'Chinese', '\U0001f310'),
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'), ('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'), ('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),

@ -129,8 +129,8 @@ class TestLanguageParser(SearxTestCase):
query = RawTextQuery(':hu-H', []) query = RawTextQuery(':hu-H', [])
self.assertEqual(query.autocomplete_list, [":hu-hu"]) self.assertEqual(query.autocomplete_list, [":hu-hu"])
query = RawTextQuery(':v', []) query = RawTextQuery(':zh-', [])
self.assertEqual(query.autocomplete_list, [':vi', ':tiếng việt', ':việt_nam']) self.assertEqual(query.autocomplete_list, [':zh-cn', ':zh-hk', ':zh-tw'])
class TestTimeoutParser(SearxTestCase): class TestTimeoutParser(SearxTestCase):

Loading…
Cancel
Save