Merge pull request #2224 from dalf/update-infobox-engines

[enh] update infobox engines
dependabot/pip/master/sphinx-6.1.3
Noémi Ványi 4 years ago committed by GitHub
commit 10ddd421f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -2,7 +2,8 @@ import json
from pathlib import Path from pathlib import Path
__init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'bangs_loader', 'ahmia_blacklist_loader'] __init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'EXTERNAL_URLS', 'WIKIDATA_UNITS',
'bangs_loader', 'ahmia_blacklist_loader']
data_dir = Path(__file__).parent data_dir = Path(__file__).parent
@ -24,3 +25,5 @@ def ahmia_blacklist_loader():
ENGINES_LANGUAGES = load('engines_languages.json') ENGINES_LANGUAGES = load('engines_languages.json')
CURRENCIES = load('currencies.json') CURRENCIES = load('currencies.json')
USER_AGENTS = load('useragents.json') USER_AGENTS = load('useragents.json')
EXTERNAL_URLS = load('external_urls.json')
WIKIDATA_UNITS = load('wikidata_units.json')

@ -0,0 +1,156 @@
{
"facebook_profile": {
"category_name": "Facebook",
"url_name": "Facebook profile",
"urls": {
"default": "https://facebook.com/$1"
}
},
"youtube_channel": {
"category_name": "YouTube",
"url_name": "YouTube channel",
"urls": {
"default": "https://www.youtube.com/channel/$1"
}
},
"youtube_video": {
"category_name": "YouTube",
"url_name": "YouTube video",
"urls": {
"default": "https://www.youtube.com/watch?v=$1"
}
},
"twitter_profile": {
"category_name": "Twitter",
"url_name": "Twitter profile",
"urls": {
"default": "https://twitter.com/$1"
}
},
"instagram_profile": {
"category_name": "Instagram",
"url_name": "Instagram profile",
"urls": {
"default": "https://www.instagram.com/$1"
}
},
"imdb_title": {
"category_name": "IMDB",
"url_name": "IMDB title",
"urls": {
"default": "https://www.imdb.com/title/$1"
}
},
"imdb_name": {
"category_name": "IMDB",
"url_name": "IMDB name",
"urls": {
"default": "https://www.imdb.com/name/$1"
}
},
"imdb_character": {
"category_name": "IMDB",
"url_name": "IMDB character",
"urls": {
"default": "https://www.imdb.com/character/$1"
}
},
"imdb_company": {
"category_name": "IMDB",
"url_name": "IMDB company",
"urls": {
"default": "https://www.imdb.com/company/$1"
}
},
"imdb_event": {
"category_name": "IMDB",
"url_name": "IMDB event",
"urls": {
"default": "https://www.imdb.com/event/$1"
}
},
"rotten_tomatoes": {
"category_name": "Rotten tomatoes",
"url_name": "Rotten tomatoes title",
"urls": {
"default": "https://www.rottentomatoes.com/$1"
}
},
"spotify_artist_id": {
"category_name": "Spotify",
"url_name": "Spotify artist",
"urls": {
"default": "https://open.spotify.com/artist/$1"
}
},
"itunes_artist_id": {
"category_name": "iTunes",
"url_name": "iTunes artist",
"urls": {
"default": "https://music.apple.com/us/artist/$1"
}
},
"soundcloud_id": {
"category_name": "Soundcloud",
"url_name": "Soundcloud artist",
"urls": {
"default": "https://soundcloud.com/$1"
}
},
"netflix_id": {
"category_name": "Netflix",
"url_name": "Netflix movie",
"urls": {
"default": "https://www.netflix.com/watch/$1"
}
},
"github_profile": {
"category_name": "Github",
"url_name": "Github profile",
"urls": {
"default": "https://www.github.com/$1"
}
},
"musicbrainz_artist": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz artist",
"urls": {
"default": "http://musicbrainz.org/artist/$1"
}
},
"musicbrainz_work": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz work",
"urls": {
"default": "http://musicbrainz.org/work/$1"
}
},
"musicbrainz_release_group": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz release group",
"urls": {
"default": "http://musicbrainz.org/release-group/$1"
}
},
"musicbrainz_label": {
"category_name": "Musicbrainz",
"url_name": "Musicbrainz label",
"urls": {
"default": "http://musicbrainz.org/label/$1"
}
},
"wikimedia_image": {
"category_name": "Wikipedia",
"url_name": "Wikipedia image",
"urls": {
"default": "https://commons.wikimedia.org/wiki/Special:FilePath/$1?width=500&height=400"
}
},
"map": {
"category_name": "Map",
"url_name": "geo map",
"urls": {
"default": "https://www.openstreetmap.org/?lat=${latitude}&lon=${longitude}&zoom=${zoom}&layers=M"
}
}
}

File diff suppressed because it is too large Load Diff

@ -12,28 +12,53 @@ DuckDuckGo (definitions)
import json import json
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from re import compile
from searx import logger
from searx.data import WIKIDATA_UNITS
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
from searx.utils import extract_text, html_to_text, match_language from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
logger = logger.getChild('duckduckgo_definitions')
url = 'https://api.duckduckgo.com/'\ URL = 'https://api.duckduckgo.com/'\
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
http_regex = compile(r'^http:') WIKIDATA_PREFIX = [
'http://www.wikidata.org/entity/',
'https://www.wikidata.org/entity/'
]
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
def is_broken_text(text):
""" duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>"
The href URL is broken, the "Related website" may contains some HTML.
def result_to_text(url, text, htmlResult): The best solution seems to ignore these results.
"""
return text.startswith('http') and ' ' in text
def result_to_text(text, htmlResult):
# TODO : remove result ending with "Meaning" or "Category" # TODO : remove result ending with "Meaning" or "Category"
result = None
dom = html.fromstring(htmlResult) dom = html.fromstring(htmlResult)
a = dom.xpath('//a') a = dom.xpath('//a')
if len(a) >= 1: if len(a) >= 1:
return extract_text(a[0]) result = extract_text(a[0])
else: else:
return text result = text
if not is_broken_text(result):
return result
return None
def request(query, params): def request(query, params):
params['url'] = url.format(query=urlencode({'q': query})) params['url'] = URL.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases) language = match_language(params['language'], supported_languages, language_aliases)
language = language.split('-')[0] language = language.split('-')[0]
params['headers']['Accept-Language'] = language params['headers']['Accept-Language'] = language
@ -45,6 +70,14 @@ def response(resp):
search_res = json.loads(resp.text) search_res = json.loads(resp.text)
# search_res.get('Entity') possible values (not exhaustive) :
# * continent / country / department / location / waterfall
# * actor / musician / artist
# * book / performing art / film / television / media franchise / concert tour / playwright
# * prepared food
# * website / software / os / programming language / file format / software engineer
# * company
content = '' content = ''
heading = search_res.get('Heading', '') heading = search_res.get('Heading', '')
attributes = [] attributes = []
@ -55,7 +88,8 @@ def response(resp):
# add answer if there is one # add answer if there is one
answer = search_res.get('Answer', '') answer = search_res.get('Answer', '')
if answer: if answer:
if search_res.get('AnswerType', '') not in ['calc']: logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
if search_res.get('AnswerType') not in ['calc', 'ip']:
results.append({'answer': html_to_text(answer)}) results.append({'answer': html_to_text(answer)})
# add infobox # add infobox
@ -66,42 +100,36 @@ def response(resp):
content = content + search_res.get('Abstract', '') content = content + search_res.get('Abstract', '')
# image # image
image = search_res.get('Image', '') image = search_res.get('Image')
image = None if image == '' else image image = None if image == '' else image
# attributes
if 'Infobox' in search_res:
infobox = search_res.get('Infobox', None)
if 'content' in infobox:
for info in infobox.get('content'):
attributes.append({'label': info.get('label'),
'value': info.get('value')})
# urls # urls
# Official website, Wikipedia page
for ddg_result in search_res.get('Results', []): for ddg_result in search_res.get('Results', []):
if 'FirstURL' in ddg_result: firstURL = ddg_result.get('FirstURL')
firstURL = ddg_result.get('FirstURL', '') text = ddg_result.get('Text')
text = ddg_result.get('Text', '') if firstURL is not None and text is not None:
urls.append({'title': text, 'url': firstURL}) urls.append({'title': text, 'url': firstURL})
results.append({'title': heading, 'url': firstURL}) results.append({'title': heading, 'url': firstURL})
# related topics # related topics
for ddg_result in search_res.get('RelatedTopics', []): for ddg_result in search_res.get('RelatedTopics', []):
if 'FirstURL' in ddg_result: if 'FirstURL' in ddg_result:
suggestion = result_to_text(ddg_result.get('FirstURL', None), firstURL = ddg_result.get('FirstURL')
ddg_result.get('Text', None), text = ddg_result.get('Text')
ddg_result.get('Result', None)) if not is_broken_text(text):
if suggestion != heading: suggestion = result_to_text(text,
results.append({'suggestion': suggestion}) ddg_result.get('Result'))
if suggestion != heading and suggestion is not None:
results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result: elif 'Topics' in ddg_result:
suggestions = [] suggestions = []
relatedTopics.append({'name': ddg_result.get('Name', ''), relatedTopics.append({'name': ddg_result.get('Name', ''),
'suggestions': suggestions}) 'suggestions': suggestions})
for topic_result in ddg_result.get('Topics', []): for topic_result in ddg_result.get('Topics', []):
suggestion = result_to_text(topic_result.get('FirstURL', None), suggestion = result_to_text(topic_result.get('Text'),
topic_result.get('Text', None), topic_result.get('Result'))
topic_result.get('Result', None)) if suggestion != heading and suggestion is not None:
if suggestion != heading:
suggestions.append(suggestion) suggestions.append(suggestion)
# abstract # abstract
@ -110,7 +138,10 @@ def response(resp):
# add as result ? problem always in english # add as result ? problem always in english
infobox_id = abstractURL infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'), urls.append({'title': search_res.get('AbstractSource'),
'url': abstractURL}) 'url': abstractURL,
'official': True})
results.append({'url': abstractURL,
'title': heading})
# definition # definition
definitionURL = search_res.get('DefinitionURL', '') definitionURL = search_res.get('DefinitionURL', '')
@ -118,53 +149,107 @@ def response(resp):
# add as result ? as answer ? problem always in english # add as result ? as answer ? problem always in english
infobox_id = definitionURL infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'), urls.append({'title': search_res.get('DefinitionSource'),
'url': definitionURL}) 'url': definitionURL})
# to merge with wikidata's infobox # to merge with wikidata's infobox
if infobox_id: if infobox_id:
infobox_id = http_regex.sub('https:', infobox_id) infobox_id = replace_http_by_https(infobox_id)
# entity # attributes
entity = search_res.get('Entity', None) # some will be converted to urls
# TODO continent / country / department / location / waterfall / if 'Infobox' in search_res:
# mountain range : infobox = search_res.get('Infobox')
# link to map search, get weather, near by locations if 'content' in infobox:
# TODO musician : link to music search osm_zoom = 17
# TODO concert tour : ?? coordinates = None
# TODO film / actor / television / media franchise : for info in infobox.get('content'):
# links to IMDB / rottentomatoes (or scrap result) data_type = info.get('data_type')
# TODO music : link tu musicbrainz / last.fm data_label = info.get('label')
# TODO book : ?? data_value = info.get('value')
# TODO artist / playwright : ??
# TODO compagny : ?? # Workaround: ddg may return a double quote
# TODO software / os : ?? if data_value == '""':
# TODO software engineer : ?? continue
# TODO prepared food : ??
# TODO website : ?? # Is it an external URL ?
# TODO performing art : ?? # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
# TODO prepared food : ?? # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
# TODO programming language : ?? # * netflix_id
# TODO file format : ?? external_url = get_external_url(data_type, data_value)
if external_url is not None:
urls.append({'title': data_label,
'url': external_url})
elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
# ignore instance: Wikidata value from "Instance Of" (Qxxxx)
# ignore wiki_maps_trigger: reference to a javascript
# ignore google_play_artist_id: service shutdown
pass
elif data_type == 'string' and data_label == 'Website':
# There is already an URL for the website
pass
elif data_type == 'area':
attributes.append({'label': data_label,
'value': area_to_str(data_value),
'entity': 'P2046'})
osm_zoom = area_to_osm_zoom(data_value.get('amount'))
elif data_type == 'coordinates':
if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
# coordinate on Earth
# get the zoom information from the area
coordinates = info
else:
# coordinate NOT on Earth
attributes.append({'label': data_label,
'value': data_value,
'entity': 'P625'})
elif data_type == 'string':
attributes.append({'label': data_label,
'value': data_value})
if coordinates:
data_label = coordinates.get('label')
data_value = coordinates.get('value')
latitude = data_value.get('latitude')
longitude = data_value.get('longitude')
url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
urls.append({'title': 'OpenStreetMap',
'url': url,
'entity': 'P625'})
if len(heading) > 0: if len(heading) > 0:
# TODO get infobox.meta.value where .label='article_title' # TODO get infobox.meta.value where .label='article_title'
if image is None and len(attributes) == 0 and len(urls) == 1 and\ if image is None and len(attributes) == 0 and len(urls) == 1 and\
len(relatedTopics) == 0 and len(content) == 0: len(relatedTopics) == 0 and len(content) == 0:
results.append({ results.append({'url': urls[0]['url'],
'url': urls[0]['url'], 'title': heading,
'title': heading, 'content': content})
'content': content
})
else: else:
results.append({ results.append({'infobox': heading,
'infobox': heading, 'id': infobox_id,
'id': infobox_id, 'content': content,
'entity': entity, 'img_src': image,
'content': content, 'attributes': attributes,
'img_src': image, 'urls': urls,
'attributes': attributes, 'relatedTopics': relatedTopics})
'urls': urls,
'relatedTopics': relatedTopics
})
return results return results
def unit_to_str(unit):
    """Resolve a Wikidata entity URL to its unit symbol.

    When *unit* begins with one of the known Wikidata entity prefixes, the
    trailing entity id (Qxxxx) is looked up in WIKIDATA_UNITS.  The original
    *unit* string is returned unchanged when no prefix matches or the entity
    id is unknown.
    """
    matching_prefixes = [p for p in WIKIDATA_PREFIX if unit.startswith(p)]
    if not matching_prefixes:
        return unit
    entity_id = unit[len(matching_prefixes[0]):]
    return WIKIDATA_UNITS.get(entity_id, unit)
def area_to_str(area):
    """Render an area dict such as
    ``{'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``
    as a human readable ``"<amount> <unit>"`` string.

    Falls back to the raw amount/unit values when the unit cannot be
    resolved or the amount is not a parseable number.
    """
    raw_amount = area.get('amount')
    unit = unit_to_str(area.get('unit'))
    if unit is not None:
        try:
            amount = float(raw_amount)
        except ValueError:
            amount = None
        if amount is not None:
            return '{} {}'.format(amount, unit)
    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))

File diff suppressed because it is too large Load Diff

@ -0,0 +1,77 @@
import math
from searx.data import EXTERNAL_URLS
# Map the two-letter prefix of an IMDb identifier to a url_id declared in
# data/external_urls.json.
IMDB_PREFIX_TO_URL_ID = {
    'tt': 'imdb_title',
    # IMDb name identifiers use the "nm" prefix (e.g. "nm0000123");
    # the previous "mn" entry never matched real ids.
    'nm': 'imdb_name',
    # kept for backward compatibility with any stored "mn" values
    'mn': 'imdb_name',
    'ch': 'imdb_character',
    'co': 'imdb_company',
    'ev': 'imdb_event'
}


def get_imdb_url_id(imdb_item_id):
    """Return the url_id matching *imdb_item_id*'s two-letter prefix.

    Returns None when the prefix is unknown.
    """
    id_prefix = imdb_item_id[:2]
    return IMDB_PREFIX_TO_URL_ID.get(id_prefix)
def get_external_url(url_id, item_id, alternative="default"):
    """Return an external URL, or None if *url_id* is not found.

    *url_id* can take a value from data/external_urls.json.
    The "imdb_id" value is automatically converted according to the
    *item_id* value.

    If *item_id* is None, the raw URL template containing the ``$1``
    placeholder is returned.
    """
    if url_id == 'imdb_id' and item_id is not None:
        url_id = get_imdb_url_id(item_id)

    url_description = EXTERNAL_URLS.get(url_id)
    if not url_description:
        return None

    url_template = url_description["urls"].get(alternative)
    if url_template is None:
        return None

    if item_id is None:
        return url_template
    return url_template.replace('$1', item_id)
def get_earth_coordinates_url(latitude, longitude, osm_zoom, alternative='default'):
    """Return a map URL (OpenStreetMap by default) centered on
    *latitude*/*longitude* at zoom level *osm_zoom*."""
    template = get_external_url('map', None, alternative)
    substitutions = {
        '${latitude}': str(latitude),
        '${longitude}': str(longitude),
        '${zoom}': str(osm_zoom),
    }
    for placeholder, value in substitutions.items():
        template = template.replace(placeholder, value)
    return template
def area_to_osm_zoom(area):
    """Convert an area in km² into an OSM zoom. Less reliable if the shape is not round.

    logarithm regression using these data:
    * 9596961 -> 4 (China)
    * 3287263 -> 5 (India)
    * 643801 -> 6 (France)
    * 6028 -> 9
    * 1214 -> 10
    * 891 -> 12
    * 12 -> 13

    In WolframAlpha:
    >>> log fit {9596961,15},{3287263, 14},{643801,13},{6028,10},{1214,9},{891,7},{12,6}

    with 15 = 19-4 (China); 14 = 19-5 (India) and so on

    Args:
        area (int, float, str): area in km²

    Returns:
        int: OSM zoom, or 19 if *area* is not a number
    """
    try:
        amount = float(area)
        return max(0, min(19, round(19 - 0.688297 * math.log(226.878 * amount))))
    except (ValueError, TypeError):
        # ValueError: non-numeric string or non-positive log argument;
        # TypeError: area is None (float(None) does not raise ValueError)
        return 19

@ -20,6 +20,18 @@ def result_content_len(content):
def compare_urls(url_a, url_b): def compare_urls(url_a, url_b):
"""Lazy compare between two URL.
"www.example.com" and "example.com" are equals.
"www.example.com/path/" and "www.example.com/path" are equals.
"https://www.example.com/" and "http://www.example.com/" are equals.
Args:
url_a (ParseResult): first URL
url_b (ParseResult): second URL
Returns:
bool: True if url_a and url_b are equals
"""
# ignore www. in comparison # ignore www. in comparison
if url_a.netloc.startswith('www.'): if url_a.netloc.startswith('www.'):
host_a = url_a.netloc.replace('www.', '', 1) host_a = url_a.netloc.replace('www.', '', 1)
@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
for url2 in infobox2.get('urls', []): for url2 in infobox2.get('urls', []):
unique_url = True unique_url = True
parsed_url2 = urlparse(url2.get('url', '')) parsed_url2 = urlparse(url2.get('url', ''))
entity_url2 = url2.get('entity')
for url1 in urls1: for url1 in urls1:
if compare_urls(urlparse(url1.get('url', '')), parsed_url2): if (entity_url2 is not None and url1.get('entity') == entity_url2)\
or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
unique_url = False unique_url = False
break break
if unique_url: if unique_url:
@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['img_src'] = img2 infobox1['img_src'] = img2
if 'attributes' in infobox2: if 'attributes' in infobox2:
attributes1 = infobox1.get('attributes', None) attributes1 = infobox1.get('attributes')
if attributes1 is None: if attributes1 is None:
attributes1 = [] infobox1['attributes'] = attributes1 = []
infobox1['attributes'] = attributes1
attributeSet = set() attributeSet = set()
for attribute in infobox1.get('attributes', []): for attribute in attributes1:
if attribute.get('label', None) not in attributeSet: label = attribute.get('label')
attributeSet.add(attribute.get('label', None)) if label not in attributeSet:
attributeSet.add(label)
entity = attribute.get('entity')
if entity not in attributeSet:
attributeSet.add(entity)
for attribute in infobox2.get('attributes', []): for attribute in infobox2.get('attributes', []):
if attribute.get('label', None) not in attributeSet: if attribute.get('label') not in attributeSet\
and attribute.get('entity') not in attributeSet:
attributes1.append(attribute) attributes1.append(attribute)
if 'content' in infobox2: if 'content' in infobox2:

@ -25,11 +25,7 @@
{%- if attribute.image -%} {%- if attribute.image -%}
<td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td>
{%- else -%} {%- else -%}
{% if attribute.label == 'Instance of' %} <td><bdi>{{ attribute.value }}</bdi></td>
<td><bdi><a href="https://wikidata.org/wiki/{{ attribute.value.id }}">{{ attribute.value.id }}</a></bdi></td>
{% else %}
<td><bdi>{{ attribute.value }}</bdi></td>
{%- endif -%}
{%- endif -%} {%- endif -%}
</tr> </tr>
{% endfor -%} {% endfor -%}

@ -1,7 +1,6 @@
<aside class="infobox"> <aside class="infobox">
<h2><bdi>{{ infobox.infobox }}</bdi></h2> <h2><bdi>{{ infobox.infobox }}</bdi></h2>
{% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %} {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %}
<p><bdi>{{ infobox.entity }}</bdi></p>
<p><bdi>{{ infobox.content | safe }}</bdi></p> <p><bdi>{{ infobox.content | safe }}</bdi></p>
{% if infobox.attributes %} {% if infobox.attributes %}
<div class="attributes"> <div class="attributes">

@ -481,6 +481,16 @@ def ecma_unescape(s):
return s return s
def get_string_replaces_function(replaces):
    """Return a function replacing, in one regex pass, every key of *replaces*
    found in its argument by the associated value.

    Args:
        replaces (dict): maps substrings to their replacement strings

    Returns:
        callable: ``f(text)`` -> *text* with all substitutions applied
    """
    if not replaces:
        # an empty alternation pattern would match at every position and the
        # substitution lambda would then KeyError — just return the identity
        return lambda text: text

    rep = {re.escape(k): v for k, v in replaces.items()}
    pattern = re.compile("|".join(rep.keys()))

    def f(text):
        return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    return f
def get_engine_from_settings(name): def get_engine_from_settings(name):
"""Return engine configuration from settings.yml of a given engine name""" """Return engine configuration from settings.yml of a given engine name"""

@ -0,0 +1,47 @@
#!/usr/bin/env python
import json
import collections
# set path
from sys import path
from os.path import realpath, dirname, join
path.append(realpath(dirname(realpath(__file__)) + '/../'))
from searx import searx_dir
from searx.engines.wikidata import send_wikidata_query
SARQL_REQUEST = """
SELECT DISTINCT ?item ?symbol ?P2370 ?P2370Unit ?P2442 ?P2442Unit
WHERE
{
?item wdt:P31/wdt:P279 wd:Q47574.
?item wdt:P5061 ?symbol.
FILTER(LANG(?symbol) = "en").
}
ORDER BY ?item
"""
def get_data():
    """Fetch the unit symbols from Wikidata.

    Returns:
        collections.OrderedDict: entity id (Qxxxx) -> unit symbol, ordered by
        entity id so repeated fetches keep the generated file unchanged.
        None when the Wikidata query returned no result.
    """
    def get_key(unit):
        return unit['item']['value'].replace('http://www.wikidata.org/entity/', '')

    def get_value(unit):
        return unit['symbol']['value']

    result = send_wikidata_query(SARQL_REQUEST)
    if result is not None:
        # sort the units by entity name so different fetches keep the file
        # unchanged.  NOTE: the previous `list(...).sort(key=get_key)` sorted
        # a throwaway copy and had no effect.
        bindings = sorted(result['results']['bindings'], key=get_key)
        return collections.OrderedDict([(get_key(unit), get_value(unit)) for unit in bindings])
def get_wikidata_units_filename():
    """Return the path of searx/data/wikidata_units.json."""
    return join(searx_dir, "data", "wikidata_units.json")
# Regenerate searx/data/wikidata_units.json from the Wikidata query result.
# NOTE(review): when get_data() returns None (query failed) this writes the
# JSON literal "null" over the existing file — presumably acceptable for a
# maintainer-run script, but verify before automating.
with open(get_wikidata_units_filename(), 'w') as f:
    json.dump(get_data(), f, indent=4, ensure_ascii=False)
Loading…
Cancel
Save