mirror of https://github.com/searxng/searxng
commit
f46057feb2
@ -0,0 +1,114 @@
|
||||
"""
|
||||
Wikipedia (Web)
|
||||
|
||||
@website https://{language}.wikipedia.org
|
||||
@provide-api yes
|
||||
|
||||
@using-api yes
|
||||
@results JSON
|
||||
@stable yes
|
||||
@parse url, infobox
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib import urlencode, quote
|
||||
|
||||
# search-url
# host part of every request; {language} is substituted by url_lang()
base_url = 'https://{language}.wikipedia.org/'
# api path and fixed arguments; {query} receives the url-encoded
# 'titles=...' argument built in request()
search_postfix = (
    'w/api.php?'
    'action=query'
    '&format=json'
    '&{query}'
    '&prop=extracts|pageimages'
    '&exintro'
    '&explaintext'
    '&pithumbsize=300'
    '&redirects'
)
|
||||
|
||||
|
||||
# set language in base_url
def url_lang(lang):
    """Return ``base_url`` with its ``{language}`` placeholder filled in.

    ``lang`` is either ``'all'``, which is mapped to ``'en'``, or a code
    such as ``'fr_FR'``, of which only the part before the first
    underscore is used.
    """
    language = 'en' if lang == 'all' else lang.split('_')[0]
    return base_url.format(language=language)
|
||||
|
||||
|
||||
# do search-request
def request(query, params):
    """Store the Wikipedia API url for ``query`` in ``params['url']``.

    The query is sent as the ``titles`` argument.  When the query is
    entirely lowercase, its title-cased variant is appended (separated
    by ``'|'``) so that both spellings are looked up in one request.
    The language is taken from ``params['language']``.  Returns the
    same ``params`` mapping that was passed in.
    """
    titles = query
    if titles.islower():
        titles = titles + '|' + titles.title()

    encoded_titles = urlencode({'titles': titles})
    host = url_lang(params['language'])
    params['url'] = host + search_postfix.format(query=encoded_titles)

    return params
|
||||
|
||||
|
||||
# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    """Return the first paragraph of ``content`` that looks like an intro.

    ``content`` is split on newlines and the paragraphs are examined in
    order.  A paragraph is accepted when either

    * it is at least 200 characters long, or
    * the title occurs (case-insensitively) starting within its first
      35 characters and, additionally, an ``image`` is available or the
      paragraph is at least 150 characters long.

    :param content: plain-text extract of the article; may be ``None``
        or empty, in which case ``None`` is returned
    :param title: article title (a string)
    :param image: thumbnail url, or a falsy value when there is none
    :returns: the accepted paragraph, or ``None`` when the first four
        paragraphs are all rejected or no paragraph qualifies
    """
    # a page without extract has nothing to offer; avoid calling
    # .split() on None
    if not content:
        return None

    # loop invariants: lowercase title and the end of the search window
    lowered_title = title.lower()
    search_end = len(title) + 35

    for attempt, paragraph in enumerate(content.split('\n'), 1):
        length = len(paragraph)
        has_title = paragraph.lower().find(lowered_title, 0, search_end) >= 0

        if length >= 200 or (has_title and (image or length >= 150)):
            return paragraph

        # give up once four paragraphs have been rejected
        if attempt > 3:
            return None

    return None
|
||||
|
||||
|
||||
# get response from search-request
def response(resp):
    """Turn the Wikipedia API reply into a link result plus an infobox.

    ``resp.content`` is parsed as JSON and the first page with a
    positive id is taken as the requested article.  An empty list is
    returned when

    * the reply has no ``query``/``pages`` section or it is empty,
    * no page has a positive id (e.g. the ``"-1"`` entry seen for a
      missing page),
    * the page lacks a title or an extract, or
    * ``extract_first_paragraph`` finds no usable summary.

    Otherwise two results are returned: a plain ``url``/``title`` entry
    and an ``infobox`` entry carrying the summary, the thumbnail url
    (``None`` if absent) and a link back to the article.  The language
    of the link comes from ``resp.search_params['language']``.
    """
    search_result = loads(resp.content)

    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    pages = search_result.get('query', {}).get('pages', {})
    page = None
    for article_id in pages:
        if int(article_id) > 0:
            page = pages[article_id]
            break

    # nothing usable: empty reply or only non-positive (missing) ids
    if page is None:
        return []

    title = page.get('title')
    extract = page.get('extract')
    # without title or extract neither a link nor a summary can be built
    if not title or not extract:
        return []

    thumbnail = page.get('thumbnail')
    image = thumbnail.get('source') if thumbnail else None

    summary = extract_first_paragraph(extract, title, image)
    if not summary:
        return []

    # link to wikipedia article
    # parentheses are not quoted to make infobox mergeable with wikidata's
    quoted_title = quote(title.replace(' ', '_').encode('utf8'))
    wikipedia_link = url_lang(resp.search_params['language']) \
        + 'wiki/' + quoted_title.replace('%28', '(').replace('%29', ')')

    return [{'url': wikipedia_link, 'title': title},
            {'infobox': title,
             'id': wikipedia_link,
             'content': summary,
             'img_src': image,
             'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]}]
|
@ -0,0 +1,160 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from collections import defaultdict
|
||||
import mock
|
||||
from searx.engines import wikipedia
|
||||
from searx.testing import SearxTestCase
|
||||
|
||||
|
||||
class TestWikipediaEngine(SearxTestCase):
    """Unit tests for the ``request`` and ``response`` hooks of the
    wikipedia engine, driven by canned API replies."""

    def test_request(self):
        """request() must build a url holding the query and the language."""
        query = 'test_query'
        # stand-in for the params mapping handed to the engine
        dicto = defaultdict(dict)
        dicto['language'] = 'fr_FR'
        params = wikipedia.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('test_query', params['url'])
        # a lowercase query is expected to carry its title-cased variant too
        self.assertIn('Test_Query', params['url'])
        # only the part before '_' of 'fr_FR' ends up in the host name
        self.assertIn('fr.wikipedia.org', params['url'])

        # a query that is not all-lowercase is sent unchanged
        query = 'Test_Query'
        params = wikipedia.request(query, dicto)
        self.assertIn('Test_Query', params['url'])
        self.assertNotIn('test_query', params['url'])

        # language 'all' is expected to fall back to the english wiki
        dicto['language'] = 'all'
        params = wikipedia.request(query, dicto)
        self.assertIn('en', params['url'])

    def test_response(self):
        """response() must cope with missing pages, disambiguation pages,
        pages without thumbnail and intros that do not repeat the title."""
        dicto = defaultdict(dict)
        dicto['language'] = 'fr'

        # objects without a ``content`` attribute cannot be parsed
        self.assertRaises(AttributeError, wikipedia.response, None)
        self.assertRaises(AttributeError, wikipedia.response, [])
        self.assertRaises(AttributeError, wikipedia.response, '')
        self.assertRaises(AttributeError, wikipedia.response, '[]')

        # page not found
        json = """
        {
            "batchcomplete": "",
            "query": {
                "normalized": [],
                "pages": {
                    "-1": {
                        "ns": 0,
                        "title": "",
                        "missing": ""
                    }
                }
            }
        }"""
        response = mock.Mock(content=json, search_params=dicto)
        self.assertEqual(wikipedia.response(response), [])

        # normal case
        json = """
        {
            "batchcomplete": "",
            "query": {
                "normalized": [],
                "pages": {
                    "12345": {
                        "pageid": 12345,
                        "ns": 0,
                        "title": "The Title",
                        "extract": "The Title is...",
                        "thumbnail": {
                            "source": "img_src.jpg"
                        },
                        "pageimage": "img_name.jpg"
                    }
                }
            }
        }"""
        response = mock.Mock(content=json, search_params=dicto)
        results = wikipedia.response(response)
        self.assertEqual(type(results), list)
        # one plain link result followed by one infobox
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['title'], u'The Title')
        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[0]['url'])
        self.assertEqual(results[1]['infobox'], u'The Title')
        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[1]['id'])
        self.assertIn('The Title is...', results[1]['content'])
        self.assertEqual(results[1]['img_src'], 'img_src.jpg')

        # disambiguation page
        # several short lines and no thumbnail: no summary is expected
        json = """
        {
            "batchcomplete": "",
            "query": {
                "normalized": [],
                "pages": {
                    "12345": {
                        "pageid": 12345,
                        "ns": 0,
                        "title": "The Title",
                        "extract": "The Title can be:\\nThe Title 1\\nThe Title 2\\nThe Title 3\\nThe Title 4......................................................................................................................................." """  # noqa
        json += """
                    }
                }
            }
        }"""
        response = mock.Mock(content=json, search_params=dicto)
        results = wikipedia.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 0)

        # no image
        # the padded extract is expected to be accepted without a thumbnail
        json = """
        {
            "batchcomplete": "",
            "query": {
                "normalized": [],
                "pages": {
                    "12345": {
                        "pageid": 12345,
                        "ns": 0,
                        "title": "The Title",
                        "extract": "The Title is......................................................................................................................................................................................." """  # noqa
        json += """
                    }
                }
            }
        }"""
        response = mock.Mock(content=json, search_params=dicto)
        results = wikipedia.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertIn('The Title is...', results[1]['content'])
        self.assertEqual(results[1]['img_src'], None)

        # title not in first paragraph
        # the first paragraph spells the name differently from the page
        # title but is still expected to be used as the summary
        json = u"""
        {
            "batchcomplete": "",
            "query": {
                "normalized": [],
                "pages": {
                    "12345": {
                        "pageid": 12345,
                        "ns": 0,
                        "title": "披頭四樂隊",
                        "extract": "披头士乐队....................................................................................................................................................................................................\\n披頭四樂隊...", """  # noqa
        json += """
                        "thumbnail": {
                            "source": "img_src.jpg"
                        },
                        "pageimage": "img_name.jpg"
                    }
                }
            }
        }"""
        response = mock.Mock(content=json, search_params=dicto)
        results = wikipedia.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 2)
        self.assertEqual(results[1]['infobox'], u'披頭四樂隊')
        self.assertIn(u'披头士乐队...', results[1]['content'])
|
Loading…
Reference in New Issue