Merge pull request #354 from asciimoo/bing_news

Bing news based on RSS output
pull/1/head
Adam Tauber 9 years ago
commit aa1b8fa79b

@ -6,18 +6,17 @@
max. 5000 query/month max. 5000 query/month
@using-api no (because of query limit) @using-api no (because of query limit)
@results HTML (using search portal) @results RSS (using search portal)
@stable no (HTML can change) @stable yes (except perhaps for the images)
@parse url, title, content, publishedDate @parse url, title, content, publishedDate, thumbnail
""" """
from urllib import urlencode from urllib import urlencode
from cgi import escape from urlparse import urlparse, parse_qsl
from lxml import html from datetime import datetime
from datetime import datetime, timedelta
from dateutil import parser from dateutil import parser
import re from lxml import etree
from searx.engines.xpath import extract_text from searx.utils import list_get
# engine dependent config # engine dependent config
categories = ['news'] categories = ['news']
@ -26,7 +25,25 @@ language_support = True
# search-url # search-url
base_url = 'https://www.bing.com/' base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}' search_string = 'news/search?{query}&first={offset}&format=RSS'
# remove click
def url_cleanup(url_string):
    """Strip Bing's click-tracking redirect from a result URL.

    URLs pointing at ``www.bing.com/news/apiclick.aspx`` carry the real
    article address in their ``url`` query parameter; that address is
    returned. Any other URL is returned unchanged.

    If the redirect URL has no (or an empty) ``url`` parameter, the original
    ``url_string`` is returned instead of ``None`` so callers always get a
    usable string back.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        # parse_qsl drops blank values, so a missing and an empty target
        # both end up here as "no url" and fall back to the input
        return query.get('url') or url_string
    return url_string
# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
def image_url_cleanup(url_string):
    """Rewrite a bing4.com thumbnail URL to its https://www.bing.com form.

    A URL whose host ends with ``bing4.com`` and whose path is ``/th`` is
    rebuilt as ``https://www.bing.com/th?id=<id>``, keeping only the ``id``
    query parameter. Any other URL is returned unchanged.

    A matching URL without an ``id`` parameter is also returned unchanged;
    previously this case raised ``TypeError`` while concatenating ``None``.
    """
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        image_id = query.get('id')
        if image_id is not None:
            return "https://www.bing.com/th?id=" + image_id
    return url_string
# do search-request # do search-request
@ -42,8 +59,6 @@ def request(query, params):
query=urlencode({'q': query, 'setmkt': language}), query=urlencode({'q': query, 'setmkt': language}),
offset=offset) offset=offset)
params['cookies']['_FP'] = "ui=en-US"
params['url'] = base_url + search_path params['url'] = base_url + search_path
return params return params
@ -53,50 +68,44 @@ def request(query, params):
def response(resp):
    """Turn a Bing News RSS response into a list of result dicts.

    Every ``<item>`` found under ``./channel`` yields one dict with the keys
    ``url``, ``title``, ``publishedDate`` and ``content``. When the item has
    a ``News:Image`` element, the dict also gets ``thumbnail`` and
    ``template`` set to ``'videos.html'``.

    ``resp.content`` is parsed with ``etree.fromstring``; a body that is not
    well-formed XML makes that call raise, and the error is left to
    propagate to the caller.
    """
    results = []

    rss = etree.fromstring(resp.content)

    # XPath rejects a None (default-namespace) prefix, so keep only the
    # prefixed entries of the root element's namespace map
    ns = dict((prefix, uri) for prefix, uri in rss.nsmap.items() if prefix)

    # parse results
    for item in rss.xpath('./channel/item'):
        # url / title / content
        link = list_get(item.xpath('./link/text()'), 0)
        url = url_cleanup(link) if link is not None else None
        if not url:
            # an item without a usable link cannot become a result
            continue
        title = list_get(item.xpath('./title/text()'), 0, url)
        content = list_get(item.xpath('./description/text()'), 0, '')

        # publishedDate: fall back to the current time when pubDate is
        # absent or cannot be parsed
        publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
        try:
            publishedDate = parser.parse(publishedDate, dayfirst=False)
        except (TypeError, ValueError):
            publishedDate = datetime.now()

        # thumbnail: only query the News: prefix when the document declares
        # it, otherwise the XPath evaluation itself would fail
        thumbnail = None
        if 'News' in ns:
            thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
        if thumbnail is not None:
            thumbnail = image_url_cleanup(thumbnail)

        # append result
        result = {'url': url,
                  'title': title,
                  'publishedDate': publishedDate,
                  'content': content}
        if thumbnail is not None:
            result['template'] = 'videos.html'
            result['thumbnail'] = thumbnail
        results.append(result)

    # return results
    return results

@ -11,6 +11,7 @@
from urllib import quote_plus from urllib import quote_plus
from lxml import html from lxml import html
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
from searx.utils import list_get
# engine dependent config # engine dependent config
categories = ['videos', 'music'] categories = ['videos', 'music']
@ -34,14 +35,6 @@ title_xpath = './/div[@class="yt-lockup-content"]/h3/a'
content_xpath = './/div[@class="yt-lockup-content"]/div[@class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2"]' content_xpath = './/div[@class="yt-lockup-content"]/div[@class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2"]'
# get element in list or default value
def list_get(a_list, index, default=None):
    """Return ``a_list[index]``, or ``default`` when the index is out of range.

    Negative indexes follow normal sequence semantics. An out-of-range
    negative index (for example ``-1`` on an empty list) yields ``default``
    as well; the former ``len(a_list) > index`` check let those through and
    raised ``IndexError``.
    """
    try:
        return a_list[index]
    except IndexError:
        return default
# returns extract_text on the first result selected by the xpath or None # returns extract_text on the first result selected by the xpath or None
def extract_text_from_dom(result, xpath): def extract_text_from_dom(result, xpath):
r = result.xpath(xpath) r = result.xpath(xpath)

@ -2,6 +2,7 @@ from collections import defaultdict
import mock import mock
from searx.engines import bing_news from searx.engines import bing_news
from searx.testing import SearxTestCase from searx.testing import SearxTestCase
import lxml
class TestBingNewsEngine(SearxTestCase): class TestBingNewsEngine(SearxTestCase):
@ -16,14 +17,10 @@ class TestBingNewsEngine(SearxTestCase):
self.assertIn(query, params['url']) self.assertIn(query, params['url'])
self.assertIn('bing.com', params['url']) self.assertIn('bing.com', params['url'])
self.assertIn('fr', params['url']) self.assertIn('fr', params['url'])
self.assertIn('_FP', params['cookies'])
self.assertIn('en', params['cookies']['_FP'])
dicto['language'] = 'all' dicto['language'] = 'all'
params = bing_news.request(query, dicto) params = bing_news.request(query, dicto)
self.assertIn('en', params['url']) self.assertIn('en', params['url'])
self.assertIn('_FP', params['cookies'])
self.assertIn('en', params['cookies']['_FP'])
def test_response(self): def test_response(self):
self.assertRaises(AttributeError, bing_news.response, None) self.assertRaises(AttributeError, bing_news.response, None)
@ -37,200 +34,105 @@ class TestBingNewsEngine(SearxTestCase):
response = mock.Mock(content='<html></html>') response = mock.Mock(content='<html></html>')
self.assertEqual(bing_news.response(response), []) self.assertEqual(bing_news.response(response), [])
html = """ html = """<?xml version="1.0" encoding="utf-8" ?>
<div class="sn_r"> <rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
<div class="newstitle"> <channel>
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1"> <title>python - Bing News</title>
Title <link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> <description>Search results</description>
</div> <image>
<div class="sn_img"> <url>http://10.53.64.9/rsslogo.gif</url>
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1"> <title>test</title>
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" /> <link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> </image>
</div> <copyright>Copyright</copyright>
<div class="sn_txt"> <item>
<div class="sn_oi"> <title>Title</title>
<span class="sn_snip">Article Content</span> <link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&amp;aid=&amp;tid=c237eccc50bd4758b106a5e3c94fce09&amp;url=http%3a%2f%2furl.of.article%2f&amp;c=xxxxxxxxx&amp;mkt=en-us</link>
<div class="sn_ST"> <description>Article Content</description>
<cite class="sn_src">metronews.fr</cite> <pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
&nbsp;&#0183;&#32; <News:Source>Infoworld</News:Source>
<span class="sn_tm">44 minutes ago</span> <News:Image>http://a1.bing4.com/th?id=ON.13371337133713371337133713371337&amp;pid=News</News:Image>
</div> <News:ImageSize>w={0}&amp;h={1}&amp;c=7</News:ImageSize>
</div> <News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
</div> <News:ImageMaxWidth>620</News:ImageMaxWidth>
</div> <News:ImageMaxHeight>413</News:ImageMaxHeight>
""" </item>
<item>
<title>Another Title</title>
<link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&amp;aid=&amp;tid=c237eccc50bd4758b106a5e3c94fce09&amp;url=http%3a%2f%2fanother.url.of.article%2f&amp;c=xxxxxxxxx&amp;mkt=en-us</link>
<description>Another Article Content</description>
<pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
</item>
</channel>
</rss>""" # noqa
response = mock.Mock(content=html) response = mock.Mock(content=html)
results = bing_news.response(response) results = bing_news.response(response)
self.assertEqual(type(results), list) self.assertEqual(type(results), list)
self.assertEqual(len(results), 1) self.assertEqual(len(results), 2)
self.assertEqual(results[0]['title'], 'Title') self.assertEqual(results[0]['title'], 'Title')
self.assertEqual(results[0]['url'], 'http://url.of.article/') self.assertEqual(results[0]['url'], 'http://url.of.article/')
self.assertEqual(results[0]['content'], 'Article Content') self.assertEqual(results[0]['content'], 'Article Content')
self.assertEqual(results[0]['thumbnail'], 'https://www.bing.com/th?id=ON.13371337133713371337133713371337')
self.assertEqual(results[1]['title'], 'Another Title')
self.assertEqual(results[1]['url'], 'http://another.url.of.article/')
self.assertEqual(results[1]['content'], 'Another Article Content')
self.assertNotIn('thumbnail', results[1])
html = """ html = """<?xml version="1.0" encoding="utf-8" ?>
<div class="sn_r"> <rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
<div class="newstitle"> <channel>
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1"> <title>python - Bing News</title>
Title <link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> <description>Search results</description>
</div> <image>
<div class="sn_img"> <url>http://10.53.64.9/rsslogo.gif</url>
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1"> <title>test</title>
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" /> <link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</a> </image>
</div> <copyright>Copyright</copyright>
<div class="sn_txt"> <item>
<div class="sn_oi"> <title>Title</title>
<span class="sn_snip">Article Content</span> <link>http://another.url.of.article/</link>
<div class="sn_ST"> <description>Article Content</description>
<cite class="sn_src">metronews.fr</cite> <pubDate>garbage</pubDate>
&nbsp;&#0183;&#32; <News:Source>Infoworld</News:Source>
<span class="sn_tm">44 minutes ago</span> <News:Image>http://another.bing.com/image</News:Image>
</div> <News:ImageSize>w={0}&amp;h={1}&amp;c=7</News:ImageSize>
</div> <News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
</div> <News:ImageMaxWidth>620</News:ImageMaxWidth>
</div> <News:ImageMaxHeight>413</News:ImageMaxHeight>
<div class="sn_r"> </item>
<div class="newstitle"> </channel>
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1"> </rss>""" # noqa
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">3 hours, 44 minutes ago</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">44 hours ago</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">2 days ago</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">27/01/2015</span>
</div>
</div>
</div>
</div>
<div class="sn_r">
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">Il y a 3 heures</span>
</div>
</div>
</div>
</div>
"""
response = mock.Mock(content=html) response = mock.Mock(content=html)
results = bing_news.response(response) results = bing_news.response(response)
self.assertEqual(type(results), list) self.assertEqual(type(results), list)
self.assertEqual(len(results), 6) self.assertEqual(len(results), 1)
self.assertEqual(results[0]['title'], 'Title')
self.assertEqual(results[0]['url'], 'http://another.url.of.article/')
self.assertEqual(results[0]['content'], 'Article Content')
self.assertEqual(results[0]['thumbnail'], 'http://another.bing.com/image')
html = """<?xml version="1.0" encoding="utf-8" ?>
<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
<channel>
<title>python - Bing News</title>
<link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
<description>Search results</description>
<image>
<url>http://10.53.64.9/rsslogo.gif</url>
<title>test</title>
<link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
</image>
</channel>
</rss>""" # noqa
html = """
<div class="newstitle">
<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
Title
</a>
</div>
<div class="sn_img">
<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
</a>
</div>
<div class="sn_txt">
<div class="sn_oi">
<span class="sn_snip">Article Content</span>
<div class="sn_ST">
<cite class="sn_src">metronews.fr</cite>
&nbsp;&#0183;&#32;
<span class="sn_tm">44 minutes ago</span>
</div>
</div>
</div>
"""
response = mock.Mock(content=html) response = mock.Mock(content=html)
results = bing_news.response(response) results = bing_news.response(response)
self.assertEqual(type(results), list) self.assertEqual(type(results), list)
self.assertEqual(len(results), 0) self.assertEqual(len(results), 0)
html = """<?xml version="1.0" encoding="utf-8" ?>gabarge"""
response = mock.Mock(content=html)
self.assertRaises(lxml.etree.XMLSyntaxError, bing_news.response, response)

@ -228,6 +228,14 @@ def prettify_url(url):
return url return url
# get element in list or default value
def list_get(a_list, index, default=None):
    """Return ``a_list[index]``, or ``default`` when the index is out of range.

    Negative indexes follow normal sequence semantics. An out-of-range
    negative index (for example ``-1`` on an empty list) yields ``default``
    as well; the former ``len(a_list) > index`` check let those through and
    raised ``IndexError``.
    """
    try:
        return a_list[index]
    except IndexError:
        return default
def get_blocked_engines(engines, cookies): def get_blocked_engines(engines, cookies):
if 'blocked_engines' not in cookies: if 'blocked_engines' not in cookies:
return [(engine_name, category) for engine_name in engines return [(engine_name, category) for engine_name in engines

Loading…
Cancel
Save