[enh] fix content fetching, parse published date from description

pull/1/head
Thomas Pointhuber 9 years ago
parent a959977ab4
commit 4508c96667

@ -12,6 +12,8 @@
from lxml import html from lxml import html
from cgi import escape from cgi import escape
from dateutil import parser
from datetime import datetime, timedelta
import re import re
from searx.engines.xpath import extract_text from searx.engines.xpath import extract_text
@ -79,15 +81,44 @@ def response(resp):
title = escape(extract_text(link)) title = escape(extract_text(link))
if result.xpath('./p[@class="desc"]'): if result.xpath('./p[@class="desc clk"]'):
content = escape(extract_text(result.xpath('./p[@class="desc"]'))) content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else: else:
content = '' content = ''
# append result published_date = None
results.append({'url': url,
'title': title, # check if search result starts with something like: "2 Sep 2014 ... "
'content': content}) if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
published_date = parser.parse(date_string, dayfirst=True)
# fix content string
content = content[date_pos:]
# check if search result starts with something like: "5 days ago ... "
elif re.match("^[0-9]+ days? ago \.\.\. ", content):
date_pos = content.find('...')+4
date_string = content[0:date_pos-5]
# calculate datetime
published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
# fix content string
content = content[date_pos:]
if published_date:
# append result
results.append({'url': url,
'title': title,
'content': content,
'publishedDate': published_date})
else:
# append result
results.append({'url': url,
'title': title,
'content': content})
# return results # return results
return results return results

@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
</a> </a>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>
@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
</a> </a>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>
@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
<h3> <h3>
<span id='title_stars_2' name='title_stars_2'> </span> <span id='title_stars_2' name='title_stars_2'> </span>
</h3> </h3>
<p class='desc'> <p class='desc clk'>
This should be the content. This should be the content.
</p> </p>
<p> <p>

Loading…
Cancel
Save