diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 228a3028f..495a0cd23 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,29 +1,61 @@ -from json import loads from urllib import urlencode +from lxml.html import fromstring from searx.utils import html_to_text -url = 'https://duckduckgo.com/' -search_url = url + 'd.js?{query}&p=1&s={offset}' +url = 'https://duckduckgo.com/html?{query}&s={offset}' locale = 'us-en' -paging = True - - def request(query, params): offset = (params['pageno'] - 1) * 30 q = urlencode({'q': query, 'l': locale}) - params['url'] = search_url.format(query=q, offset=offset) + params['url'] = url.format(query=q, offset=offset) return params def response(resp): + result_xpath = '//div[@class="results_links results_links_deep web-result"]' + url_xpath = './/a[@class="large"]/@href' + title_xpath = './/a[@class="large"]//text()' + content_xpath = './/div[@class="snippet"]//text()' results = [] - search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1] - for r in search_res: - if not r.get('t'): + + doc = fromstring(resp.text) + + for r in doc.xpath(result_xpath): + res_url = r.xpath(url_xpath)[-1] + if not res_url: continue - results.append({'title': r['t'], - 'content': html_to_text(r['a']), - 'url': r['u']}) + title = html_to_text(''.join(r.xpath(title_xpath))) + content = html_to_text(''.join(r.xpath(content_xpath))) + results.append({'title': title, + 'content': content, + 'url': res_url}) + return results + + +#from json import loads +#search_url = url + 'd.js?{query}&p=1&s={offset}' +# +#paging = True +# +# +#def request(query, params): +# offset = (params['pageno'] - 1) * 30 +# q = urlencode({'q': query, +# 'l': locale}) +# params['url'] = search_url.format(query=q, offset=offset) +# return params +# +# +#def response(resp): +# results = [] +# search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1] +# for r in search_res: +# if not r.get('t'): +# continue +# results.append({'title': r['t'], +# 'content': html_to_text(r['a']), +# 'url': r['u']}) +# return results