Science category: update the engines

* use the paper.html template
* fetch more data from the engines
* add crossref.py
dependabot/pip/master/sphinx-6.1.3
Alexandre FLAMENT 2 years ago committed by Alexandre Flament
parent 593026ad9c
commit e36f85b836

@ -3,9 +3,10 @@
ArXiV (Scientific preprints) ArXiV (Scientific preprints)
""" """
from lxml import html from lxml import etree
from lxml.etree import XPath
from datetime import datetime from datetime import datetime
from searx.utils import eval_xpath_list, eval_xpath_getindex from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about # about
about = { about = {
@ -17,7 +18,7 @@ about = {
"results": 'XML-RSS', "results": 'XML-RSS',
} }
categories = ['science'] categories = ['science', 'scientific publications']
paging = True paging = True
base_url = ( base_url = (
@ -27,6 +28,23 @@ base_url = (
# engine dependent config # engine dependent config
number_of_results = 10 number_of_results = 10
# xpaths
arxiv_namespaces = {
"atom": "http://www.w3.org/2005/Atom",
"arxiv": "http://arxiv.org/schemas/atom",
}
xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
def request(query, params): def request(query, params):
# basic search # basic search
@ -41,30 +59,50 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
dom = etree.fromstring(resp.content)
dom = html.fromstring(resp.content) for entry in eval_xpath_list(dom, xpath_entry):
title = eval_xpath_getindex(entry, xpath_title, 0).text
for entry in eval_xpath_list(dom, '//entry'):
title = eval_xpath_getindex(entry, './/title', 0).text url = eval_xpath_getindex(entry, xpath_id, 0).text
abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
url = eval_xpath_getindex(entry, './/id', 0).text
authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
content_string = '{doi_content}{abstract_content}'
# doi
abstract = eval_xpath_getindex(entry, './/summary', 0).text doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
doi = None if doi_element is None else doi_element.text
# If a doi is available, add it to the snipppet
doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None) # pdf
doi_content = doi_element.text if doi_element is not None else '' pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
content = content_string.format(doi_content=doi_content, abstract_content=abstract) pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
if len(content) > 300: # journal
content = content[0:300] + "..." journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
# TODO: center snippet on query term journal = None if journal_element is None else journal_element.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ') # tags
tag_elements = eval_xpath(entry, xpath_category)
res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content} tags = [str(tag) for tag in tag_elements]
# comments
comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
comments = None if comments_elements is None else comments_elements.text
publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
res_dict = {
'template': 'paper.html',
'url': url,
'title': title,
'publishedDate': publishedDate,
'content': abstract,
'doi': doi,
'authors': authors,
'journal': journal,
'tags': tags,
'comments': comments,
'pdf_url': pdf_url,
}
results.append(res_dict) results.append(res_dict)

@ -0,0 +1,59 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Crossref (Science)
"""
from urllib.parse import urlencode
from searx.utils import html_to_text
# Static description of the upstream service: its website, Wikidata id, API
# documentation link and the response format this module parses (JSON).
about = {
"website": 'https://www.crossref.org/',
"wikidata_id": 'Q5188229',
"official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
# NOTE(review): the requests go to api.crossref.org, which looks like the
# official REST API -- confirm whether this flag should be True.
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# Category names this engine is registered under.
categories = ['science', 'scientific publications']
# request() derives the result offset from params['pageno'];
# presumably this flag tells the framework to supply page numbers -- confirm.
paging = True
search_url = 'https://api.crossref.org/works'
# Number of records requested per page.  It is sent explicitly as ``rows`` so
# that the ``offset`` computed below always matches the size of the page the
# API actually returns, instead of relying on the API's default page size.
rows_per_page = 20


def request(query, params):
    """Build the Crossref ``/works`` search URL for the requested page.

    :param query: free-text search terms.
    :param params: request parameters; ``params['pageno']`` (1-based) is read
        and ``params['url']`` is set.
    :returns: the same ``params`` dict, updated in place.
    """
    args = {
        'query': query,
        'offset': rows_per_page * (params['pageno'] - 1),
        'rows': rows_per_page,
    }
    params['url'] = search_url + '?' + urlencode(args)
    return params
def response(resp):
    """Convert a Crossref ``/works`` JSON reply into ``paper.html`` results.

    :param resp: response object whose ``json()`` returns the decoded reply;
        the records are read from ``['message']['items']``.
    :returns: list of result dicts, one per usable record.

    Records that have neither a title nor a container title, or that carry no
    URL at all, are skipped instead of raising.
    """
    results = []

    for record in resp.json()['message']['items']:
        record_type = record.get('type')

        # Both fields are optional lists in the reply; guard against a missing
        # key as well as an empty list.
        own_titles = record.get('title') or []
        container_titles = record.get('container-title') or []
        own_title = own_titles[0] if own_titles else None
        container = container_titles[0] if container_titles else None

        if record_type == 'book-chapter' and container:
            # Show the book as the main title and the chapter in parentheses,
            # unless both are the same text.
            title = container
            if own_title and own_title.lower().strip() != container.lower().strip():
                title = container + ' (' + own_title + ')'
            journal = None
        else:
            # Without a title of its own the record is shown under its
            # container title, which is then not repeated as the journal.
            title = own_title or container
            journal = container if own_title else None

        primary = (record.get('resource') or {}).get('primary') or {}
        url = primary.get('URL') or record.get('URL')
        if not title or not url:
            continue

        authors = []
        for author in record.get('author') or []:
            name_parts = [part for part in (author.get('given'), author.get('family')) if part]
            # Organisational authors have no given/family name, only "name".
            name = ' '.join(name_parts) or author.get('name')
            if name:
                authors.append(name)

        # The plain list is looked up under both spellings of the key before
        # falling back to the typed "isbn-type" entries.
        isbn = (
            record.get('ISBN')
            or record.get('isbn')
            or [entry['value'] for entry in record.get('isbn-type') or [] if entry.get('value')]
        )

        # Only run the HTML-to-text conversion when there is an abstract.
        abstract = record.get('abstract')
        content = html_to_text(abstract) if abstract else ''

        results.append(
            {
                'template': 'paper.html',
                'url': url,
                'title': title,
                'journal': journal,
                'volume': record.get('volume'),
                'type': record_type,
                'content': content,
                'publisher': record.get('publisher'),
                'authors': authors,
                'doi': record.get('DOI'),
                'isbn': isbn,
            }
        )

    return results

@ -13,10 +13,12 @@ Definitions`_.
from urllib.parse import urlencode from urllib.parse import urlencode
from datetime import datetime from datetime import datetime
from typing import Optional
from lxml import html from lxml import html
from searx.utils import ( from searx.utils import (
eval_xpath, eval_xpath,
eval_xpath_getindex,
eval_xpath_list, eval_xpath_list,
extract_text, extract_text,
) )
@ -46,7 +48,7 @@ about = {
} }
# engine dependent config # engine dependent config
categories = ['science'] categories = ['science', 'scientific publications']
paging = True paging = True
language_support = True language_support = True
use_locale_domain = True use_locale_domain = True
@ -99,7 +101,43 @@ def request(query, params):
return params return params
def parse_gs_a(text: Optional[str]):
    """Parse the text written in green below a result title.

    Possible formats:
    * "{authors} - {journal}, {year} - {publisher}"
    * "{authors} - {year} - {publisher}"
    * "{authors} - {publisher}"

    The first ``' - '`` separated segment is taken as the authors and the last
    one as the publisher.  Everything in between is treated as
    "{journal}, {year}", so a journal name that itself contains ``' - '`` is
    kept intact instead of discarding both journal and year.

    :param text: the extracted text, or ``None`` / ``""`` when absent.
    :returns: tuple ``(authors, journal, publisher, publishedDate)``;
        ``authors`` is a list of strings, ``publishedDate`` a ``datetime`` for
        the 1st of January of the parsed year.  Every element that could not
        be determined is ``None``.
    """
    if text is None or text == "":
        return None, None, None, None

    segments = text.split(' - ')
    authors = segments[0].split(', ')
    publisher = segments[-1]
    if len(segments) < 3:
        # "{authors} - {publisher}": nothing between authors and publisher
        return authors, None, publisher, None

    # the format is "{authors} - {journal}, {year} - {publisher}" or
    # "{authors} - {year} - {publisher}"; re-join the middle segments so that a
    # ' - ' inside the journal name does not break the parsing
    journal_year = ' - '.join(segments[1:-1]).split(', ')

    # the journal is optional and may contain commas: it is everything before
    # the last ', ' (an empty journal is reported as None)
    journal = ', '.join(journal_year[:-1]) or None

    # the year is the last comma separated item
    try:
        publishedDate = datetime.strptime(journal_year[-1].strip(), '%Y')
    except ValueError:
        publishedDate = None
    return authors, journal, publisher, publishedDate
def response(resp): # pylint: disable=too-many-locals
"""Get response from google's search request""" """Get response from google's search request"""
results = [] results = []
@ -112,30 +150,53 @@ def response(resp):
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
# parse results # parse results
for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'): for result in eval_xpath_list(dom, '//div[@data-cid]'):
title = extract_text(eval_xpath(result, './h3[1]//a')) title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title: if not title:
# this is a [ZITATION] block # this is a [ZITATION] block
continue continue
url = eval_xpath(result, './h3[1]//a/@href')[0]
content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''
pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
if pub_info:
content += "[%s]" % pub_info
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
if pub_type: if pub_type:
title = title + " " + pub_type pub_type = pub_type[1:-1].lower()
url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
authors, journal, publisher, publishedDate = parse_gs_a(
extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
)
if publisher in url:
publisher = None
# cited by
comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
# link to the html or pdf document
html_url = None
pdf_url = None
doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if doc_type == "[PDF]":
pdf_url = doc_url
else:
html_url = doc_url
results.append( results.append(
{ {
'template': 'paper.html',
'type': pub_type,
'url': url, 'url': url,
'title': title, 'title': title,
'authors': authors,
'publisher': publisher,
'journal': journal,
'publishedDate': publishedDate,
'content': content, 'content': content,
'comments': comments,
'html_url': html_url,
'pdf_url': pdf_url,
} }
) )

@ -3,11 +3,15 @@
PubMed (Scholar publications) PubMed (Scholar publications)
""" """
from flask_babel import gettext
from lxml import etree from lxml import etree
from datetime import datetime from datetime import datetime
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.network import get from searx.network import get
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
# about # about
about = { about = {
@ -22,7 +26,7 @@ about = {
"results": 'XML', "results": 'XML',
} }
categories = ['science'] categories = ['science', 'scientific publications']
base_url = ( base_url = (
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
@ -63,46 +67,61 @@ def response(resp):
retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
search_results_xml = get(retrieve_url_encoded).content search_results_response = get(retrieve_url_encoded).content
search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation') search_results = etree.XML(search_results_response)
for entry in eval_xpath_list(search_results, '//PubmedArticle'):
for entry in search_results: medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
title = entry.xpath('.//Article/ArticleTitle')[0].text
pmid = entry.xpath('.//PMID')[0].text title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
pmid = eval_xpath_getindex(medline, './/PMID', 0).text
url = pubmed_url + pmid url = pubmed_url + pmid
content = extract_text(
try: eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
content = entry.xpath('.//Abstract/AbstractText')[0].text )
except: doi = extract_text(
content = gettext('No abstract is available for this publication.') eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
)
# If a doi is available, add it to the snipppet journal = extract_text(
try: eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text )
content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content) issn = extract_text(
except: eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
pass )
authors = []
if len(content) > 300: for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
content = content[0:300] + "..." f = eval_xpath_getindex(author, './ForeName', 0, default=None)
# TODO: center snippet on query term l = eval_xpath_getindex(author, './LastName', 0, default=None)
f = '' if f is None else f.text
res_dict = {'url': url, 'title': title, 'content': content} l = '' if l is None else l.text
authors.append((f + ' ' + l).strip())
try:
publishedDate = datetime.strptime( res_dict = {
entry.xpath('.//DateCreated/Year')[0].text 'template': 'paper.html',
+ '-' 'url': url,
+ entry.xpath('.//DateCreated/Month')[0].text 'title': title,
+ '-' 'content': content,
+ entry.xpath('.//DateCreated/Day')[0].text, 'journal': journal,
'%Y-%m-%d', 'issn': [issn],
) 'authors': authors,
res_dict['publishedDate'] = publishedDate 'doi': doi,
except: }
pass
accepted_date = eval_xpath_getindex(
entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
)
if accepted_date is not None:
year = eval_xpath_getindex(accepted_date, './Year', 0)
month = eval_xpath_getindex(accepted_date, './Month', 0)
day = eval_xpath_getindex(accepted_date, './Day', 0)
try:
publishedDate = datetime.strptime(
year.text + '-' + month.text + '-' + day.text,
'%Y-%m-%d',
)
res_dict['publishedDate'] = publishedDate
except Exception as e:
print(e)
results.append(res_dict) results.append(res_dict)
return results return results

@ -6,6 +6,8 @@
from json import dumps, loads from json import dumps, loads
from datetime import datetime from datetime import datetime
from flask_babel import gettext
about = { about = {
"website": 'https://www.semanticscholar.org/', "website": 'https://www.semanticscholar.org/',
"wikidata_id": 'Q22908627', "wikidata_id": 'Q22908627',
@ -15,6 +17,7 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
categories = ['science', 'scientific publications']
paging = True paging = True
search_url = 'https://www.semanticscholar.org/api/1/search' search_url = 'https://www.semanticscholar.org/api/1/search'
paper_url = 'https://www.semanticscholar.org/paper' paper_url = 'https://www.semanticscholar.org/paper'
@ -47,9 +50,6 @@ def response(resp):
results = [] results = []
for result in res['results']: for result in res['results']:
item = {}
metadata = []
url = result.get('primaryPaperLink', {}).get('url') url = result.get('primaryPaperLink', {}).get('url')
if not url and result.get('links'): if not url and result.get('links'):
url = result.get('links')[0] url = result.get('links')[0]
@ -60,22 +60,47 @@ def response(resp):
if not url: if not url:
url = paper_url + '/%s' % result['id'] url = paper_url + '/%s' % result['id']
item['url'] = url # publishedDate
if 'pubDate' in result:
publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
else:
publishedDate = None
item['title'] = result['title']['text'] # authors
item['content'] = result['paperAbstract']['text'] authors = [author[0]['name'] for author in result.get('authors', [])]
metadata = result.get('fieldsOfStudy') or [] # pick for the first alternate link, but not from the crawler
venue = result.get('venue', {}).get('text') pdf_url = None
if venue: for doc in result.get('alternatePaperLinks', []):
metadata.append(venue) if doc['linkType'] != 'crawler':
if metadata: pdf_url = doc['url']
item['metadata'] = ', '.join(metadata) break
pubDate = result.get('pubDate') # comments
if pubDate: comments = None
item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d") if 'citationStats' in result:
comments = gettext(
'{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
).format(
numCitations=result['citationStats']['numCitations'],
firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
)
results.append(item) results.append(
{
'template': 'paper.html',
'url': url,
'title': result['title']['text'],
'content': result['paperAbstract']['text'],
'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
'doi': result.get('doiInfo', {}).get('doi'),
'tags': result.get('fieldsOfStudy'),
'authors': authors,
'pdf_url': pdf_url,
'publishedDate': publishedDate,
'comments': comments,
}
)
return results return results

@ -19,7 +19,7 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
categories = ['science'] categories = ['science', 'scientific publications']
paging = True paging = True
nb_per_page = 10 nb_per_page = 10
api_key = 'unset' api_key = 'unset'
@ -41,32 +41,30 @@ def response(resp):
json_data = loads(resp.text) json_data = loads(resp.text)
for record in json_data['records']: for record in json_data['records']:
content = record['abstract'][0:500] content = record['abstract']
if len(record['abstract']) > len(content):
content += "..."
published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') published = datetime.strptime(record['publicationDate'], '%Y-%m-%d')
authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']]
metadata = [ tags = record.get('genre')
record[x] if isinstance(tags, str):
for x in [ tags = [tags]
'publicationName',
'identifier',
'contentType',
]
if record.get(x) is not None
]
metadata = ' / '.join(metadata)
if record.get('startingPage') and record.get('endingPage') is not None:
metadata += " (%(startingPage)s-%(endingPage)s)" % record
results.append( results.append(
{ {
'template': 'paper.html',
'title': record['title'], 'title': record['title'],
'url': record['url'][0]['value'].replace('http://', 'https://', 1), 'url': record['url'][0]['value'].replace('http://', 'https://', 1),
'type': record.get('contentType'),
'content': content, 'content': content,
'publishedDate': published, 'publishedDate': published,
'metadata': metadata, 'authors': authors,
'doi': record.get('doi'),
'journal': record.get('publicationName'),
'start_page': record.get('start_page'),
'end_page': record.get('end_page'),
'tags': tags,
'issn': [record.get('issn')],
'isbn': [record.get('isbn')],
'volume': record.get('volume') or None,
'number': record.get('number') or None,
} }
) )
return results return results

@ -43,6 +43,7 @@ CATEGORY_GROUPS = {
'REPOS': 'repos', 'REPOS': 'repos',
'SOFTWARE_WIKIS': 'software wikis', 'SOFTWARE_WIKIS': 'software wikis',
'WEB': 'web', 'WEB': 'web',
'SCIENTIFIC PUBLICATIONS': 'scientific publications',
} }
STYLE_NAMES = { STYLE_NAMES = {

@ -319,7 +319,6 @@ engines:
- name: arxiv - name: arxiv
engine: arxiv engine: arxiv
shortcut: arx shortcut: arx
categories: science
timeout: 4.0 timeout: 4.0
# tmp suspended: dh key too small # tmp suspended: dh key too small
@ -411,23 +410,9 @@ engines:
# api_key: 'unset' # api_key: 'unset'
- name: crossref - name: crossref
engine: json_engine engine: crossref
paging: true
search_url: https://search.crossref.org/dois?q={query}&page={pageno}
url_query: doi
title_query: title
title_html_to_text: true
content_query: fullCitation
content_html_to_text: true
categories: science
shortcut: cr shortcut: cr
about: timeout: 10
website: https://www.crossref.org/
wikidata_id: Q5188229
official_api_documentation: https://github.com/CrossRef/rest-api-doc
use_official_api: false
require_api_key: false
results: JSON
- name: yep - name: yep
engine: json_engine engine: json_engine
@ -1068,7 +1053,7 @@ engines:
title_query: metadata/oaf:entity/oaf:result/title/$ title_query: metadata/oaf:entity/oaf:result/title/$
content_query: metadata/oaf:entity/oaf:result/description/$ content_query: metadata/oaf:entity/oaf:result/description/$
content_html_to_text: true content_html_to_text: true
categories: science categories: "science"
shortcut: oad shortcut: oad
timeout: 5.0 timeout: 5.0
about: about:
@ -1198,7 +1183,6 @@ engines:
- name: pubmed - name: pubmed
engine: pubmed engine: pubmed
shortcut: pub shortcut: pub
categories: science
timeout: 3.0 timeout: 3.0
- name: pypi - name: pypi
@ -1346,7 +1330,6 @@ engines:
engine: semantic_scholar engine: semantic_scholar
disabled: true disabled: true
shortcut: se shortcut: se
categories: science
# Spotify needs API credentials # Spotify needs API credentials
# - name: spotify # - name: spotify
@ -1372,8 +1355,7 @@ engines:
# # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601" # # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601"
# api_key: 'unset' # api_key: 'unset'
# shortcut: springer # shortcut: springer
# categories: science # timeout: 15.0
# timeout: 6.0
- name: startpage - name: startpage
engine: startpage engine: startpage

Loading…
Cancel
Save