import json import logging import time import urllib.error import urllib.parse import urllib.request from typing import Any, Dict, Iterator, List from langchain_core.documents import Document from langchain_core.pydantic_v1 import BaseModel, root_validator logger = logging.getLogger(__name__) class PubMedAPIWrapper(BaseModel): """ Wrapper around PubMed API. This wrapper will use the PubMed API to conduct searches and fetch document summaries. By default, it will return the document summaries of the top-k results of an input search. Parameters: top_k_results: number of the top-scored document used for the PubMed tool MAX_QUERY_LENGTH: maximum length of the query. Default is 300 characters. doc_content_chars_max: maximum length of the document content. Content will be truncated if it exceeds this length. Default is 2000 characters. max_retry: maximum number of retries for a request. Default is 5. sleep_time: time to wait between retries. Default is 0.2 seconds. email: email address to be used for the PubMed API. """ parse: Any #: :meta private: base_url_esearch: str = ( "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" ) base_url_efetch: str = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" max_retry: int = 5 sleep_time: float = 0.2 # Default values for the parameters top_k_results: int = 3 MAX_QUERY_LENGTH: int = 300 doc_content_chars_max: int = 2000 email: str = "your_email@example.com" @root_validator() def validate_environment(cls, values: Dict) -> Dict: """Validate that the python package exists in environment.""" try: import xmltodict values["parse"] = xmltodict.parse except ImportError: raise ImportError( "Could not import xmltodict python package. " "Please install it with `pip install xmltodict`." ) return values def run(self, query: str) -> str: """ Run PubMed search and get the article meta information. See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch It uses only the most informative fields of article meta information. """ try: # Retrieve the top-k results for the query docs = [ f"Published: {result['Published']}\n" f"Title: {result['Title']}\n" f"Copyright Information: {result['Copyright Information']}\n" f"Summary::\n{result['Summary']}" for result in self.load(query[: self.MAX_QUERY_LENGTH]) ] # Join the results and limit the character count return ( "\n\n".join(docs)[: self.doc_content_chars_max] if docs else "No good PubMed Result was found" ) except Exception as ex: return f"PubMed exception: {ex}" def lazy_load(self, query: str) -> Iterator[dict]: """ Search PubMed for documents matching the query. Return an iterator of dictionaries containing the document metadata. """ url = ( self.base_url_esearch + "db=pubmed&term=" + str({urllib.parse.quote(query)}) + f"&retmode=json&retmax={self.top_k_results}&usehistory=y" ) result = urllib.request.urlopen(url) text = result.read().decode("utf-8") json_text = json.loads(text) webenv = json_text["esearchresult"]["webenv"] for uid in json_text["esearchresult"]["idlist"]: yield self.retrieve_article(uid, webenv) def load(self, query: str) -> List[dict]: """ Search PubMed for documents matching the query. Return a list of dictionaries containing the document metadata. """ return list(self.lazy_load(query)) def _dict2document(self, doc: dict) -> Document: summary = doc.pop("Summary") return Document(page_content=summary, metadata=doc) def lazy_load_docs(self, query: str) -> Iterator[Document]: for d in self.lazy_load(query=query): yield self._dict2document(d) def load_docs(self, query: str) -> List[Document]: return list(self.lazy_load_docs(query=query)) def retrieve_article(self, uid: str, webenv: str) -> dict: url = ( self.base_url_efetch + "db=pubmed&retmode=xml&id=" + uid + "&webenv=" + webenv ) retry = 0 while True: try: result = urllib.request.urlopen(url) break except urllib.error.HTTPError as e: if e.code == 429 and retry < self.max_retry: # Too Many Requests errors # wait for an exponentially increasing amount of time print( # noqa: T201 f"Too Many Requests, " f"waiting for {self.sleep_time:.2f} seconds..." ) time.sleep(self.sleep_time) self.sleep_time *= 2 retry += 1 else: raise e xml_text = result.read().decode("utf-8") text_dict = self.parse(xml_text) return self._parse_article(uid, text_dict) def _parse_article(self, uid: str, text_dict: dict) -> dict: try: ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][ "Article" ] except KeyError: ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"] abstract_text = ar.get("Abstract", {}).get("AbstractText", []) summaries = [ f"{txt['@Label']}: {txt['#text']}" for txt in abstract_text if "#text" in txt and "@Label" in txt ] summary = ( "\n".join(summaries) if summaries else ( abstract_text if isinstance(abstract_text, str) else ( "\n".join(str(value) for value in abstract_text.values()) if isinstance(abstract_text, dict) else "No abstract available" ) ) ) a_d = ar.get("ArticleDate", {}) pub_date = "-".join( [a_d.get("Year", ""), a_d.get("Month", ""), a_d.get("Day", "")] ) return { "uid": uid, "Title": ar.get("ArticleTitle", ""), "Published": pub_date, "Copyright Information": ar.get("Abstract", {}).get( "CopyrightInformation", "" ), "Summary": summary, }