Harrison/pubmed integration (#5664)

Co-authored-by: younis basher <71520361+younis-ba@users.noreply.github.com>
Co-authored-by: Younis Bashir <younis@omicmd.com>
searx_updates
Harrison Chase 12 months ago committed by GitHub
parent 9921f8cc3a
commit ad09367a92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,86 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "64f20f38",
"metadata": {},
"source": [
"# PubMed Tool\n",
"\n",
"This notebook goes over how to use PubMed as a tool\n",
"\n",
"PubMed® comprises more than 35 million citations for biomedical literature from MEDLINE, life science journals, and online books. Citations may include links to full text content from PubMed Central and publisher web sites."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c80b9273",
"metadata": {},
"outputs": [],
"source": [
"from langchain.tools import PubmedQueryRun"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f203c965",
"metadata": {},
"outputs": [],
"source": [
"tool = PubmedQueryRun()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "baee7a2a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Published: <Year>2023</Year><Month>May</Month><Day>31</Day>\\nTitle: Dermatology in the wake of an AI revolution: who gets a say?\\nSummary: \\n\\nPublished: <Year>2023</Year><Month>May</Month><Day>30</Day>\\nTitle: What is ChatGPT and what do we do with it? Implications of the age of AI for nursing and midwifery practice and education: An editorial.\\nSummary: \\n\\nPublished: <Year>2023</Year><Month>Jun</Month><Day>02</Day>\\nTitle: The Impact of ChatGPT on the Nursing Profession: Revolutionizing Patient Care and Education.\\nSummary: The nursing field has undergone notable changes over time and is projected to undergo further modifications in the future, owing to the advent of sophisticated technologies and growing healthcare needs. The advent of ChatGPT, an AI-powered language model, is expected to exert a significant influence on the nursing profession, specifically in the domains of patient care and instruction. The present article delves into the ramifications of ChatGPT within the nursing domain and accentuates its capacity and constraints to transform the discipline.'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tool.run(\"chatgpt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "965903ba",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -0,0 +1,80 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3df0dcf8",
"metadata": {},
"source": [
"# PubMed Retriever\n",
"\n",
"This notebook goes over how to use PubMed as a retriever\n",
"\n",
"PubMed® comprises more than 35 million citations for biomedical literature from MEDLINE, life science journals, and online books. Citations may include links to full text content from PubMed Central and publisher web sites."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "aecaff63",
"metadata": {},
"outputs": [],
"source": [
"from langchain.retrievers import PubMedRetriever"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f2f7e8d3",
"metadata": {},
"outputs": [],
"source": [
"retriever = PubMedRetriever()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ed115aa1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='', metadata={'uid': '37268021', 'title': 'Dermatology in the wake of an AI revolution: who gets a say?', 'pub_date': '<Year>2023</Year><Month>May</Month><Day>31</Day>'}),\n",
" Document(page_content='', metadata={'uid': '37267643', 'title': 'What is ChatGPT and what do we do with it? Implications of the age of AI for nursing and midwifery practice and education: An editorial.', 'pub_date': '<Year>2023</Year><Month>May</Month><Day>30</Day>'}),\n",
" Document(page_content='The nursing field has undergone notable changes over time and is projected to undergo further modifications in the future, owing to the advent of sophisticated technologies and growing healthcare needs. The advent of ChatGPT, an AI-powered language model, is expected to exert a significant influence on the nursing profession, specifically in the domains of patient care and instruction. The present article delves into the ramifications of ChatGPT within the nursing domain and accentuates its capacity and constraints to transform the discipline.', metadata={'uid': '37266721', 'title': 'The Impact of ChatGPT on the Nursing Profession: Revolutionizing Patient Care and Education.', 'pub_date': '<Year>2023</Year><Month>Jun</Month><Day>02</Day>'})]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"retriever.get_relevant_documents(\"chatgpt\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "9fc6205b",
"metadata": {},
@ -14,7 +13,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "51489529-5dcd-4b86-bda6-de0a39d8ffd1",
"metadata": {},
@ -23,7 +21,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "1435c804-069d-4ade-9a7b-006b97b767c1",
"metadata": {},
@ -44,7 +41,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6c15470b-a16b-4e0d-bc6a-6998bafbb5a4",
"metadata": {},
@ -58,7 +54,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "ae3c3d16",
"metadata": {},
@ -67,7 +62,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6fafb73b-d6ec-4822-b161-edf0aaf5224a",
"metadata": {},
@ -151,7 +145,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2670363b-3806-4c7e-b14d-90a4d5d2a200",
"metadata": {},
@ -273,7 +266,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -14,6 +14,7 @@ from langchain.chains.llm_math.base import LLMMathChain
from langchain.chains.pal.base import PALChain
from langchain.requests import TextRequestsWrapper
from langchain.tools.arxiv.tool import ArxivQueryRun
from langchain.tools.pubmed.tool import PubmedQueryRun
from langchain.tools.base import BaseTool
from langchain.tools.bing_search.tool import BingSearchRun
from langchain.tools.ddg_search.tool import DuckDuckGoSearchRun
@ -37,6 +38,7 @@ from langchain.tools.wikipedia.tool import WikipediaQueryRun
from langchain.tools.wolfram_alpha.tool import WolframAlphaQueryRun
from langchain.tools.openweathermap.tool import OpenWeatherMapQueryRun
from langchain.utilities import ArxivAPIWrapper
from langchain.utilities import PubMedAPIWrapper
from langchain.utilities.bing_search import BingSearchAPIWrapper
from langchain.utilities.duckduckgo_search import DuckDuckGoSearchAPIWrapper
from langchain.utilities.google_search import GoogleSearchAPIWrapper
@ -198,6 +200,10 @@ def _get_arxiv(**kwargs: Any) -> BaseTool:
return ArxivQueryRun(api_wrapper=ArxivAPIWrapper(**kwargs))
def _get_pupmed(**kwargs: Any) -> BaseTool:
return PubmedQueryRun(api_wrapper=PubMedAPIWrapper(**kwargs))
def _get_google_serper(**kwargs: Any) -> BaseTool:
return GoogleSerperRun(api_wrapper=GoogleSerperAPIWrapper(**kwargs))
@ -302,6 +308,10 @@ _EXTRA_OPTIONAL_TOOLS: Dict[str, Tuple[Callable[[KwArg(Any)], BaseTool], List[st
_get_arxiv,
["top_k_results", "load_max_docs", "load_all_available_meta"],
),
"pupmed": (
_get_pupmed,
["top_k_results", "load_max_docs", "load_all_available_meta"],
),
"human": (_get_human_tool, ["prompt_func", "input_func"]),
"awslambda": (
_get_lambda_api,

@ -7,6 +7,7 @@ from langchain.retrievers.elastic_search_bm25 import ElasticSearchBM25Retriever
from langchain.retrievers.knn import KNNRetriever
from langchain.retrievers.metal import MetalRetriever
from langchain.retrievers.pinecone_hybrid_search import PineconeHybridSearchRetriever
from langchain.retrievers.pupmed import PubMedRetriever
from langchain.retrievers.remote_retriever import RemoteLangChainRetriever
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.svm import SVMRetriever
@ -21,6 +22,7 @@ from langchain.retrievers.zep import ZepRetriever
__all__ = [
"ArxivRetriever",
"PubMedRetriever",
"AzureCognitiveSearchRetriever",
"ChatGPTPluginRetriever",
"ContextualCompressionRetriever",

@ -0,0 +1,18 @@
from typing import List
from langchain.schema import BaseRetriever, Document
from langchain.utilities.pupmed import PubMedAPIWrapper
class PubMedRetriever(BaseRetriever, PubMedAPIWrapper):
"""
It is effectively a wrapper for PubMedAPIWrapper.
It wraps load() to get_relevant_documents().
It uses all PubMedAPIWrapper arguments without any change.
"""
def get_relevant_documents(self, query: str) -> List[Document]:
return self.load_docs(query=query)
async def aget_relevant_documents(self, query: str) -> List[Document]:
raise NotImplementedError

@ -48,6 +48,7 @@ from langchain.tools.powerbi.tool import (
ListPowerBITool,
QueryPowerBITool,
)
from langchain.tools.pubmed.tool import PubmedQueryRun
from langchain.tools.scenexplain.tool import SceneXplainTool
from langchain.tools.shell.tool import ShellTool
from langchain.tools.steamship_image_generation import SteamshipImageGenerationTool
@ -120,4 +121,5 @@ __all__ = [
"tool",
"YouTubeSearchTool",
"BraveSearch",
"PubmedQueryRun",
]

@ -0,0 +1 @@
"""PubMed API toolkit."""

@ -0,0 +1,43 @@
"""Tool for the Pubmed API."""
from typing import Optional
from pydantic import Field
from langchain.callbacks.manager import (
AsyncCallbackManagerForToolRun,
CallbackManagerForToolRun,
)
from langchain.tools.base import BaseTool
from langchain.utilities.pupmed import PubMedAPIWrapper
class PubmedQueryRun(BaseTool):
"""Tool that adds the capability to search using the PubMed API."""
name = "PubMed"
description = (
"A wrapper around PubMed.org "
"Useful for when you need to answer questions about Physics, Mathematics, "
"Computer Science, Quantitative Biology, Quantitative Finance, Statistics, "
"Electrical Engineering, and Economics "
"from scientific articles on PubMed.org. "
"Input should be a search query."
)
api_wrapper: PubMedAPIWrapper = Field(default_factory=PubMedAPIWrapper)
def _run(
self,
query: str,
run_manager: Optional[CallbackManagerForToolRun] = None,
) -> str:
"""Use the Arxiv tool."""
return self.api_wrapper.run(query)
async def _arun(
self,
query: str,
run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
) -> str:
"""Use the PubMed tool asynchronously."""
raise NotImplementedError("PubMedAPIWrapper does not support async")

@ -13,6 +13,7 @@ from langchain.utilities.graphql import GraphQLAPIWrapper
from langchain.utilities.metaphor_search import MetaphorSearchAPIWrapper
from langchain.utilities.openweathermap import OpenWeatherMapAPIWrapper
from langchain.utilities.powerbi import PowerBIDataset
from langchain.utilities.pupmed import PubMedAPIWrapper
from langchain.utilities.python import PythonREPL
from langchain.utilities.searx_search import SearxSearchWrapper
from langchain.utilities.serpapi import SerpAPIWrapper
@ -24,6 +25,7 @@ from langchain.utilities.wolfram_alpha import WolframAlphaAPIWrapper
__all__ = [
"ApifyWrapper",
"ArxivAPIWrapper",
"PubMedAPIWrapper",
"BashProcess",
"BingSearchAPIWrapper",
"DuckDuckGoSearchAPIWrapper",

@ -0,0 +1,171 @@
import json
import logging
import time
import urllib.error
import urllib.request
from typing import List
from pydantic import BaseModel, Extra
from langchain.schema import Document
logger = logging.getLogger(__name__)
class PubMedAPIWrapper(BaseModel):
"""
Wrapper around PubMed API.
This wrapper will use the PubMed API to conduct searches and fetch
document summaries. By default, it will return the document summaries
of the top-k results of an input search.
Parameters:
top_k_results: number of the top-scored document used for the PubMed tool
load_max_docs: a limit to the number of loaded documents
load_all_available_meta:
if True: the `metadata` of the loaded Documents gets all available meta info
(see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch)
if False: the `metadata` gets only the most informative fields.
"""
base_url_esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
base_url_efetch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
max_retry = 5
sleep_time = 0.2
# Default values for the parameters
top_k_results: int = 3
load_max_docs: int = 25
ARXIV_MAX_QUERY_LENGTH = 300
doc_content_chars_max: int = 2000
load_all_available_meta: bool = False
email: str = "your_email@example.com"
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
def run(self, query: str) -> str:
"""
Run PubMed search and get the article meta information.
See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
It uses only the most informative fields of article meta information.
"""
try:
# Retrieve the top-k results for the query
docs = [
f"Published: {result['pub_date']}\nTitle: {result['title']}\n"
f"Summary: {result['summary']}"
for result in self.load(query[: self.ARXIV_MAX_QUERY_LENGTH])
]
# Join the results and limit the character count
return (
"\n\n".join(docs)[: self.doc_content_chars_max]
if docs
else "No good PubMed Result was found"
)
except Exception as ex:
return f"PubMed exception: {ex}"
def load(self, query: str) -> List[dict]:
"""
Search PubMed for documents matching the query.
Return a list of dictionaries containing the document metadata.
"""
url = (
self.base_url_esearch
+ "db=pubmed&term="
+ str({urllib.parse.quote(query)})
+ f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
)
result = urllib.request.urlopen(url)
text = result.read().decode("utf-8")
json_text = json.loads(text)
articles = []
webenv = json_text["esearchresult"]["webenv"]
for uid in json_text["esearchresult"]["idlist"]:
article = self.retrieve_article(uid, webenv)
articles.append(article)
# Convert the list of articles to a JSON string
return articles
def _transform_doc(self, doc: dict) -> Document:
summary = doc.pop("summary")
return Document(page_content=summary, metadata=doc)
def load_docs(self, query: str) -> List[Document]:
document_dicts = self.load(query=query)
return [self._transform_doc(d) for d in document_dicts]
def retrieve_article(self, uid: str, webenv: str) -> dict:
url = (
self.base_url_efetch
+ "db=pubmed&retmode=xml&id="
+ uid
+ "&webenv="
+ webenv
)
retry = 0
while True:
try:
result = urllib.request.urlopen(url)
break
except urllib.error.HTTPError as e:
if e.code == 429 and retry < self.max_retry:
# Too Many Requests error
# wait for an exponentially increasing amount of time
print(
f"Too Many Requests, "
f"waiting for {self.sleep_time:.2f} seconds..."
)
time.sleep(self.sleep_time)
self.sleep_time *= 2
retry += 1
else:
raise e
xml_text = result.read().decode("utf-8")
# Get title
title = ""
if "<ArticleTitle>" in xml_text and "</ArticleTitle>" in xml_text:
start_tag = "<ArticleTitle>"
end_tag = "</ArticleTitle>"
title = xml_text[
xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
]
# Get abstract
abstract = ""
if "<AbstractText>" in xml_text and "</AbstractText>" in xml_text:
start_tag = "<AbstractText>"
end_tag = "</AbstractText>"
abstract = xml_text[
xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
]
# Get publication date
pub_date = ""
if "<PubDate>" in xml_text and "</PubDate>" in xml_text:
start_tag = "<PubDate>"
end_tag = "</PubDate>"
pub_date = xml_text[
xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
]
# Return article as dictionary
article = {
"uid": uid,
"title": title,
"summary": abstract,
"pub_date": pub_date,
}
return article

@ -0,0 +1,50 @@
"""Integration test for PubMed API Wrapper."""
from typing import List
import pytest
from langchain.retrievers import PubMedRetriever
from langchain.schema import Document
@pytest.fixture
def retriever() -> PubMedRetriever:
return PubMedRetriever()
def assert_docs(docs: List[Document], all_meta: bool = False) -> None:
for doc in docs:
assert doc.page_content
assert doc.metadata
main_meta = {"Published", "Title", "Authors", "Summary"}
assert set(doc.metadata).issuperset(main_meta)
if all_meta:
assert len(set(doc.metadata)) > len(main_meta)
else:
assert len(set(doc.metadata)) == len(main_meta)
def test_load_success(retriever: PubMedRetriever) -> None:
docs = retriever.get_relevant_documents(query="1605.08386")
assert len(docs) == 1
assert_docs(docs, all_meta=False)
def test_load_success_all_meta(retriever: PubMedRetriever) -> None:
retriever.load_all_available_meta = True
retriever.load_max_docs = 2
docs = retriever.get_relevant_documents(query="ChatGPT")
assert len(docs) > 1
assert_docs(docs, all_meta=True)
def test_load_success_init_args() -> None:
retriever = PubMedRetriever(load_max_docs=1, load_all_available_meta=True)
docs = retriever.get_relevant_documents(query="ChatGPT")
assert len(docs) == 1
assert_docs(docs, all_meta=True)
def test_load_no_result(retriever: PubMedRetriever) -> None:
docs = retriever.get_relevant_documents("1605.08386WWW")
assert not docs

@ -0,0 +1,111 @@
"""Integration test for PubMed API Wrapper."""
from typing import Any, List
import pytest
from langchain.agents.load_tools import load_tools
from langchain.schema import Document
from langchain.tools.base import BaseTool
from langchain.utilities import PubMedAPIWrapper
@pytest.fixture
def api_client() -> PubMedAPIWrapper:
return PubMedAPIWrapper()
def test_run_success(api_client: PubMedAPIWrapper) -> None:
"""Test that returns the correct answer"""
output = api_client.run("1605.08386")
assert "Heat-bath random walks with Markov bases" in output
def test_run_returns_several_docs(api_client: PubMedAPIWrapper) -> None:
"""Test that returns several docs"""
output = api_client.run("Caprice Stanley")
assert "On Mixing Behavior of a Family of Random Walks" in output
def test_run_returns_no_result(api_client: PubMedAPIWrapper) -> None:
"""Test that gives no result."""
output = api_client.run("1605.08386WWW")
assert "No good PubMed Result was found" == output
def assert_docs(docs: List[Document]) -> None:
for doc in docs:
assert doc.page_content
assert doc.metadata
assert set(doc.metadata) == {"Published", "Title", "Authors", "Summary"}
def test_load_success(api_client: PubMedAPIWrapper) -> None:
"""Test that returns one document"""
docs = api_client.load_docs("1605.08386")
assert len(docs) == 1
assert_docs(docs)
def test_load_returns_no_result(api_client: PubMedAPIWrapper) -> None:
"""Test that returns no docs"""
docs = api_client.load("1605.08386WWW")
assert len(docs) == 0
def test_load_returns_limited_docs() -> None:
"""Test that returns several docs"""
expected_docs = 2
api_client = PubMedAPIWrapper(load_max_docs=expected_docs)
docs = api_client.load_docs("ChatGPT")
assert len(docs) == expected_docs
assert_docs(docs)
def test_load_returns_full_set_of_metadata() -> None:
"""Test that returns several docs"""
api_client = PubMedAPIWrapper(load_max_docs=1, load_all_available_meta=True)
docs = api_client.load_docs("ChatGPT")
assert len(docs) == 1
for doc in docs:
assert doc.page_content
assert doc.metadata
assert set(doc.metadata).issuperset(
{"Published", "Title", "Authors", "Summary"}
)
print(doc.metadata)
assert len(set(doc.metadata)) > 4
def _load_pubmed_from_universal_entry(**kwargs: Any) -> BaseTool:
tools = load_tools(["pupmed"], **kwargs)
assert len(tools) == 1, "loaded more than 1 tool"
return tools[0]
def test_load_pupmed_from_universal_entry() -> None:
pupmed_tool = _load_pubmed_from_universal_entry()
output = pupmed_tool("Caprice Stanley")
assert (
"On Mixing Behavior of a Family of Random Walks" in output
), "failed to fetch a valid result"
def test_load_pupmed_from_universal_entry_with_params() -> None:
params = {
"top_k_results": 1,
"load_max_docs": 10,
"load_all_available_meta": True,
}
pupmed_tool = _load_pubmed_from_universal_entry(**params)
assert isinstance(pupmed_tool, PubMedAPIWrapper)
wp = pupmed_tool.api_wrapper
assert wp.top_k_results == 1, "failed to assert top_k_results"
assert wp.load_max_docs == 10, "failed to assert load_max_docs"
assert (
wp.load_all_available_meta is True
), "failed to assert load_all_available_meta"

@ -61,6 +61,7 @@ _EXPECTED = [
"tool",
"YouTubeSearchTool",
"BraveSearch",
"PubmedQueryRun",
]

Loading…
Cancel
Save