From d78f418c0d8da14f29c343e171f51d991adfd173 Mon Sep 17 00:00:00 2001 From: Florian <133657389+FlorianH5@users.noreply.github.com> Date: Fri, 6 Oct 2023 03:56:46 +0200 Subject: [PATCH] Extract abstracts from Pubmed articles, even if they have no extra label (#10245) ### Description This pull request involves modifications to the extraction method for abstracts/summaries within the PubMed utility. A condition has been added to verify the presence of unlabeled abstracts. Now an abstract will be extracted even if it does not have a subtitle. In addition, the extraction of the abstract was extended to books. ### Issue The PubMed utility occasionally returns an empty result when extracting abstracts from articles, despite the presence of an abstract for the paper on PubMed. This issue arises due to the varying structure of articles; some articles follow a "subtitle/label: text" format, while others do not include subtitles in their abstracts. An example of the latter case can be found at: [https://pubmed.ncbi.nlm.nih.gov/37666905/](https://pubmed.ncbi.nlm.nih.gov/37666905/) --------- Co-authored-by: Bagatur --- libs/langchain/langchain/utilities/pubmed.py | 31 +++++--- .../utilities/test_pubmed.py | 71 +++++++++++++++++-- 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/libs/langchain/langchain/utilities/pubmed.py b/libs/langchain/langchain/utilities/pubmed.py index 2bb26b7a41..9400d2082a 100644 --- a/libs/langchain/langchain/utilities/pubmed.py +++ b/libs/langchain/langchain/utilities/pubmed.py @@ -158,15 +158,30 @@ class PubMedAPIWrapper(BaseModel): return self._parse_article(uid, text_dict) def _parse_article(self, uid: str, text_dict: dict) -> dict: - ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][ - "Article" - ] - summary = "\n".join( - [ - f"{txt['@Label']}: {txt['#text']}" - for txt in ar.get("Abstract", {}).get("AbstractText", []) - if "#text" in txt and "@Label" in txt + try: + ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][ + 
"Article" ] + except KeyError: + ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"] + abstract_text = ar.get("Abstract", {}).get("AbstractText", []) + summaries = [ + f"{txt['@Label']}: {txt['#text']}" + for txt in abstract_text + if "#text" in txt and "@Label" in txt + ] + summary = ( + "\n".join(summaries) + if summaries + else ( + abstract_text + if isinstance(abstract_text, str) + else ( + "\n".join(str(value) for value in abstract_text.values()) + if isinstance(abstract_text, dict) + else "No abstract available" + ) + ) ) a_d = ar.get("ArticleDate", {}) pub_date = "-".join( diff --git a/libs/langchain/tests/integration_tests/utilities/test_pubmed.py b/libs/langchain/tests/integration_tests/utilities/test_pubmed.py index cda90258d3..d015cb06fd 100644 --- a/libs/langchain/tests/integration_tests/utilities/test_pubmed.py +++ b/libs/langchain/tests/integration_tests/utilities/test_pubmed.py @@ -20,8 +20,16 @@ def api_client() -> PubMedAPIWrapper: def test_run_success(api_client: PubMedAPIWrapper) -> None: """Test that returns the correct answer""" - output = api_client.run("chatgpt") - assert "Performance of ChatGPT on the Situational Judgement Test-A" in output + search_string = ( + "Examining the Validity of ChatGPT in Identifying " + "Relevant Nephrology Literature" + ) + output = api_client.run(search_string) + test_string = ( + "Examining the Validity of ChatGPT in Identifying " + "Relevant Nephrology Literature: Findings and Implications" + ) + assert test_string in output assert len(output) == api_client.doc_content_chars_max @@ -32,6 +40,53 @@ def test_run_returns_no_result(api_client: PubMedAPIWrapper) -> None: assert "No good PubMed Result was found" == output +def test_retrieve_article_returns_book_abstract(api_client: PubMedAPIWrapper) -> None: + """Test that returns the excerpt of a book.""" + + output_nolabel = api_client.retrieve_article("25905357", "") + output_withlabel = api_client.retrieve_article("29262144", "") + 
test_string_nolabel = ( + "Osteoporosis is a multifactorial disorder associated with low bone mass and " + "enhanced skeletal fragility. Although" + ) + assert test_string_nolabel in output_nolabel["Summary"] + assert ( + "Wallenberg syndrome was first described in 1808 by Gaspard Vieusseux. However," + in output_withlabel["Summary"] + ) + + +def test_retrieve_article_returns_article_abstract( + api_client: PubMedAPIWrapper, +) -> None: + """Test that returns the abstract of an article.""" + + output_nolabel = api_client.retrieve_article("37666905", "") + output_withlabel = api_client.retrieve_article("37666551", "") + test_string_nolabel = ( + "This work aims to: (1) Provide maximal hand force data on six different " + "grasp types for healthy subjects; (2) detect grasp types with maximal " + "force significantly affected by hand osteoarthritis (HOA) in women; (3) " + "look for predictors to detect HOA from the maximal forces using discriminant " + "analyses." + ) + assert test_string_nolabel in output_nolabel["Summary"] + test_string_withlabel = ( + "OBJECTIVES: To assess across seven hospitals from six different countries " + "the extent to which the COVID-19 pandemic affected the volumes of orthopaedic " + "hospital admissions and patient outcomes for non-COVID-19 patients admitted " + "for orthopaedic care." 
+ ) + assert test_string_withlabel in output_withlabel["Summary"] + + +def test_retrieve_article_no_abstract_available(api_client: PubMedAPIWrapper) -> None: + """Test that returns 'No abstract available'.""" + + output = api_client.retrieve_article("10766884", "") + assert "No abstract available" == output["Summary"] + + def assert_docs(docs: List[Document]) -> None: for doc in docs: assert doc.metadata @@ -87,8 +142,16 @@ def _load_pubmed_from_universal_entry(**kwargs: Any) -> BaseTool: def test_load_pupmed_from_universal_entry() -> None: pubmed_tool = _load_pubmed_from_universal_entry() - output = pubmed_tool("chatgpt") - assert "Performance of ChatGPT on the Situational Judgement Test-A" in output + search_string = ( + "Examining the Validity of ChatGPT in Identifying " + "Relevant Nephrology Literature" + ) + output = pubmed_tool(search_string) + test_string = ( + "Examining the Validity of ChatGPT in Identifying " + "Relevant Nephrology Literature: Findings and Implications" + ) + assert test_string in output def test_load_pupmed_from_universal_entry_with_params() -> None: