Extract abstracts from PubMed articles, even if they have no extra label (#10245)

### Description
This pull request involves modifications to the extraction method for
abstracts/summaries within the PubMed utility. A condition has been
added to verify the presence of unlabeled abstracts. Now an abstract
will be extracted even if it does not have a subtitle. In addition, the
extraction of the abstract was extended to books.

### Issue
The PubMed utility occasionally returns an empty result when extracting
abstracts from articles, despite the presence of an abstract for the
paper on PubMed. This issue arises due to the varying structure of
articles; some articles follow a "subtitle/label: text" format, while
others do not include subtitles in their abstracts. An example of the
latter case can be found at:
[https://pubmed.ncbi.nlm.nih.gov/37666905/](https://pubmed.ncbi.nlm.nih.gov/37666905/)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/11493/head
Florian 10 months ago committed by GitHub
parent fd9da60aea
commit d78f418c0d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -158,15 +158,30 @@ class PubMedAPIWrapper(BaseModel):
return self._parse_article(uid, text_dict)
def _parse_article(self, uid: str, text_dict: dict) -> dict:
ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
"Article"
]
summary = "\n".join(
[
f"{txt['@Label']}: {txt['#text']}"
for txt in ar.get("Abstract", {}).get("AbstractText", [])
if "#text" in txt and "@Label" in txt
try:
ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
"Article"
]
except KeyError:
ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"]
abstract_text = ar.get("Abstract", {}).get("AbstractText", [])
summaries = [
f"{txt['@Label']}: {txt['#text']}"
for txt in abstract_text
if "#text" in txt and "@Label" in txt
]
summary = (
"\n".join(summaries)
if summaries
else (
abstract_text
if isinstance(abstract_text, str)
else (
"\n".join(str(value) for value in abstract_text.values())
if isinstance(abstract_text, dict)
else "No abstract available"
)
)
)
a_d = ar.get("ArticleDate", {})
pub_date = "-".join(

@ -20,8 +20,16 @@ def api_client() -> PubMedAPIWrapper:
def test_run_success(api_client: PubMedAPIWrapper) -> None:
    """Test that returns the correct answer

    Runs a query through ``api_client.run`` and checks that an expected
    string appears in the output and that the output length equals
    ``api_client.doc_content_chars_max``.
    """
    # NOTE(review): this first query/assert pair looks like the pre-change
    # version of the test shown next to its replacement below -- confirm
    # whether both pairs are meant to run.
    output = api_client.run("chatgpt")
    assert "Performance of ChatGPT on the Situational Judgement Test-A" in output
    # Query with a shortened form of the string that is asserted on below.
    search_string = (
        "Examining the Validity of ChatGPT in Identifying "
        "Relevant Nephrology Literature"
    )
    output = api_client.run(search_string)
    # Longer string expected in the output (presumably the article's full
    # title -- verify against PubMed).
    test_string = (
        "Examining the Validity of ChatGPT in Identifying "
        "Relevant Nephrology Literature: Findings and Implications"
    )
    assert test_string in output
    # The output is expected to be exactly as long as the configured limit.
    assert len(output) == api_client.doc_content_chars_max
@ -32,6 +40,53 @@ def test_run_returns_no_result(api_client: PubMedAPIWrapper) -> None:
assert "No good PubMed Result was found" == output
def test_retrieve_article_returns_book_abstract(api_client: PubMedAPIWrapper) -> None:
    """Check that ``retrieve_article`` returns an excerpt for book entries.

    Two records are fetched, one treated as the no-label case and one as the
    with-label case, and each "Summary" must contain its expected excerpt.
    """
    book_without_label = api_client.retrieve_article("25905357", "")
    book_with_label = api_client.retrieve_article("29262144", "")

    expected_without_label = (
        "Osteoporosis is a multifactorial disorder associated with low bone mass and "
        "enhanced skeletal fragility. Although"
    )
    expected_with_label = (
        "Wallenberg syndrome was first described in 1808 by Gaspard Vieusseux. However,"
    )

    assert expected_without_label in book_without_label["Summary"]
    assert expected_with_label in book_with_label["Summary"]
def test_retrieve_article_returns_article_abstract(
    api_client: PubMedAPIWrapper,
) -> None:
    """Check that ``retrieve_article`` returns the abstract of an article.

    Two records are fetched, one treated as the no-label case and one as the
    with-label case, and each "Summary" must contain its expected text.
    """
    article_without_label = api_client.retrieve_article("37666905", "")
    article_with_label = api_client.retrieve_article("37666551", "")

    expected_without_label = (
        "This work aims to: (1) Provide maximal hand force data on six different "
        "grasp types for healthy subjects; (2) detect grasp types with maximal "
        "force significantly affected by hand osteoarthritis (HOA) in women; (3) "
        "look for predictors to detect HOA from the maximal forces using discriminant "
        "analyses."
    )
    expected_with_label = (
        "OBJECTIVES: To assess across seven hospitals from six different countries "
        "the extent to which the COVID-19 pandemic affected the volumes of orthopaedic "
        "hospital admissions and patient outcomes for non-COVID-19 patients admitted "
        "for orthopaedic care."
    )

    assert expected_without_label in article_without_label["Summary"]
    assert expected_with_label in article_with_label["Summary"]
def test_retrieve_article_no_abstract_available(api_client: PubMedAPIWrapper) -> None:
    """Check that the "Summary" of record 10766884 is 'No abstract available'."""
    article = api_client.retrieve_article("10766884", "")
    summary = article["Summary"]
    assert summary == "No abstract available"
def assert_docs(docs: List[Document]) -> None:
for doc in docs:
assert doc.metadata
@ -87,8 +142,16 @@ def _load_pubmed_from_universal_entry(**kwargs: Any) -> BaseTool:
def test_load_pupmed_from_universal_entry() -> None:
    """Check that the tool built by ``_load_pubmed_from_universal_entry``
    returns output containing an expected string for a query."""
    pubmed_tool = _load_pubmed_from_universal_entry()
    # NOTE(review): this first query/assert pair looks like the pre-change
    # version of the test shown next to its replacement below -- confirm
    # whether both pairs are meant to run.
    output = pubmed_tool("chatgpt")
    assert "Performance of ChatGPT on the Situational Judgement Test-A" in output
    # Query with a shortened form of the string that is asserted on below.
    search_string = (
        "Examining the Validity of ChatGPT in Identifying "
        "Relevant Nephrology Literature"
    )
    output = pubmed_tool(search_string)
    # Longer string expected in the output (presumably the article's full
    # title -- verify against PubMed).
    test_string = (
        "Examining the Validity of ChatGPT in Identifying "
        "Relevant Nephrology Literature: Findings and Implications"
    )
    assert test_string in output
def test_load_pupmed_from_universal_entry_with_params() -> None:

Loading…
Cancel
Save