From e90654f39bf6c598936770690c82537b16627334 Mon Sep 17 00:00:00 2001
From: Steve Kim
Date: Wed, 17 May 2023 09:26:56 +0900
Subject: [PATCH] Added cleaning up the downloaded PDF files (#4601)

ArxivAPIWrapper searches arXiv and downloads PDFs to extract the related
information, but it never deletes the downloaded files, so they keep
accumulating on the server. A single PDF can be around 28 MB, which is too
much to keep on a small server, so this patch deletes each file once its
content has been extracted.

# Clean up downloaded PDF files
- Changes: added a line that deletes the downloaded PDF after it has been
  processed (see the sketch of the pattern below this list).
- Background: to get the information about an arXiv paper, the
  ArxivAPIWrapper class downloads its PDF. That is a natural approach, but
  the wrapper currently leaves every downloaded PDF on the server.
- Problem: a single PDF is roughly 28 MB, which is too much to keep on a
  small server such as an AWS instance.
- Dependency: import os
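For reference, a minimal sketch of the download-parse-delete pattern this
patch applies (assumes `pip install arxiv pymupdf`; `fetch_summaries` is an
illustrative helper, not part of ArxivAPIWrapper):

```python
import os
from typing import List

import arxiv
import fitz  # PyMuPDF


def fetch_summaries(query: str, max_results: int = 3) -> List[str]:
    """Illustrative helper (not part of ArxivAPIWrapper): download, parse, delete."""
    texts = []
    for result in arxiv.Search(query=query, max_results=max_results).results():
        pdf_path = result.download_pdf()  # writes the PDF into the working directory
        with fitz.open(pdf_path) as doc:
            texts.append("".join(page.get_text() for page in doc))
        os.remove(pdf_path)  # the cleanup step this patch adds
    return texts
```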
Thank you.

---------

Co-authored-by: Dev 2049
---
 langchain/utilities/arxiv.py | 106 +++++++++++++++++------------------
 1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/langchain/utilities/arxiv.py b/langchain/utilities/arxiv.py
index dcf80594bb..82ba07814e 100644
--- a/langchain/utilities/arxiv.py
+++ b/langchain/utilities/arxiv.py
@@ -1,5 +1,6 @@
 """Util that calls Arxiv."""
 import logging
+import os
 from typing import Any, Dict, List
 
 from pydantic import BaseModel, Extra, root_validator
@@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
         It uses only the most informative fields of article meta information.
         """
         try:
-            docs = [
-                f"Published: {result.updated.date()}\nTitle: {result.title}\n"
-                f"Authors: {', '.join(a.name for a in result.authors)}\n"
-                f"Summary: {result.summary}"
-                for result in self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
-            ]
-            return (
-                "\n\n".join(docs)[: self.doc_content_chars_max]
-                if docs
-                else "No good Arxiv Result was found"
-            )
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+            ).results()
         except self.arxiv_exceptions as ex:
             return f"Arxiv exception: {ex}"
+        docs = [
+            f"Published: {result.updated.date()}\nTitle: {result.title}\n"
+            f"Authors: {', '.join(a.name for a in result.authors)}\n"
+            f"Summary: {result.summary}"
+            for result in results
+        ]
+        if docs:
+            return "\n\n".join(docs)[: self.doc_content_chars_max]
+        else:
+            return "No good Arxiv Result was found"
 
     def load(self, query: str) -> List[Document]:
         """
@@ -98,52 +99,51 @@ class ArxivAPIWrapper(BaseModel):
         try:
             import fitz
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "PyMuPDF package not found, please install it with "
                 "`pip install pymupdf`"
             )
 
         try:
-            docs: List[Document] = []
-            for result in self.arxiv_search(  # type: ignore
+            results = self.arxiv_search(  # type: ignore
                 query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-            ).results():
-                try:
-                    doc_file_name: str = result.download_pdf()
-                    with fitz.open(doc_file_name) as doc_file:
-                        text: str = "".join(page.get_text() for page in doc_file)
-                        add_meta = (
-                            {
-                                "entry_id": result.entry_id,
-                                "published_first_time": str(result.published.date()),
-                                "comment": result.comment,
-                                "journal_ref": result.journal_ref,
-                                "doi": result.doi,
-                                "primary_category": result.primary_category,
-                                "categories": result.categories,
-                                "links": [link.href for link in result.links],
-                            }
-                            if self.load_all_available_meta
-                            else {}
-                        )
-                        doc = Document(
-                            page_content=text[: self.doc_content_chars_max],
-                            metadata=(
-                                {
-                                    "Published": str(result.updated.date()),
-                                    "Title": result.title,
-                                    "Authors": ", ".join(
-                                        a.name for a in result.authors
-                                    ),
-                                    "Summary": result.summary,
-                                    **add_meta,
-                                }
-                            ),
-                        )
-                        docs.append(doc)
-                except FileNotFoundError as f_ex:
-                    logger.debug(f_ex)
-            return docs
+            ).results()
         except self.arxiv_exceptions as ex:
             logger.debug("Error on arxiv: %s", ex)
             return []
+
+        docs: List[Document] = []
+        for result in results:
+            try:
+                doc_file_name: str = result.download_pdf()
+                with fitz.open(doc_file_name) as doc_file:
+                    text: str = "".join(page.get_text() for page in doc_file)
+            except FileNotFoundError as f_ex:
+                logger.debug(f_ex)
+                continue
+            if self.load_all_available_meta:
+                extra_metadata = {
+                    "entry_id": result.entry_id,
+                    "published_first_time": str(result.published.date()),
+                    "comment": result.comment,
+                    "journal_ref": result.journal_ref,
+                    "doi": result.doi,
+                    "primary_category": result.primary_category,
+                    "categories": result.categories,
+                    "links": [link.href for link in result.links],
+                }
+            else:
+                extra_metadata = {}
+            metadata = {
+                "Published": str(result.updated.date()),
+                "Title": result.title,
+                "Authors": ", ".join(a.name for a in result.authors),
+                "Summary": result.summary,
+                **extra_metadata,
+            }
+            doc = Document(
+                page_content=text[: self.doc_content_chars_max], metadata=metadata
+            )
+            docs.append(doc)
+            os.remove(doc_file_name)
+        return docs
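A quick way to confirm the cleanup after this change (a hedged sketch, assuming
`langchain`, `arxiv`, and `pymupdf` are installed and the PDFs are downloaded
into the current working directory):

```python
import glob

from langchain.utilities.arxiv import ArxivAPIWrapper

wrapper = ArxivAPIWrapper(load_max_docs=2)
docs = wrapper.load("quantum computing")

print(f"loaded {len(docs)} documents")
# With this patch applied, no downloaded PDFs should remain on disk.
print("leftover PDFs:", glob.glob("*.pdf"))
```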