Added cleanup of the downloaded PDF files (#4601)

ArxivAPIWrapper searches arXiv and downloads PDFs to get the related information,
but I found that it never deletes the downloaded files. This is a problem because
the PDFs pile up on the server, and a single file can be around 28 MB.
So I added a line that deletes each file once it has been processed, since the
files are too large to keep on the server.

# Clean up downloaded PDF files
- Changes: added a line that deletes each downloaded PDF once its text has been extracted (see the sketch below)
- Background: to get the information from an arXiv paper, the ArxivAPIWrapper class downloads its PDF.
That is a natural approach, but the wrapper ends up retaining many PDF files on the server.
- Problem: a single PDF is about 28 MB, which is too much to keep on a small server such as an AWS instance.
- Dependency: import os
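
For illustration, here is a minimal sketch of the pattern this change introduces. The standalone helper and the direct use of the `arxiv` and `fitz` packages are illustrative only; the actual change simply calls `os.remove(doc_file_name)` inside `ArxivAPIWrapper.load` after the `Document` has been built, as the diff below shows.

```python
import os

import arxiv  # the package that ArxivAPIWrapper wraps
import fitz   # PyMuPDF, used by the wrapper to read the downloaded PDF


def extract_text_and_clean_up(result: arxiv.Result) -> str:
    """Download a paper's PDF, extract its text, then delete the file."""
    doc_file_name = result.download_pdf()
    try:
        with fitz.open(doc_file_name) as doc_file:
            text = "".join(page.get_text() for page in doc_file)
    finally:
        # each PDF can be ~28 MB, so remove it instead of letting it pile up
        os.remove(doc_file_name)
    return text
```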

Thank you.

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
commit e90654f39b (parent 6fbd5e837f) on pull/4655/head, authored by Steve Kim and committed by GitHub

@@ -1,5 +1,6 @@
 """Util that calls Arxiv."""
 import logging
+import os
 from typing import Any, Dict, List
 
 from pydantic import BaseModel, Extra, root_validator
@@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
         It uses only the most informative fields of article meta information.
         """
         try:
-            docs = [
-                f"Published: {result.updated.date()}\nTitle: {result.title}\n"
-                f"Authors: {', '.join(a.name for a in result.authors)}\n"
-                f"Summary: {result.summary}"
-                for result in self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
-            ]
-            return (
-                "\n\n".join(docs)[: self.doc_content_chars_max]
-                if docs
-                else "No good Arxiv Result was found"
-            )
-        except self.arxiv_exceptions as ex:
-            return f"Arxiv exception: {ex}"
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+            ).results()
+        except self.arxiv_exceptions as ex:
+            return f"Arxiv exception: {ex}"
+        docs = [
+            f"Published: {result.updated.date()}\nTitle: {result.title}\n"
+            f"Authors: {', '.join(a.name for a in result.authors)}\n"
+            f"Summary: {result.summary}"
+            for result in results
+        ]
+        if docs:
+            return "\n\n".join(docs)[: self.doc_content_chars_max]
+        else:
+            return "No good Arxiv Result was found"
 
     def load(self, query: str) -> List[Document]:
         """
@@ -98,22 +99,30 @@ class ArxivAPIWrapper(BaseModel):
         try:
             import fitz
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "PyMuPDF package not found, please install it with "
                 "`pip install pymupdf`"
             )
 
         try:
-            docs: List[Document] = []
-            for result in self.arxiv_search(  # type: ignore
+            results = self.arxiv_search(  # type: ignore
                 query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-            ).results():
-                try:
-                    doc_file_name: str = result.download_pdf()
-                    with fitz.open(doc_file_name) as doc_file:
-                        text: str = "".join(page.get_text() for page in doc_file)
-                    add_meta = (
-                        {
-                            "entry_id": result.entry_id,
-                            "published_first_time": str(result.published.date()),
-                            "comment": result.comment,
+            ).results()
+        except self.arxiv_exceptions as ex:
+            logger.debug("Error on arxiv: %s", ex)
+            return []
+
+        docs: List[Document] = []
+        for result in results:
+            try:
+                doc_file_name: str = result.download_pdf()
+                with fitz.open(doc_file_name) as doc_file:
+                    text: str = "".join(page.get_text() for page in doc_file)
+            except FileNotFoundError as f_ex:
+                logger.debug(f_ex)
+                continue
+            if self.load_all_available_meta:
+                extra_metadata = {
+                    "entry_id": result.entry_id,
+                    "published_first_time": str(result.published.date()),
+                    "comment": result.comment,
@@ -123,27 +132,18 @@ class ArxivAPIWrapper(BaseModel):
-                            "categories": result.categories,
-                            "links": [link.href for link in result.links],
-                        }
-                        if self.load_all_available_meta
-                        else {}
-                    )
-                    doc = Document(
-                        page_content=text[: self.doc_content_chars_max],
-                        metadata=(
-                            {
-                                "Published": str(result.updated.date()),
-                                "Title": result.title,
-                                "Authors": ", ".join(
-                                    a.name for a in result.authors
-                                ),
-                                "Summary": result.summary,
-                                **add_meta,
-                            }
-                        ),
-                    )
-                    docs.append(doc)
-                except FileNotFoundError as f_ex:
-                    logger.debug(f_ex)
-            return docs
-        except self.arxiv_exceptions as ex:
-            logger.debug("Error on arxiv: %s", ex)
-            return []
+                    "categories": result.categories,
+                    "links": [link.href for link in result.links],
+                }
+            else:
+                extra_metadata = {}
+            metadata = {
+                "Published": str(result.updated.date()),
+                "Title": result.title,
+                "Authors": ", ".join(a.name for a in result.authors),
+                "Summary": result.summary,
+                **extra_metadata,
+            }
+            doc = Document(
+                page_content=text[: self.doc_content_chars_max], metadata=metadata
+            )
+            docs.append(doc)
+            os.remove(doc_file_name)
+        return docs
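
For reference, a rough usage sketch (the import path below is an assumption and may differ between langchain versions); after this change, `load` should leave no downloaded PDF files behind in the working directory:

```python
from langchain.utilities.arxiv import ArxivAPIWrapper  # import path is an assumption

wrapper = ArxivAPIWrapper(load_max_docs=2)
docs = wrapper.load("large language models")
# The PDFs fetched during load() are deleted right after text extraction,
# so no leftover *.pdf files should remain in the current directory.
for doc in docs:
    print(doc.metadata["Title"])
```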
