Added cleanup of the downloaded PDF files (#4601)

ArxivAPIWrapper searches arXiv and downloads PDFs to extract the article information, but it never deletes the downloaded files. This is a problem because the PDFs accumulate on the server, and a single file can be around 28 MB. So I added a line that deletes each downloaded file once it has been processed, since the files are too large to keep on the server.

# Clean up downloaded PDF files
- Changes: Added a line that deletes the downloaded PDF file once it has been read (see the sketch after this list).
- Background: To get the information from an arXiv paper, the ArxivAPIWrapper class downloads its PDF. This is a natural approach, but the wrapper leaves every downloaded PDF on the server.
- Problem: A single PDF is about 28 MB, which is too much to keep on a small server such as an AWS instance.
- Dependency: `import os`
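
To illustrate the pattern outside the wrapper, here is a minimal sketch of the download/extract/delete flow (the query and `max_results` are arbitrary; it assumes the `arxiv` and `pymupdf` packages that ArxivAPIWrapper already relies on):

```python
import os

import arxiv  # the client library that ArxivAPIWrapper wraps
import fitz  # PyMuPDF, used to read the downloaded PDF

search = arxiv.Search(query="electron", max_results=1)
for result in search.results():
    doc_file_name = result.download_pdf()  # writes a PDF (often tens of MB) to the cwd
    with fitz.open(doc_file_name) as doc_file:
        text = "".join(page.get_text() for page in doc_file)
    print(text[:200])
    os.remove(doc_file_name)  # the added cleanup step: no PDF is left behind
```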

Thank you.

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
Steve Kim committed via GitHub (commit e90654f39b, parent 6fbd5e837f)

@@ -1,5 +1,6 @@
 """Util that calls Arxiv."""
 import logging
+import os
 from typing import Any, Dict, List
 
 from pydantic import BaseModel, Extra, root_validator
@@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
         It uses only the most informative fields of article meta information.
         """
         try:
-            docs = [
-                f"Published: {result.updated.date()}\nTitle: {result.title}\n"
-                f"Authors: {', '.join(a.name for a in result.authors)}\n"
-                f"Summary: {result.summary}"
-                for result in self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
-            ]
-            return (
-                "\n\n".join(docs)[: self.doc_content_chars_max]
-                if docs
-                else "No good Arxiv Result was found"
-            )
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+            ).results()
         except self.arxiv_exceptions as ex:
             return f"Arxiv exception: {ex}"
+        docs = [
+            f"Published: {result.updated.date()}\nTitle: {result.title}\n"
+            f"Authors: {', '.join(a.name for a in result.authors)}\n"
+            f"Summary: {result.summary}"
+            for result in results
+        ]
+        if docs:
+            return "\n\n".join(docs)[: self.doc_content_chars_max]
+        else:
+            return "No good Arxiv Result was found"
 
     def load(self, query: str) -> List[Document]:
         """
@@ -98,52 +99,51 @@ class ArxivAPIWrapper(BaseModel):
         try:
             import fitz
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "PyMuPDF package not found, please install it with "
                 "`pip install pymupdf`"
             )
 
         try:
-            docs: List[Document] = []
-            for result in self.arxiv_search(  # type: ignore
+            results = self.arxiv_search(  # type: ignore
                 query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-            ).results():
-                try:
-                    doc_file_name: str = result.download_pdf()
-                    with fitz.open(doc_file_name) as doc_file:
-                        text: str = "".join(page.get_text() for page in doc_file)
-                        add_meta = (
-                            {
-                                "entry_id": result.entry_id,
-                                "published_first_time": str(result.published.date()),
-                                "comment": result.comment,
-                                "journal_ref": result.journal_ref,
-                                "doi": result.doi,
-                                "primary_category": result.primary_category,
-                                "categories": result.categories,
-                                "links": [link.href for link in result.links],
-                            }
-                            if self.load_all_available_meta
-                            else {}
-                        )
-                        doc = Document(
-                            page_content=text[: self.doc_content_chars_max],
-                            metadata=(
-                                {
-                                    "Published": str(result.updated.date()),
-                                    "Title": result.title,
-                                    "Authors": ", ".join(
-                                        a.name for a in result.authors
-                                    ),
-                                    "Summary": result.summary,
-                                    **add_meta,
-                                }
-                            ),
-                        )
-                        docs.append(doc)
-                except FileNotFoundError as f_ex:
-                    logger.debug(f_ex)
-            return docs
+            ).results()
         except self.arxiv_exceptions as ex:
             logger.debug("Error on arxiv: %s", ex)
             return []
+
+        docs: List[Document] = []
+        for result in results:
+            try:
+                doc_file_name: str = result.download_pdf()
+                with fitz.open(doc_file_name) as doc_file:
+                    text: str = "".join(page.get_text() for page in doc_file)
+            except FileNotFoundError as f_ex:
+                logger.debug(f_ex)
+                continue
+            if self.load_all_available_meta:
+                extra_metadata = {
+                    "entry_id": result.entry_id,
+                    "published_first_time": str(result.published.date()),
+                    "comment": result.comment,
+                    "journal_ref": result.journal_ref,
+                    "doi": result.doi,
+                    "primary_category": result.primary_category,
+                    "categories": result.categories,
+                    "links": [link.href for link in result.links],
+                }
+            else:
+                extra_metadata = {}
+            metadata = {
+                "Published": str(result.updated.date()),
+                "Title": result.title,
+                "Authors": ", ".join(a.name for a in result.authors),
+                "Summary": result.summary,
+                **extra_metadata,
+            }
+            doc = Document(
+                page_content=text[: self.doc_content_chars_max], metadata=metadata
+            )
+            docs.append(doc)
+            os.remove(doc_file_name)
+        return docs
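
For reference, a short usage sketch of the patched wrapper (the module path and the `load_max_docs` field are assumed to match the langchain layout at the time of this PR; the query is arbitrary):

```python
from langchain.utilities.arxiv import ArxivAPIWrapper

wrapper = ArxivAPIWrapper(load_max_docs=2)
docs = wrapper.load("heavy-ion collisions")
for doc in docs:
    print(doc.metadata["Title"])
# By the time load() returns, each downloaded PDF has already been removed,
# so no large files accumulate in the working directory.
```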
