From e90654f39bf6c598936770690c82537b16627334 Mon Sep 17 00:00:00 2001
From: Steve Kim
Date: Wed, 17 May 2023 09:26:56 +0900
Subject: [PATCH] Added cleaning up the downloaded PDF files (#4601)

ArxivAPIWrapper searches arXiv and downloads PDFs to extract the related
information, but it never deletes the downloaded files, so they keep
accumulating on the server. A single PDF can be around 28 MB, which is too
much to keep on a small server, so this patch deletes each file once its
content has been extracted.

# Clean up downloaded PDF files
- Changes: added a line that deletes the downloaded PDF after it has been
  processed (see the sketch of the pattern below this list).
- Background: to get the information about an arXiv paper, the
  ArxivAPIWrapper class downloads its PDF. That is a natural approach, but
  the wrapper currently leaves every downloaded PDF on the server.
- Problem: a single PDF is roughly 28 MB, which is too much to keep on a
  small server such as an AWS instance.
- Dependency: import os
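For reference, a minimal sketch of the download-parse-delete pattern this
patch applies (assumes `pip install arxiv pymupdf`; `fetch_summaries` is an
illustrative helper, not part of ArxivAPIWrapper):

```python
import os
from typing import List

import arxiv
import fitz  # PyMuPDF


def fetch_summaries(query: str, max_results: int = 3) -> List[str]:
    """Illustrative helper (not part of ArxivAPIWrapper): download, parse, delete."""
    texts = []
    for result in arxiv.Search(query=query, max_results=max_results).results():
        pdf_path = result.download_pdf()  # writes the PDF into the working directory
        with fitz.open(pdf_path) as doc:
            texts.append("".join(page.get_text() for page in doc))
        os.remove(pdf_path)  # the cleanup step this patch adds
    return texts
```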
Thank you.

---------

Co-authored-by: Dev 2049
---
 langchain/utilities/arxiv.py | 106 +++++++++++++++++------------------
 1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/langchain/utilities/arxiv.py b/langchain/utilities/arxiv.py
index dcf80594bb..82ba07814e 100644
--- a/langchain/utilities/arxiv.py
+++ b/langchain/utilities/arxiv.py
@@ -1,5 +1,6 @@
 """Util that calls Arxiv."""
 import logging
+import os
 from typing import Any, Dict, List
 
 from pydantic import BaseModel, Extra, root_validator
@@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
         It uses only the most informative fields of article meta information.
         """
         try:
-            docs = [
-                f"Published: {result.updated.date()}\nTitle: {result.title}\n"
-                f"Authors: {', '.join(a.name for a in result.authors)}\n"
-                f"Summary: {result.summary}"
-                for result in self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
-            ]
-            return (
-                "\n\n".join(docs)[: self.doc_content_chars_max]
-                if docs
-                else "No good Arxiv Result was found"
-            )
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+            ).results()
         except self.arxiv_exceptions as ex:
             return f"Arxiv exception: {ex}"
+        docs = [
+            f"Published: {result.updated.date()}\nTitle: {result.title}\n"
+            f"Authors: {', '.join(a.name for a in result.authors)}\n"
+            f"Summary: {result.summary}"
+            for result in results
+        ]
+        if docs:
+            return "\n\n".join(docs)[: self.doc_content_chars_max]
+        else:
+            return "No good Arxiv Result was found"
 
     def load(self, query: str) -> List[Document]:
         """
@@ -98,52 +99,51 @@ class ArxivAPIWrapper(BaseModel):
         try:
             import fitz
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "PyMuPDF package not found, please install it with "
                 "`pip install pymupdf`"
             )
 
         try:
-            docs: List[Document] = []
-            for result in self.arxiv_search(  # type: ignore
+            results = self.arxiv_search(  # type: ignore
                 query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-            ).results():
-                try:
-                    doc_file_name: str = result.download_pdf()
-                    with fitz.open(doc_file_name) as doc_file:
-                        text: str = "".join(page.get_text() for page in doc_file)
-                        add_meta = (
-                            {
-                                "entry_id": result.entry_id,
-                                "published_first_time": str(result.published.date()),
-                                "comment": result.comment,
-                                "journal_ref": result.journal_ref,
-                                "doi": result.doi,
-                                "primary_category": result.primary_category,
-                                "categories": result.categories,
-                                "links": [link.href for link in result.links],
-                            }
-                            if self.load_all_available_meta
-                            else {}
-                        )
-                        doc = Document(
-                            page_content=text[: self.doc_content_chars_max],
-                            metadata=(
-                                {
-                                    "Published": str(result.updated.date()),
-                                    "Title": result.title,
-                                    "Authors": ", ".join(
-                                        a.name for a in result.authors
-                                    ),
-                                    "Summary": result.summary,
-                                    **add_meta,
-                                }
-                            ),
-                        )
-                        docs.append(doc)
-                except FileNotFoundError as f_ex:
-                    logger.debug(f_ex)
-            return docs
+            ).results()
         except self.arxiv_exceptions as ex:
             logger.debug("Error on arxiv: %s", ex)
             return []
+
+        docs: List[Document] = []
+        for result in results:
+            try:
+                doc_file_name: str = result.download_pdf()
+                with fitz.open(doc_file_name) as doc_file:
+                    text: str = "".join(page.get_text() for page in doc_file)
+            except FileNotFoundError as f_ex:
+                logger.debug(f_ex)
+                continue
+            if self.load_all_available_meta:
+                extra_metadata = {
+                    "entry_id": result.entry_id,
+                    "published_first_time": str(result.published.date()),
+                    "comment": result.comment,
+                    "journal_ref": result.journal_ref,
+                    "doi": result.doi,
+                    "primary_category": result.primary_category,
+                    "categories": result.categories,
+                    "links": [link.href for link in result.links],
+                }
+            else:
+                extra_metadata = {}
+            metadata = {
+                "Published": str(result.updated.date()),
+                "Title": result.title,
+                "Authors": ", ".join(a.name for a in result.authors),
+                "Summary": result.summary,
+                **extra_metadata,
+            }
+            doc = Document(
+                page_content=text[: self.doc_content_chars_max], metadata=metadata
+            )
+            docs.append(doc)
+            os.remove(doc_file_name)
+        return docs
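A quick way to confirm the cleanup after this change (a hedged sketch, assuming
`langchain`, `arxiv`, and `pymupdf` are installed and the PDFs are downloaded
into the current working directory):

```python
import glob

from langchain.utilities.arxiv import ArxivAPIWrapper

wrapper = ArxivAPIWrapper(load_max_docs=2)
docs = wrapper.load("quantum computing")

print(f"loaded {len(docs)} documents")
# With this patch applied, no downloaded PDFs should remain on disk.
print("leftover PDFs:", glob.glob("*.pdf"))
```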