Added cleanup of the downloaded PDF files (#4601)

ArxivAPIWrapper searches arXiv and downloads PDFs to get the related information,
but I found that it never deletes the downloaded files. This is a problem because
the PDFs pile up on the server, and a single file can be around 28 MB.
So I added a line that deletes each file once it has been processed, since the
files are too large to keep on the server.

# Clean up downloaded PDF files
- Changes: added a line that deletes each downloaded PDF once its text has been extracted (see the sketch below)
- Background: to get the information from an arXiv paper, the ArxivAPIWrapper class downloads its PDF.
That is a natural approach, but the wrapper ends up retaining many PDF files on the server.
- Problem: a single PDF is about 28 MB, which is too much to keep on a small server such as an AWS instance.
- Dependency: import os
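
For illustration, here is a minimal sketch of the pattern this change introduces. The standalone helper and the direct use of the `arxiv` and `fitz` packages are illustrative only; the actual change simply calls `os.remove(doc_file_name)` inside `ArxivAPIWrapper.load` after the `Document` has been built, as the diff below shows.

```python
import os

import arxiv  # the package that ArxivAPIWrapper wraps
import fitz   # PyMuPDF, used by the wrapper to read the downloaded PDF


def extract_text_and_clean_up(result: arxiv.Result) -> str:
    """Download a paper's PDF, extract its text, then delete the file."""
    doc_file_name = result.download_pdf()
    try:
        with fitz.open(doc_file_name) as doc_file:
            text = "".join(page.get_text() for page in doc_file)
    finally:
        # each PDF can be ~28 MB, so remove it instead of letting it pile up
        os.remove(doc_file_name)
    return text
```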

Thank you.

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
commit e90654f39b (parent 6fbd5e837f) on pull/4655/head, authored by Steve Kim and committed by GitHub

@@ -1,5 +1,6 @@
 """Util that calls Arxiv."""
 import logging
+import os
 from typing import Any, Dict, List
 
 from pydantic import BaseModel, Extra, root_validator
@@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
         It uses only the most informative fields of article meta information.
         """
         try:
-            docs = [
-                f"Published: {result.updated.date()}\nTitle: {result.title}\n"
-                f"Authors: {', '.join(a.name for a in result.authors)}\n"
-                f"Summary: {result.summary}"
-                for result in self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
-            ]
-            return (
-                "\n\n".join(docs)[: self.doc_content_chars_max]
-                if docs
-                else "No good Arxiv Result was found"
-            )
-        except self.arxiv_exceptions as ex:
-            return f"Arxiv exception: {ex}"
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+            ).results()
+        except self.arxiv_exceptions as ex:
+            return f"Arxiv exception: {ex}"
+        docs = [
+            f"Published: {result.updated.date()}\nTitle: {result.title}\n"
+            f"Authors: {', '.join(a.name for a in result.authors)}\n"
+            f"Summary: {result.summary}"
+            for result in results
+        ]
+        if docs:
+            return "\n\n".join(docs)[: self.doc_content_chars_max]
+        else:
+            return "No good Arxiv Result was found"
 
     def load(self, query: str) -> List[Document]:
         """
@@ -98,22 +99,30 @@ class ArxivAPIWrapper(BaseModel):
         try:
             import fitz
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "PyMuPDF package not found, please install it with "
                 "`pip install pymupdf`"
             )
 
         try:
-            docs: List[Document] = []
-            for result in self.arxiv_search(  # type: ignore
+            results = self.arxiv_search(  # type: ignore
                 query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-            ).results():
-                try:
-                    doc_file_name: str = result.download_pdf()
-                    with fitz.open(doc_file_name) as doc_file:
-                        text: str = "".join(page.get_text() for page in doc_file)
-                    add_meta = (
-                        {
-                            "entry_id": result.entry_id,
-                            "published_first_time": str(result.published.date()),
-                            "comment": result.comment,
+            ).results()
+        except self.arxiv_exceptions as ex:
+            logger.debug("Error on arxiv: %s", ex)
+            return []
+
+        docs: List[Document] = []
+        for result in results:
+            try:
+                doc_file_name: str = result.download_pdf()
+                with fitz.open(doc_file_name) as doc_file:
+                    text: str = "".join(page.get_text() for page in doc_file)
+            except FileNotFoundError as f_ex:
+                logger.debug(f_ex)
+                continue
+            if self.load_all_available_meta:
+                extra_metadata = {
+                    "entry_id": result.entry_id,
+                    "published_first_time": str(result.published.date()),
+                    "comment": result.comment,
@@ -123,27 +132,18 @@ class ArxivAPIWrapper(BaseModel):
-                            "categories": result.categories,
-                            "links": [link.href for link in result.links],
-                        }
-                        if self.load_all_available_meta
-                        else {}
-                    )
-                    doc = Document(
-                        page_content=text[: self.doc_content_chars_max],
-                        metadata=(
-                            {
-                                "Published": str(result.updated.date()),
-                                "Title": result.title,
-                                "Authors": ", ".join(
-                                    a.name for a in result.authors
-                                ),
-                                "Summary": result.summary,
-                                **add_meta,
-                            }
-                        ),
-                    )
-                    docs.append(doc)
-                except FileNotFoundError as f_ex:
-                    logger.debug(f_ex)
-            return docs
-        except self.arxiv_exceptions as ex:
-            logger.debug("Error on arxiv: %s", ex)
-            return []
+                    "categories": result.categories,
+                    "links": [link.href for link in result.links],
+                }
+            else:
+                extra_metadata = {}
+            metadata = {
+                "Published": str(result.updated.date()),
+                "Title": result.title,
+                "Authors": ", ".join(a.name for a in result.authors),
+                "Summary": result.summary,
+                **extra_metadata,
+            }
+            doc = Document(
+                page_content=text[: self.doc_content_chars_max], metadata=metadata
+            )
+            docs.append(doc)
+            os.remove(doc_file_name)
+        return docs
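
For reference, a rough usage sketch (the import path below is an assumption and may differ between langchain versions); after this change, `load` should leave no downloaded PDF files behind in the working directory:

```python
from langchain.utilities.arxiv import ArxivAPIWrapper  # import path is an assumption

wrapper = ArxivAPIWrapper(load_max_docs=2)
docs = wrapper.load("large language models")
# The PDFs fetched during load() are deleted right after text extraction,
# so no leftover *.pdf files should remain in the current directory.
for doc in docs:
    print(doc.metadata["Title"])
```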
