Added cleanup of the downloaded PDF files (#4601)

ArxivAPIWrapper searches arXiv and downloads PDFs to extract the article information, but it never deletes the downloaded files. This is a problem because the PDFs accumulate on the server, and a single file can be around 28 MB. So I added a line that deletes each downloaded file once it has been processed, since the files are too large to keep on the server.

# Clean up downloaded PDF files
- Changes: Added a line that deletes the downloaded PDF file once it has been read (see the sketch after this list).
- Background: To get the information from an arXiv paper, the ArxivAPIWrapper class downloads its PDF. This is a natural approach, but the wrapper leaves every downloaded PDF on the server.
- Problem: A single PDF is about 28 MB, which is too much to keep on a small server such as an AWS instance.
- Dependency: `import os`
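
To illustrate the pattern outside the wrapper, here is a minimal sketch of the download/extract/delete flow (the query and `max_results` are arbitrary; it assumes the `arxiv` and `pymupdf` packages that ArxivAPIWrapper already relies on):

```python
import os

import arxiv  # the client library that ArxivAPIWrapper wraps
import fitz  # PyMuPDF, used to read the downloaded PDF

search = arxiv.Search(query="electron", max_results=1)
for result in search.results():
    doc_file_name = result.download_pdf()  # writes a PDF (often tens of MB) to the cwd
    with fitz.open(doc_file_name) as doc_file:
        text = "".join(page.get_text() for page in doc_file)
    print(text[:200])
    os.remove(doc_file_name)  # the added cleanup step: no PDF is left behind
```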

Thank you.

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
Steve Kim committed via GitHub (commit e90654f39b, parent 6fbd5e837f)

@@ -1,5 +1,6 @@
 """Util that calls Arxiv."""
 import logging
+import os
 from typing import Any, Dict, List
 
 from pydantic import BaseModel, Extra, root_validator
@@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
         It uses only the most informative fields of article meta information.
         """
         try:
-            docs = [
-                f"Published: {result.updated.date()}\nTitle: {result.title}\n"
-                f"Authors: {', '.join(a.name for a in result.authors)}\n"
-                f"Summary: {result.summary}"
-                for result in self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
-            ]
-            return (
-                "\n\n".join(docs)[: self.doc_content_chars_max]
-                if docs
-                else "No good Arxiv Result was found"
-            )
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+            ).results()
         except self.arxiv_exceptions as ex:
             return f"Arxiv exception: {ex}"
+        docs = [
+            f"Published: {result.updated.date()}\nTitle: {result.title}\n"
+            f"Authors: {', '.join(a.name for a in result.authors)}\n"
+            f"Summary: {result.summary}"
+            for result in results
+        ]
+        if docs:
+            return "\n\n".join(docs)[: self.doc_content_chars_max]
+        else:
+            return "No good Arxiv Result was found"
 
     def load(self, query: str) -> List[Document]:
         """
@@ -98,52 +99,51 @@ class ArxivAPIWrapper(BaseModel):
         try:
             import fitz
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "PyMuPDF package not found, please install it with "
                 "`pip install pymupdf`"
             )
 
         try:
-            docs: List[Document] = []
-            for result in self.arxiv_search(  # type: ignore
+            results = self.arxiv_search(  # type: ignore
                 query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-            ).results():
-                try:
-                    doc_file_name: str = result.download_pdf()
-                    with fitz.open(doc_file_name) as doc_file:
-                        text: str = "".join(page.get_text() for page in doc_file)
-                        add_meta = (
-                            {
-                                "entry_id": result.entry_id,
-                                "published_first_time": str(result.published.date()),
-                                "comment": result.comment,
-                                "journal_ref": result.journal_ref,
-                                "doi": result.doi,
-                                "primary_category": result.primary_category,
-                                "categories": result.categories,
-                                "links": [link.href for link in result.links],
-                            }
-                            if self.load_all_available_meta
-                            else {}
-                        )
-                        doc = Document(
-                            page_content=text[: self.doc_content_chars_max],
-                            metadata=(
-                                {
-                                    "Published": str(result.updated.date()),
-                                    "Title": result.title,
-                                    "Authors": ", ".join(
-                                        a.name for a in result.authors
-                                    ),
-                                    "Summary": result.summary,
-                                    **add_meta,
-                                }
-                            ),
-                        )
-                        docs.append(doc)
-                except FileNotFoundError as f_ex:
-                    logger.debug(f_ex)
-            return docs
+            ).results()
         except self.arxiv_exceptions as ex:
             logger.debug("Error on arxiv: %s", ex)
             return []
+
+        docs: List[Document] = []
+        for result in results:
+            try:
+                doc_file_name: str = result.download_pdf()
+                with fitz.open(doc_file_name) as doc_file:
+                    text: str = "".join(page.get_text() for page in doc_file)
+            except FileNotFoundError as f_ex:
+                logger.debug(f_ex)
+                continue
+            if self.load_all_available_meta:
+                extra_metadata = {
+                    "entry_id": result.entry_id,
+                    "published_first_time": str(result.published.date()),
+                    "comment": result.comment,
+                    "journal_ref": result.journal_ref,
+                    "doi": result.doi,
+                    "primary_category": result.primary_category,
+                    "categories": result.categories,
+                    "links": [link.href for link in result.links],
+                }
+            else:
+                extra_metadata = {}
+            metadata = {
+                "Published": str(result.updated.date()),
+                "Title": result.title,
+                "Authors": ", ".join(a.name for a in result.authors),
+                "Summary": result.summary,
+                **extra_metadata,
+            }
+            doc = Document(
+                page_content=text[: self.doc_content_chars_max], metadata=metadata
+            )
+            docs.append(doc)
+            os.remove(doc_file_name)
+        return docs
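
For reference, a short usage sketch of the patched wrapper (the module path and the `load_max_docs` field are assumed to match the langchain layout at the time of this PR; the query is arbitrary):

```python
from langchain.utilities.arxiv import ArxivAPIWrapper

wrapper = ArxivAPIWrapper(load_max_docs=2)
docs = wrapper.load("heavy-ion collisions")
for doc in docs:
    print(doc.metadata["Title"])
# By the time load() returns, each downloaded PDF has already been removed,
# so no large files accumulate in the working directory.
```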
