Harrison/remote paths pdf (#1544)

Co-authored-by: Tim Asp <707699+timothyasp@users.noreply.github.com>
This commit is contained in:
Harrison Chase 2023-03-08 20:53:37 -08:00 committed by GitHub
parent cc423f40f1
commit 357d808484
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 81 additions and 29 deletions

View File

@ -1,30 +1,15 @@
"""Loader that loads online PDF files.""" """Loader that loads online PDF files."""
import tempfile
from pathlib import Path
from typing import List from typing import List
import requests
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader
class OnlinePDFLoader(BaseLoader): class OnlinePDFLoader(BasePDFLoader):
"""Loader that loads online PDFs.""" """Loader that loads online PDFs."""
def __init__(self, web_path: str):
"""Initialize with file path."""
self.web_path = web_path
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
r = requests.get(self.web_path) loader = UnstructuredPDFLoader(str(self.file_path))
with tempfile.TemporaryDirectory() as temp_dir: return loader.load()
file_path = Path(temp_dir) / "online_file.pdf"
file = open(file_path, "wb")
file.write(r.content)
file.close()
loader = UnstructuredPDFLoader(str(file_path))
return loader.load()

View File

@ -1,5 +1,11 @@
"""Loader that loads PDF files.""" """Loader that loads PDF files."""
import os
import tempfile
from abc import ABC
from typing import Any, List, Optional from typing import Any, List, Optional
from urllib.parse import urlparse
import requests
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@ -15,7 +21,51 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
return partition_pdf(filename=self.file_path) return partition_pdf(filename=self.file_path)
class PDFMinerLoader(BaseLoader): class BasePDFLoader(BaseLoader, ABC):
"""Base loader class for PDF files.
Checks for a local file first. If the path is not a local file but is a valid URL,
the file is downloaded to a temporary file, which is used for loading and is
cleaned up when the loader object is destroyed.
"""
file_path: str
web_path: Optional[str] = None
def __init__(self, file_path: str):
    """Initialize with a local file path or a URL pointing at a PDF.

    A leading ``~`` is expanded to the user's home directory. If the result
    is not an existing local file but parses as a URL, the document is
    downloaded into a named temporary file; ``self.file_path`` is then
    pointed at that temporary file and the original URL is kept in
    ``self.web_path``.

    Args:
        file_path: Path of a local PDF file, or a URL to download it from.

    Raises:
        ValueError: If the URL does not answer with status code 200, or if
            ``file_path`` is neither an existing file nor a valid URL.
    """
    # expanduser() leaves paths without a leading "~" untouched, so it is
    # safe to call unconditionally (URLs containing "~" are not altered).
    self.file_path = os.path.expanduser(file_path)

    # An existing local file needs no further preparation.
    if os.path.isfile(self.file_path):
        return

    if not self._is_valid_url(self.file_path):
        raise ValueError("File path %s is not a valid file or url" % self.file_path)

    # The path is a URL: download it to a temporary file and use that.
    r = requests.get(self.file_path)
    if r.status_code != 200:
        raise ValueError(
            "Check the url of your file; returned status code %s" % r.status_code
        )

    self.web_path = self.file_path
    self.temp_file = tempfile.NamedTemporaryFile()
    self.temp_file.write(r.content)
    # The PDF backends reopen the file by name, so the buffered bytes must
    # reach the disk before the path is handed out; without this flush a
    # reader may see an empty or truncated file.
    self.temp_file.flush()
    self.file_path = self.temp_file.name
def __del__(self) -> None:
    """Close the temporary download file, if this loader created one."""
    # ``temp_file`` is only set when the source was downloaded from a URL.
    if not hasattr(self, "temp_file"):
        return
    self.temp_file.close()
@staticmethod
def _is_valid_url(url: str) -> bool:
"""Check if the url is valid."""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
class PDFMinerLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files.""" """Loader that uses PDFMiner to load PDF files."""
def __init__(self, file_path: str): def __init__(self, file_path: str):
@ -28,7 +78,7 @@ class PDFMinerLoader(BaseLoader):
"`pip install pdfminer.six`" "`pip install pdfminer.six`"
) )
self.file_path = file_path super().__init__(file_path)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load file.""" """Load file."""
@ -39,7 +89,7 @@ class PDFMinerLoader(BaseLoader):
return [Document(page_content=text, metadata=metadata)] return [Document(page_content=text, metadata=metadata)]
class PyMuPDFLoader(BaseLoader): class PyMuPDFLoader(BasePDFLoader):
"""Loader that uses PyMuPDF to load PDF files.""" """Loader that uses PyMuPDF to load PDF files."""
def __init__(self, file_path: str): def __init__(self, file_path: str):
@ -52,22 +102,30 @@ class PyMuPDFLoader(BaseLoader):
"`pip install pymupdf`" "`pip install pymupdf`"
) )
self.file_path = file_path super().__init__(file_path)
def load(self, **kwargs: Optional[Any]) -> List[Document]: def load(self, **kwargs: Optional[Any]) -> List[Document]:
"""Load file.""" """Load file."""
import fitz import fitz
doc = fitz.open(self.file_path) # open document doc = fitz.open(self.file_path) # open document
file_path = self.file_path if self.web_path is None else self.web_path
return [ return [
Document( Document(
page_content=page.get_text(**kwargs).encode("utf-8"), page_content=page.get_text(**kwargs).encode("utf-8"),
metadata={ metadata=dict(
"file_path": self.file_path, {
"page_number": page.number + 1, "file_path": file_path,
"total_pages": len(doc), "page_number": page.number + 1,
} "total_pages": len(doc),
| doc.metadata, },
**{
k: doc.metadata[k]
for k in doc.metadata
if type(doc.metadata[k]) in [str, int]
}
),
) )
for page in doc for page in doc
] ]

View File

@ -44,3 +44,12 @@ def test_pymupdf_loader() -> None:
docs = loader.load() docs = loader.load()
assert len(docs) == 16 assert len(docs) == 16
assert loader.web_path is None
web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
loader = PyMuPDFLoader(web_path)
docs = loader.load()
assert loader.web_path == web_path
assert loader.file_path != web_path
assert len(docs) == 1