diff --git a/langchain/document_loaders/online_pdf.py b/langchain/document_loaders/online_pdf.py
index ad0b17f8..4bc03ef6 100644
--- a/langchain/document_loaders/online_pdf.py
+++ b/langchain/document_loaders/online_pdf.py
@@ -1,30 +1,15 @@
 """Loader that loads online PDF files."""
 
-import tempfile
-from pathlib import Path
 from typing import List
 
-import requests
-
 from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.pdf import UnstructuredPDFLoader
+from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
 
 
-class OnlinePDFLoader(BaseLoader):
+class OnlinePDFLoader(BasePDFLoader):
     """Loader that loads online PDFs."""
 
-    def __init__(self, web_path: str):
-        """Initialize with file path."""
-        self.web_path = web_path
-
     def load(self) -> List[Document]:
         """Load documents."""
-        r = requests.get(self.web_path)
-        with tempfile.TemporaryDirectory() as temp_dir:
-            file_path = Path(temp_dir) / "online_file.pdf"
-            file = open(file_path, "wb")
-            file.write(r.content)
-            file.close()
-            loader = UnstructuredPDFLoader(str(file_path))
-            return loader.load()
+        loader = UnstructuredPDFLoader(str(self.file_path))
+        return loader.load()
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index 915b9fc8..9314e358 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -1,5 +1,11 @@
 """Loader that loads PDF files."""
+import os
+import tempfile
+from abc import ABC
 from typing import Any, List, Optional
+from urllib.parse import urlparse
+
+import requests
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -15,7 +21,64 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
         return partition_pdf(filename=self.file_path)
 
 
-class PDFMinerLoader(BaseLoader):
+class BasePDFLoader(BaseLoader, ABC):
+    """Base loader class for PDF files.
+
+    Accepts either a local file path or a URL. A local file is used as-is; a URL
+    is downloaded to a temporary file that is used for loading and is closed
+    (and thereby deleted) when the loader object is finalized.
+    """
+
+    file_path: str
+    web_path: Optional[str] = None
+
+    def __init__(self, file_path: str):
+        """Initialize with a local file path or a URL.
+
+        Args:
+            file_path: Path to a local PDF file (``~`` is expanded) or the URL
+                of a PDF file to download.
+
+        Raises:
+            ValueError: If the download does not return status code 200, or if
+                ``file_path`` is neither an existing file nor a valid URL.
+        """
+        self.file_path = file_path
+        if "~" in self.file_path:
+            self.file_path = os.path.expanduser(self.file_path)
+
+        # If the file is a web path, download it to a temporary file, and use that
+        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
+            r = requests.get(self.file_path)
+
+            if r.status_code != 200:
+                raise ValueError(
+                    "Check the url of your file; returned status code %s"
+                    % r.status_code
+                )
+
+            self.web_path = self.file_path
+            self.temp_file = tempfile.NamedTemporaryFile()
+            self.temp_file.write(r.content)
+            # Flush so the parser, which reopens the file by name, sees all bytes
+            self.temp_file.flush()
+            self.file_path = self.temp_file.name
+        elif not os.path.isfile(self.file_path):
+            raise ValueError("File path %s is not a valid file or url" % self.file_path)
+
+    def __del__(self) -> None:
+        """Close the temporary download, if one was created."""
+        if hasattr(self, "temp_file"):
+            self.temp_file.close()
+
+    @staticmethod
+    def _is_valid_url(url: str) -> bool:
+        """Check that the url has both a scheme and a network location."""
+        parsed = urlparse(url)
+        return bool(parsed.netloc) and bool(parsed.scheme)
+
+
+class PDFMinerLoader(BasePDFLoader):
     """Loader that uses PDFMiner to load PDF files."""
 
     def __init__(self, file_path: str):
@@ -28,7 +91,7 @@ class PDFMinerLoader(BaseLoader):
                 "`pip install pdfminer.six`"
             )
 
-        self.file_path = file_path
+        super().__init__(file_path)
 
     def load(self) -> List[Document]:
         """Load file."""
@@ -39,7 +102,7 @@ class PDFMinerLoader(BaseLoader):
         return [Document(page_content=text, metadata=metadata)]
 
 
-class PyMuPDFLoader(BaseLoader):
+class PyMuPDFLoader(BasePDFLoader):
     """Loader that uses PyMuPDF to load PDF files."""
 
     def __init__(self, file_path: str):
@@ -52,22 +115,30 @@ class PyMuPDFLoader(BaseLoader):
                 "`pip install pymupdf`"
             )
 
-        self.file_path = file_path
+        super().__init__(file_path)
 
     def load(self, **kwargs: Optional[Any]) -> List[Document]:
         """Load file."""
         import fitz
 
         doc = fitz.open(self.file_path)  # open document
+        file_path = self.file_path if self.web_path is None else self.web_path
+
         return [
             Document(
                 page_content=page.get_text(**kwargs).encode("utf-8"),
-                metadata={
-                    "file_path": self.file_path,
-                    "page_number": page.number + 1,
-                    "total_pages": len(doc),
-                }
-                | doc.metadata,
+                metadata=dict(
+                    {
+                        "file_path": file_path,
+                        "page_number": page.number + 1,
+                        "total_pages": len(doc),
+                    },
+                    **{
+                        k: v
+                        for k, v in doc.metadata.items()
+                        if isinstance(v, (str, int))
+                    }
+                ),
             )
             for page in doc
         ]
diff --git a/tests/integration_tests/document_loaders/test_pdf.py b/tests/integration_tests/document_loaders/test_pdf.py
index 8e59f310..dc046b47 100644
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@@ -44,3 +44,12 @@ def test_pymupdf_loader() -> None:
 
     docs = loader.load()
     assert len(docs) == 16
+    assert loader.web_path is None
+
+    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
+    loader = PyMuPDFLoader(web_path)
+
+    docs = loader.load()
+    assert loader.web_path == web_path
+    assert loader.file_path != web_path
+    assert len(docs) == 1
diff --git a/tests/integration_tests/examples/layout-parser-paper.pdf b/tests/integration_tests/examples/layout-parser-paper.pdf
new file mode 100644
index 00000000..c4b6c2ef
Binary files /dev/null and b/tests/integration_tests/examples/layout-parser-paper.pdf differ