Harrison/remote paths pdf (#1544)

Co-authored-by: Tim Asp <707699+timothyasp@users.noreply.github.com>
1 year ago · 357d808484
parent cc423f40f1
commit 357d808484
4 changed files with 81 additions and 29 deletions
--- a/langchain/document_loaders/online_pdf.py
+++ b/langchain/document_loaders/online_pdf.py
@ -1,30 +1,15 @@
 """Loader that loads online PDF files."""

-import tempfile
-from pathlib import Path
 from typing import List

-import requests
-
 from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.pdf import UnstructuredPDFLoader
+from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader


-class OnlinePDFLoader(BaseLoader):
+class OnlinePDFLoader(BasePDFLoader):
    """Loader that loads online PDFs."""

-    def __init__(self, web_path: str):
-        """Initialize with file path."""
-        self.web_path = web_path
-
    def load(self) -> List[Document]:
        """Load documents."""
-        r = requests.get(self.web_path)
-        with tempfile.TemporaryDirectory() as temp_dir:
-            file_path = Path(temp_dir) / "online_file.pdf"
-            file = open(file_path, "wb")
-            file.write(r.content)
-            file.close()
-            loader = UnstructuredPDFLoader(str(file_path))
-            return loader.load()
+        loader = UnstructuredPDFLoader(str(self.file_path))
+        return loader.load()
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@ -1,5 +1,11 @@
 """Loader that loads PDF files."""
+import os
+import tempfile
+from abc import ABC
 from typing import Any, List, Optional
+from urllib.parse import urlparse
+
+import requests

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@ -15,7 +21,51 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
        return partition_pdf(filename=self.file_path)


-class PDFMinerLoader(BaseLoader):
+class BasePDFLoader(BaseLoader, ABC):
+    """Base loader class for PDF files.
+
+    Defaults to checking for a local file, but if the path is a URL, the file is
+    downloaded to a temporary file which is used instead; the temporary file is
+    closed (and thereby removed) when the loader is garbage collected.
+
+    After construction, ``file_path`` always refers to a local file and
+    ``web_path`` holds the original URL (``None`` for local files).
+    """
+
+    file_path: str
+    web_path: Optional[str] = None
+
+    def __init__(self, file_path: str):
+        """Initialize with a local file path or a URL.
+
+        Args:
+            file_path: Path to a local PDF (a leading ``~`` is expanded) or a
+                URL to download the PDF from.
+
+        Raises:
+            ValueError: If the URL does not answer with status code 200, or if
+                ``file_path`` is neither an existing file nor a valid URL.
+        """
+        # expanduser only rewrites a leading "~", so URLs that merely contain
+        # a "~" are left untouched.
+        self.file_path = os.path.expanduser(file_path)
+
+        # If the file is a web path, download it to a temporary file, and use that
+        if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
+            r = requests.get(self.file_path)
+
+            if r.status_code != 200:
+                raise ValueError(
+                    "Check the url of your file; returned status code %s"
+                    % r.status_code
+                )
+
+            self.web_path = self.file_path
+            self.temp_file = tempfile.NamedTemporaryFile()
+            self.temp_file.write(r.content)
+            # Push buffered bytes to disk: subclasses reopen the file by name,
+            # and without a flush they could see a truncated or empty PDF.
+            self.temp_file.flush()
+            self.file_path = self.temp_file.name
+        elif not os.path.isfile(self.file_path):
+            raise ValueError("File path %s is not a valid file or url" % self.file_path)
+
+    def __del__(self) -> None:
+        """Close the temporary download, if any, which also deletes it."""
+        # temp_file only exists when the loader was built from a URL.
+        if hasattr(self, "temp_file"):
+            self.temp_file.close()
+
+    @staticmethod
+    def _is_valid_url(url: str) -> bool:
+        """Return True if ``url`` has both a scheme and a network location."""
+        parsed = urlparse(url)
+        return bool(parsed.netloc) and bool(parsed.scheme)
+
+
+class PDFMinerLoader(BasePDFLoader):
    """Loader that uses PDFMiner to load PDF files."""

    def __init__(self, file_path: str):
@ -28,7 +78,7 @@ class PDFMinerLoader(BaseLoader):
                "`pip install pdfminer.six`"
            )

-        self.file_path = file_path
+        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Load file."""
@ -39,7 +89,7 @@ class PDFMinerLoader(BaseLoader):
        return [Document(page_content=text, metadata=metadata)]


-class PyMuPDFLoader(BaseLoader):
+class PyMuPDFLoader(BasePDFLoader):
    """Loader that uses PyMuPDF to load PDF files."""

    def __init__(self, file_path: str):
@ -52,22 +102,30 @@ class PyMuPDFLoader(BaseLoader):
                "`pip install pymupdf`"
            )

-        self.file_path = file_path
+        super().__init__(file_path)

    def load(self, **kwargs: Optional[Any]) -> List[Document]:
        """Load file."""
        import fitz

        doc = fitz.open(self.file_path)  # open document
+        file_path = self.file_path if self.web_path is None else self.web_path
+
        return [
            Document(
                page_content=page.get_text(**kwargs).encode("utf-8"),
-                metadata={
-                    "file_path": self.file_path,
-                    "page_number": page.number + 1,
-                    "total_pages": len(doc),
-                }
-                | doc.metadata,
+                metadata=dict(
+                    {
+                        "file_path": file_path,
+                        "page_number": page.number + 1,
+                        "total_pages": len(doc),
+                    },
+                    **{
+                        k: doc.metadata[k]
+                        for k in doc.metadata
+                        if type(doc.metadata[k]) in [str, int]
+                    }
+                ),
            )
            for page in doc
        ]
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@ -44,3 +44,12 @@ def test_pymupdf_loader() -> None:

    docs = loader.load()
    assert len(docs) == 16
+    assert loader.web_path is None
+
+    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
+    loader = PyMuPDFLoader(web_path)
+
+    docs = loader.load()
+    assert loader.web_path == web_path
+    assert loader.file_path != web_path
+    assert len(docs) == 1
--- a/tests/integration_tests/examples/layout-parser-paper.pdf
+++ b/tests/integration_tests/examples/layout-parser-paper.pdf