Harrison/remote paths pdf (#1544)

Co-authored-by: Tim Asp <707699+timothyasp@users.noreply.github.com>
fix-searx
Harrison Chase 1 year ago committed by GitHub
parent cc423f40f1
commit 357d808484
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,30 +1,15 @@
"""Loader that loads online PDF files."""
import tempfile
from pathlib import Path
from typing import List
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
class OnlinePDFLoader(BaseLoader):
class OnlinePDFLoader(BasePDFLoader):
"""Loader that loads online PDFs."""
def __init__(self, web_path: str):
"""Initialize with file path."""
self.web_path = web_path
def load(self) -> List[Document]:
"""Load documents."""
r = requests.get(self.web_path)
with tempfile.TemporaryDirectory() as temp_dir:
file_path = Path(temp_dir) / "online_file.pdf"
file = open(file_path, "wb")
file.write(r.content)
file.close()
loader = UnstructuredPDFLoader(str(file_path))
return loader.load()
loader = UnstructuredPDFLoader(str(self.file_path))
return loader.load()

@ -1,5 +1,11 @@
"""Loader that loads PDF files."""
import os
import tempfile
from abc import ABC
from typing import Any, List, Optional
from urllib.parse import urlparse
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
@ -15,7 +21,51 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
return partition_pdf(filename=self.file_path)
class PDFMinerLoader(BaseLoader):
class BasePDFLoader(BaseLoader, ABC):
    """Base loader class for PDF files.

    Defaults to checking for a local file. If the path is not an existing
    local file but parses as a URL, the content is downloaded into a
    temporary file and that temporary file is used instead. The temporary
    file is closed (and thereby removed) when the loader is garbage
    collected.

    Attributes set by ``__init__``:
        file_path: local path to read the PDF from (the temporary file's
            name when the input was a URL).
        web_path: the original URL when the input was a URL, else ``None``.
        temp_file: only present when a download happened; the open
            ``NamedTemporaryFile`` backing ``file_path``.
    """

    file_path: str
    web_path: Optional[str] = None

    def __init__(self, file_path: str):
        """Initialize with a local file path or a URL.

        Args:
            file_path: path to a local PDF (``~`` is expanded) or a URL.

        Raises:
            ValueError: if the URL does not return status code 200, or if
                ``file_path`` is neither an existing file nor a valid URL.
        """
        self.file_path = file_path
        if "~" in self.file_path:
            # expanduser only rewrites a leading "~"; a "~" elsewhere
            # (e.g. inside a URL) is left untouched.
            self.file_path = os.path.expanduser(self.file_path)

        is_local_file = os.path.isfile(self.file_path)

        # If the file is a web path, download it to a temporary file, and use that
        if not is_local_file and self._is_valid_url(self.file_path):
            r = requests.get(self.file_path)

            if r.status_code != 200:
                raise ValueError(
                    "Check the url of your file; returned status code %s"
                    % r.status_code
                )

            self.web_path = self.file_path
            self.temp_file = tempfile.NamedTemporaryFile()
            self.temp_file.write(r.content)
            # The PDF parsers reopen the file by name, so the buffered bytes
            # must reach the disk before file_path is handed out; without the
            # flush a reader can see an empty or truncated file.
            self.temp_file.flush()
            self.file_path = self.temp_file.name
        elif not is_local_file:
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

    def __del__(self) -> None:
        # temp_file only exists when the input was a URL; closing a
        # NamedTemporaryFile also deletes it.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True when ``url`` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
class PDFMinerLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files."""
def __init__(self, file_path: str):
@ -28,7 +78,7 @@ class PDFMinerLoader(BaseLoader):
"`pip install pdfminer.six`"
)
self.file_path = file_path
super().__init__(file_path)
def load(self) -> List[Document]:
"""Load file."""
@ -39,7 +89,7 @@ class PDFMinerLoader(BaseLoader):
return [Document(page_content=text, metadata=metadata)]
class PyMuPDFLoader(BaseLoader):
class PyMuPDFLoader(BasePDFLoader):
"""Loader that uses PyMuPDF to load PDF files."""
def __init__(self, file_path: str):
@ -52,22 +102,30 @@ class PyMuPDFLoader(BaseLoader):
"`pip install pymupdf`"
)
self.file_path = file_path
super().__init__(file_path)
def load(self, **kwargs: Optional[Any]) -> List[Document]:
"""Load file."""
import fitz
doc = fitz.open(self.file_path) # open document
file_path = self.file_path if self.web_path is None else self.web_path
return [
Document(
page_content=page.get_text(**kwargs).encode("utf-8"),
metadata={
"file_path": self.file_path,
"page_number": page.number + 1,
"total_pages": len(doc),
}
| doc.metadata,
metadata=dict(
{
"file_path": file_path,
"page_number": page.number + 1,
"total_pages": len(doc),
},
**{
k: doc.metadata[k]
for k in doc.metadata
if type(doc.metadata[k]) in [str, int]
}
),
)
for page in doc
]

@ -44,3 +44,12 @@ def test_pymupdf_loader() -> None:
docs = loader.load()
assert len(docs) == 16
assert loader.web_path is None
web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
loader = PyMuPDFLoader(web_path)
docs = loader.load()
assert loader.web_path == web_path
assert loader.file_path != web_path
assert len(docs) == 1

Loading…
Cancel
Save