cleanup: unify 3 different pdf loaders, rename PagedPDFSplitter (#1615)

`OnlinePDFLoader` and `PagedPDFSplitter` lived separate from the rest of
the pdf loaders.

Because they're all similar, I propose moving them all to `pdf.py` and the
same docs/examples page.

Additionally, `PagedPDFSplitter` naming doesn't match the pattern the
rest of the loaders follow, so I renamed to `PyPDFLoader` and had it
inherit from `BasePDFLoader` so it can now load from remote file
sources.
tool-patch
Tim Asp 1 year ago committed by GitHub
parent 562d9891ea
commit b3234bf3b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -55,8 +55,6 @@ There are a lot of different document loaders that LangChain supports. Below are
`Airbyte Json <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
`CoNLL-U <./examples/CoNLL-U.html>`_: A walkthrough of how to load data from a ConLL-U file.
`iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com

@ -24,11 +24,11 @@ from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain.document_loaders.notebook import NotebookLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.online_pdf import OnlinePDFLoader
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
from langchain.document_loaders.pdf import (
OnlinePDFLoader,
PDFMinerLoader,
PyMuPDFLoader,
PyPDFLoader,
UnstructuredPDFLoader,
)
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
@ -52,6 +52,9 @@ from langchain.document_loaders.youtube import (
YoutubeLoader,
)
"""Legacy: only for backwards compat. use PyPDFLoader instead"""
# Backwards-compatibility alias: the legacy name is bound to the very same
# class object as PyPDFLoader, so existing imports of PagedPDFSplitter work.
PagedPDFSplitter = PyPDFLoader
__all__ = [
"UnstructuredFileLoader",
"UnstructuredFileIOLoader",
@ -85,6 +88,7 @@ __all__ = [
"IFixitLoader",
"GutenbergLoader",
"PagedPDFSplitter",
"PyPDFLoader",
"EverNoteLoader",
"AirbyteJSONLoader",
"OnlinePDFLoader",

@ -1,15 +0,0 @@
"""Loader that loads online PDF files."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
class OnlinePDFLoader(BasePDFLoader):
    """Load online PDFs by delegating to ``UnstructuredPDFLoader``."""

    def load(self) -> List[Document]:
        """Return the documents parsed from ``self.file_path``.

        The path is converted with ``str`` and handed to a freshly created
        ``UnstructuredPDFLoader``; that loader's ``load`` result is returned
        unchanged.
        """
        return UnstructuredPDFLoader(str(self.file_path)).load()

@ -1,36 +0,0 @@
"""Loads a PDF with pypdf and chunks at character level."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class PagedPDFSplitter(BaseLoader):
    """Load a PDF with pypdf, producing one document per page.

    Each document's metadata records the source path and the zero-based
    page index.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path`` after checking that pypdf is importable.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ValueError(
                "pypdf package not found, please install it with "
                "`pip install pypdf`"
            )
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Read the PDF and return its pages as documents, in page order."""
        import pypdf

        pages: List[Document] = []
        # Text is extracted while the file is still open, as pypdf reads
        # from the underlying stream.
        with open(self._file_path, "rb") as stream:
            reader = pypdf.PdfReader(stream)
            for page_number, page in enumerate(reader.pages):
                text = page.extract_text()
                page_metadata = {"source": self._file_path, "page": page_number}
                pages.append(Document(page_content=text, metadata=page_metadata))
        return pages

@ -65,6 +65,46 @@ class BasePDFLoader(BaseLoader, ABC):
return bool(parsed.netloc) and bool(parsed.scheme)
class OnlinePDFLoader(BasePDFLoader):
    """Load online PDFs by delegating to ``UnstructuredPDFLoader``."""

    def load(self) -> List[Document]:
        """Return the documents parsed from ``self.file_path``.

        The path is converted with ``str`` and handed to a freshly created
        ``UnstructuredPDFLoader``; that loader's ``load`` result is returned
        unchanged.
        """
        return UnstructuredPDFLoader(str(self.file_path)).load()
class PyPDFLoader(BasePDFLoader):
    """Load a PDF with pypdf, producing one document per page.

    Each document's metadata records the source path and the zero-based
    page index.
    """

    def __init__(self, file_path: str):
        """Check that pypdf is importable, then defer to the base initializer.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ValueError(
                "pypdf package not found, please install it with "
                "`pip install pypdf`"
            )
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Read the PDF and return its pages as documents, in page order."""
        import pypdf

        pages: List[Document] = []
        # Text is extracted while the file is still open, as pypdf reads
        # from the underlying stream.
        with open(self.file_path, "rb") as stream:
            reader = pypdf.PdfReader(stream)
            for page_number, page in enumerate(reader.pages):
                text = page.extract_text()
                page_metadata = {"source": self.file_path, "page": page_number}
                pages.append(Document(page_content=text, metadata=page_metadata))
        return pages
class PDFMinerLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files."""

@ -1,7 +1,7 @@
"""Test splitting with page numbers included."""
import os
from langchain.document_loaders import PagedPDFSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
@ -9,7 +9,7 @@ from langchain.vectorstores import FAISS
def test_pdf_pagesplitter() -> None:
    """Check that loading the sample PDF yields page and source metadata."""
    script_dir = os.path.dirname(__file__)
    # The sample PDF is resolved relative to this test module's directory.
    # Only PyPDFLoader is constructed: the earlier PagedPDFSplitter
    # assignment was immediately overwritten and therefore dead.
    loader = PyPDFLoader(os.path.join(script_dir, "examples/hello.pdf"))
    docs = loader.load()
    assert "page" in docs[0].metadata
    assert "source" in docs[0].metadata

Loading…
Cancel
Save