cleanup: unify 3 different pdf loaders, rename PagedPDFSplitter (#1615)

`OnlinePDFLoader` and `PagedPDFSplitter` lived separately from the rest of
the pdf loaders.

Because they're all similar, I propose moving them all to `pdf.py` and onto
the same docs/examples page.

Additionally, `PagedPDFSplitter` naming doesn't match the pattern the
rest of the loaders follow, so I renamed to `PyPDFLoader` and had it
inherit from `BasePDFLoader` so it can now load from remote file
sources.
This commit is contained in:
Tim Asp 2023-03-13 23:06:50 -07:00 committed by GitHub
parent 562d9891ea
commit b3234bf3b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 134 additions and 153 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -55,8 +55,6 @@ There are a lot of different document loaders that LangChain supports. Below are
`Airbyte Json <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
`CoNLL-U <./examples/CoNLL-U.html>`_: A walkthrough of how to load data from a CoNLL-U file.
`iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com

View File

@ -24,11 +24,11 @@ from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain.document_loaders.notebook import NotebookLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.online_pdf import OnlinePDFLoader
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
from langchain.document_loaders.pdf import (
OnlinePDFLoader,
PDFMinerLoader,
PyMuPDFLoader,
PyPDFLoader,
UnstructuredPDFLoader,
)
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
@ -52,6 +52,9 @@ from langchain.document_loaders.youtube import (
YoutubeLoader,
)
# Legacy alias: kept only for backwards compatibility. Use PyPDFLoader instead.
PagedPDFSplitter = PyPDFLoader
__all__ = [
"UnstructuredFileLoader",
"UnstructuredFileIOLoader",
@ -85,6 +88,7 @@ __all__ = [
"IFixitLoader",
"GutenbergLoader",
"PagedPDFSplitter",
"PyPDFLoader",
"EverNoteLoader",
"AirbyteJSONLoader",
"OnlinePDFLoader",

View File

@ -1,15 +0,0 @@
"""Loader that loads online PDF files."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
class OnlinePDFLoader(BasePDFLoader):
    """Loader for online PDFs that delegates parsing to UnstructuredPDFLoader."""

    def load(self) -> List[Document]:
        """Return the documents produced by ``UnstructuredPDFLoader``.

        The stored ``file_path`` is converted to ``str`` and handed to a
        newly created ``UnstructuredPDFLoader``, whose ``load()`` result is
        returned unchanged.
        """
        pdf_path = str(self.file_path)
        return UnstructuredPDFLoader(pdf_path).load()

View File

@ -1,36 +0,0 @@
"""Loads a PDF with pypdf and chunks at character level."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class PagedPDFSplitter(BaseLoader):
    """Load a PDF with pypdf, producing one document per page.

    Every returned document carries the source path and its page index in
    its metadata.
    """

    def __init__(self, file_path: str):
        """Remember the PDF path after checking that pypdf is importable.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            install_hint = (
                "pypdf package not found, please install it with "
                "`pip install pypdf`"
            )
            raise ValueError(install_hint)
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Read the PDF and return one ``Document`` per page, in page order.

        The ``page`` metadata value is the zero-based position of the page
        in the reader's page list; ``source`` is the path given at
        construction time.
        """
        import pypdf

        documents: List[Document] = []
        # Text is extracted while the file is still open; the list is fully
        # built before the handle is closed.
        with open(self._file_path, "rb") as pdf_stream:
            reader = pypdf.PdfReader(pdf_stream)
            for page_index, pdf_page in enumerate(reader.pages):
                documents.append(
                    Document(
                        page_content=pdf_page.extract_text(),
                        metadata={"source": self._file_path, "page": page_index},
                    )
                )
        return documents

View File

@ -65,6 +65,46 @@ class BasePDFLoader(BaseLoader, ABC):
return bool(parsed.netloc) and bool(parsed.scheme)
class OnlinePDFLoader(BasePDFLoader):
    """Loader for online PDFs that delegates parsing to UnstructuredPDFLoader."""

    def load(self) -> List[Document]:
        """Return the documents produced by ``UnstructuredPDFLoader``.

        The stored ``file_path`` is converted to ``str`` and handed to a
        newly created ``UnstructuredPDFLoader``, whose ``load()`` result is
        returned unchanged.
        """
        pdf_path = str(self.file_path)
        return UnstructuredPDFLoader(pdf_path).load()
class PyPDFLoader(BasePDFLoader):
    """Load a PDF with pypdf, producing one document per page.

    Every returned document carries the loader's ``file_path`` and its page
    index in its metadata.
    """

    def __init__(self, file_path: str):
        """Check that pypdf is importable, then defer to the base initializer.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            install_hint = (
                "pypdf package not found, please install it with "
                "`pip install pypdf`"
            )
            raise ValueError(install_hint)
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Read the PDF and return one ``Document`` per page, in page order.

        The ``page`` metadata value is the zero-based position of the page
        in the reader's page list; ``source`` is the loader's ``file_path``.
        """
        import pypdf

        documents: List[Document] = []
        # Text is extracted while the file is still open; the list is fully
        # built before the handle is closed.
        with open(self.file_path, "rb") as pdf_stream:
            reader = pypdf.PdfReader(pdf_stream)
            for page_index, pdf_page in enumerate(reader.pages):
                documents.append(
                    Document(
                        page_content=pdf_page.extract_text(),
                        metadata={"source": self.file_path, "page": page_index},
                    )
                )
        return documents
class PDFMinerLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files."""

View File

@ -1,7 +1,7 @@
"""Test splitting with page numbers included."""
import os
from langchain.document_loaders import PagedPDFSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
@ -9,7 +9,7 @@ from langchain.vectorstores import FAISS
def test_pdf_pagesplitter() -> None:
"""Test splitting with page numbers included."""
script_dir = os.path.dirname(__file__)
loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
loader = PyPDFLoader(os.path.join(script_dir, "examples/hello.pdf"))
docs = loader.load()
assert "page" in docs[0].metadata
assert "source" in docs[0].metadata