mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
cleanup: unify 3 different pdf loaders, rename PagedPDFSplitter (#1615)
`OnlinePDFLoader` and `PagedPDFSplitter` lived separately from the rest of the PDF loaders. Because they're all similar, I propose moving them all to `pdf.py` and to the same docs/examples page. Additionally, the `PagedPDFSplitter` name doesn't match the pattern the rest of the loaders follow, so I renamed it to `PyPDFLoader` and had it inherit from `BasePDFLoader` so it can now load from remote file sources.
This commit is contained in:
parent
562d9891ea
commit
b3234bf3b0
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -55,8 +55,6 @@ There are a lot of different document loaders that LangChain supports. Below are
|
||||
|
||||
`Airbyte Json <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file.
|
||||
|
||||
`Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF.
|
||||
|
||||
`CoNLL-U <./examples/CoNLL-U.html>`_: A walkthrough of how to load data from a CoNLL-U file.
|
||||
|
||||
`iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com
|
||||
|
@ -24,11 +24,11 @@ from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
|
||||
from langchain.document_loaders.notebook import NotebookLoader
|
||||
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||
from langchain.document_loaders.obsidian import ObsidianLoader
|
||||
from langchain.document_loaders.online_pdf import OnlinePDFLoader
|
||||
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
|
||||
from langchain.document_loaders.pdf import (
|
||||
OnlinePDFLoader,
|
||||
PDFMinerLoader,
|
||||
PyMuPDFLoader,
|
||||
PyPDFLoader,
|
||||
UnstructuredPDFLoader,
|
||||
)
|
||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||
@ -52,6 +52,9 @@ from langchain.document_loaders.youtube import (
|
||||
YoutubeLoader,
|
||||
)
|
||||
|
||||
"""Legacy: only for backwards compat. use PyPDFLoader instead"""
|
||||
PagedPDFSplitter = PyPDFLoader
|
||||
|
||||
__all__ = [
|
||||
"UnstructuredFileLoader",
|
||||
"UnstructuredFileIOLoader",
|
||||
@ -85,6 +88,7 @@ __all__ = [
|
||||
"IFixitLoader",
|
||||
"GutenbergLoader",
|
||||
"PagedPDFSplitter",
|
||||
"PyPDFLoader",
|
||||
"EverNoteLoader",
|
||||
"AirbyteJSONLoader",
|
||||
"OnlinePDFLoader",
|
||||
|
@ -1,15 +0,0 @@
|
||||
"""Loader that loads online PDF files."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
|
||||
|
||||
|
||||
class OnlinePDFLoader(BasePDFLoader):
    """Loader that loads online PDFs.

    Parsing is delegated to ``UnstructuredPDFLoader``.
    """

    def load(self) -> List[Document]:
        """Load documents from ``self.file_path``."""
        # The path is coerced to ``str`` before it is handed to the delegate.
        return UnstructuredPDFLoader(str(self.file_path)).load()
|
@ -1,36 +0,0 @@
|
||||
"""Loads a PDF with pypdf and chunks at character level."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class PagedPDFSplitter(BaseLoader):
    """Loads a PDF with pypdf, producing one document per page.

    Each returned document holds the text extracted from a single page; its
    metadata stores the source path and the zero-based page number.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError as err:
            # ValueError is kept so existing callers that catch it keep
            # working; chaining records the import failure as the cause.
            raise ValueError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from err
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Load given path as pages.

        Returns:
            One ``Document`` per page, in page order.
        """
        import pypdf

        with open(self._file_path, "rb") as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj)
            # Text is extracted while the file handle is still open.
            return [
                Document(
                    page_content=page.extract_text(),
                    metadata={"source": self._file_path, "page": i},
                )
                for i, page in enumerate(pdf_reader.pages)
            ]
|
@ -65,6 +65,46 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
return bool(parsed.netloc) and bool(parsed.scheme)
|
||||
|
||||
|
||||
class OnlinePDFLoader(BasePDFLoader):
    """Loader that loads online PDFs.

    Parsing is delegated to ``UnstructuredPDFLoader``.
    """

    def load(self) -> List[Document]:
        """Load documents from ``self.file_path``."""
        # The path is coerced to ``str`` before it is handed to the delegate.
        return UnstructuredPDFLoader(str(self.file_path)).load()
|
||||
|
||||
|
||||
class PyPDFLoader(BasePDFLoader):
    """Loads a PDF with pypdf, producing one document per page.

    Each returned document holds the text extracted from a single page; its
    metadata stores the source path and the zero-based page number.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError as err:
            # ValueError is kept so existing callers that catch it keep
            # working; chaining records the import failure as the cause.
            raise ValueError(
                "pypdf package not found, please install it with `pip install pypdf`"
            ) from err
        # NOTE(review): the base initializer presumably resolves remote
        # sources to a local path -- confirm against BasePDFLoader.
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Load given path as pages.

        Returns:
            One ``Document`` per page, in page order.
        """
        import pypdf

        with open(self.file_path, "rb") as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj)
            # Text is extracted while the file handle is still open.
            return [
                Document(
                    page_content=page.extract_text(),
                    metadata={"source": self.file_path, "page": i},
                )
                for i, page in enumerate(pdf_reader.pages)
            ]
|
||||
|
||||
|
||||
class PDFMinerLoader(BasePDFLoader):
|
||||
"""Loader that uses PDFMiner to load PDF files."""
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""Test splitting with page numbers included."""
|
||||
import os
|
||||
|
||||
from langchain.document_loaders import PagedPDFSplitter
|
||||
from langchain.document_loaders import PyPDFLoader
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain.vectorstores import FAISS
|
||||
|
||||
@ -9,7 +9,7 @@ from langchain.vectorstores import FAISS
|
||||
def test_pdf_pagesplitter() -> None:
|
||||
"""Test splitting with page numbers included."""
|
||||
script_dir = os.path.dirname(__file__)
|
||||
loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
|
||||
loader = PyPDFLoader(os.path.join(script_dir, "examples/hello.pdf"))
|
||||
docs = loader.load()
|
||||
assert "page" in docs[0].metadata
|
||||
assert "source" in docs[0].metadata
|
||||
|
Loading…
Reference in New Issue
Block a user