mirror of https://github.com/hwchase17/langchain
cleanup: unify 3 different pdf loaders, rename PagedPDFSplitter (#1615)
`OnlinePDFLoader` and `PagedPDFSplitter` lived separately from the rest of the PDF loaders. Because they are all similar, I propose moving them all to `pdf.py` and to the same docs/examples page. Additionally, the name `PagedPDFSplitter` doesn't match the pattern the rest of the loaders follow, so I renamed it to `PyPDFLoader` and had it inherit from `BasePDFLoader` so it can now load from remote file sources.
pull/904/head
parent
562d9891ea
commit
b3234bf3b0
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,15 +0,0 @@
|
||||
"""Loader that loads online PDF files."""
|
||||
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
|
||||
|
||||
|
||||
class OnlinePDFLoader(BasePDFLoader):
    """Loader for online PDFs.

    Parsing is delegated to ``UnstructuredPDFLoader``; this class only
    forwards ``self.file_path`` (presumably set up by ``BasePDFLoader`` --
    confirm against that class) to it.
    """

    def load(self) -> List[Document]:
        """Load the PDF and return the resulting documents.

        Returns:
            Whatever ``UnstructuredPDFLoader.load`` produces for the path.
        """
        # UnstructuredPDFLoader is handed a plain string, so coerce the
        # stored path explicitly before delegating.
        path_as_text = str(self.file_path)
        delegate = UnstructuredPDFLoader(path_as_text)
        documents = delegate.load()
        return documents
|
@ -1,36 +0,0 @@
|
||||
"""Loads a PDF with pypdf and chunks at character level."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class PagedPDFSplitter(BaseLoader):
    """Load a PDF with pypdf, producing one document per page.

    Each returned document carries the source path and the zero-based page
    index in its metadata.
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the PDF file to read; it is opened lazily
                in :meth:`load`, not here.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported. The
                original ``ImportError`` is attached as ``__cause__``.
        """
        # Fail fast at construction time rather than on the first load().
        try:
            import pypdf  # noqa:F401
        except ImportError as exc:
            # Keep raising ValueError for backward compatibility, but chain
            # the underlying ImportError so the real cause stays visible.
            raise ValueError(
                "pypdf package not found, please install it with "
                "`pip install pypdf`"
            ) from exc
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Load given path as pages.

        Returns:
            One ``Document`` per PDF page, in page order. ``page_content``
            is the text extracted by pypdf and ``metadata`` holds
            ``{"source": <file path>, "page": <zero-based index>}``.
        """
        import pypdf

        # Text extraction must happen while the file is still open, so the
        # list is built inside the ``with`` block.
        with open(self._file_path, "rb") as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj)
            return [
                Document(
                    page_content=page.extract_text(),
                    metadata={"source": self._file_path, "page": i},
                )
                for i, page in enumerate(pdf_reader.pages)
            ]
|
Loading…
Reference in New Issue