forked from Archives/langchain
cleanup: unify 3 different pdf loaders, rename PagedPDFSplitter (#1615)
`OnlinePDFLoader` and `PagedPDFSplitter` lived separately from the rest of the PDF loaders. Because they are all similar, I propose moving them all to `pdf.py` and to the same docs/examples page. Additionally, the name `PagedPDFSplitter` doesn't match the pattern the rest of the loaders follow, so I renamed it to `PyPDFLoader` and had it inherit from `BasePDFLoader` so that it can now load from remote file sources.
tool-patch
parent
562d9891ea
commit
b3234bf3b0
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,15 +0,0 @@
|
|||||||
"""Loader that loads online PDF files."""
|
|
||||||
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
|
||||||
from langchain.document_loaders.pdf import BasePDFLoader, UnstructuredPDFLoader
|
|
||||||
|
|
||||||
|
|
||||||
class OnlinePDFLoader(BasePDFLoader):
    """Loader for online PDFs that delegates parsing to UnstructuredPDFLoader."""

    def load(self) -> List[Document]:
        """Return the documents UnstructuredPDFLoader extracts from the file.

        ``self.file_path`` is converted to ``str`` before being handed to
        the delegate loader.
        """
        path = str(self.file_path)
        delegate = UnstructuredPDFLoader(path)
        return delegate.load()
|
|
@ -1,36 +0,0 @@
|
|||||||
"""Loads a PDF with pypdf and chunks at character level."""
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
|
||||||
from langchain.document_loaders.base import BaseLoader
|
|
||||||
|
|
||||||
|
|
||||||
class PagedPDFSplitter(BaseLoader):
    """Load a PDF with pypdf, producing one Document per page.

    Every Document carries the source path and the zero-based page index
    in its metadata.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path`` after checking that pypdf is importable.

        Raises:
            ValueError: if the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            raise ValueError(
                "pypdf package not found, please install it with "
                "`pip install pypdf`"
            )
        self._file_path = file_path

    def load(self) -> List[Document]:
        """Read the PDF at the stored path and return its pages in order."""
        import pypdf

        documents: List[Document] = []
        # Text is extracted while the file is still open, because the reader
        # pulls page data from the underlying stream.
        with open(self._file_path, "rb") as stream:
            reader = pypdf.PdfReader(stream)
            for page_number, page in enumerate(reader.pages):
                page_metadata = {"source": self._file_path, "page": page_number}
                documents.append(
                    Document(
                        page_content=page.extract_text(),
                        metadata=page_metadata,
                    )
                )
        return documents
|
|
Loading…
Reference in New Issue