diff --git a/docs/modules/document_loaders/examples/pdf.ipynb b/docs/modules/document_loaders/examples/pdf.ipynb index f4659df0..51e27581 100644 --- a/docs/modules/document_loaders/examples/pdf.ipynb +++ b/docs/modules/document_loaders/examples/pdf.ipynb @@ -167,10 +167,48 @@ "data = loader.load()" ] }, + { + "cell_type": "markdown", + "id": "21998d18", + "metadata": {}, + "source": [ + "## Using PDFMiner" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2f0cc9ff", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PDFMinerLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "42b531e8", + "metadata": {}, + "outputs": [], + "source": [ + "loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "010d5cdd", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "54fb6b62", + "id": "7301c473", "metadata": {}, "outputs": [], "source": [] diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index a8d67435..c9f25800 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -17,7 +17,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.online_pdf import OnlinePDFLoader from langchain.document_loaders.paged_pdf import PagedPDFSplitter -from langchain.document_loaders.pdf import UnstructuredPDFLoader +from langchain.document_loaders.pdf import PDFMinerLoader, UnstructuredPDFLoader from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader from langchain.document_loaders.readthedocs import ReadTheDocsLoader from langchain.document_loaders.roam import RoamLoader @@ -58,4 +58,5 @@ __all__ = [ "EveryNoteLoader", 
class PDFMinerLoader(BaseLoader):
    """Document loader that reads a PDF file through PDFMiner.

    The ``pdfminer`` import is attempted when the loader is constructed, so a
    missing dependency is reported immediately rather than on ``load()``.
    """

    def __init__(self, file_path: str):
        """Verify that pdfminer is importable and remember ``file_path``.

        Args:
            file_path: Location of the PDF; stored as given and later handed
                to ``pdfminer.high_level.extract_text``.

        Raises:
            ValueError: If ``pdfminer.high_level.extract_text`` cannot be
                imported.
        """
        try:
            # Imported only to probe availability; the name itself is unused.
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError:
            message = (
                "pdfminer package not found, please install it with "
                "`pip install pdfminer.six`"
            )
            raise ValueError(message)
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Extract the PDF's text and wrap it in a single ``Document``.

        Returns:
            A one-element list. The document's ``page_content`` is whatever
            ``extract_text`` returns for ``self.file_path``; its metadata is
            ``{"source": self.file_path}``.
        """
        # Deferred import: pdfminer is an optional dependency.
        from pdfminer.high_level import extract_text as pdf_to_text

        content = pdf_to_text(self.file_path)
        document = Document(
            page_content=content,
            metadata={"source": self.file_path},
        )
        return [document]