This commit is contained in:
Harrison Chase 2023-02-12 07:29:26 -08:00 committed by GitHub
parent 0b6aa6a024
commit bbb06ca4cf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 64 additions and 2 deletions

View File

@ -167,10 +167,48 @@
"data = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "21998d18",
"metadata": {},
"source": [
"## Using PDFMiner"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2f0cc9ff",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PDFMinerLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "42b531e8",
"metadata": {},
"outputs": [],
"source": [
"loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "010d5cdd",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54fb6b62",
"id": "7301c473",
"metadata": {},
"outputs": [],
"source": []

View File

@ -17,7 +17,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.online_pdf import OnlinePDFLoader
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from langchain.document_loaders.pdf import PDFMinerLoader, UnstructuredPDFLoader
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.roam import RoamLoader
@ -58,4 +58,5 @@ __all__ = [
"EveryNoteLoader",
"AirbyteJSONLoader",
"OnlinePDFLoader",
"PDFMinerLoader",
]

View File

@ -27,3 +27,26 @@ class UnstructuredPDFLoader(BaseLoader):
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]
class PDFMinerLoader(BaseLoader):
"""Loader that uses PDFMiner to load PDF files."""
def __init__(self, file_path: str):
"""Initialize with file path."""
try:
from pdfminer.high_level import extract_text # noqa:F401
except ImportError:
raise ValueError(
"pdfminer package not found, please install it with "
"`pip install pdfminer.six`"
)
self.file_path = file_path
def load(self) -> List[Document]:
"""Load file."""
from pdfminer.high_level import extract_text
text = extract_text(self.file_path)
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]