mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
pdfminer (#1003)
This commit is contained in:
parent
0b6aa6a024
commit
bbb06ca4cf
@ -167,10 +167,48 @@
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "21998d18",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using PDFMiner"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "2f0cc9ff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import PDFMinerLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "42b531e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "010d5cdd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "54fb6b62",
|
||||
"id": "7301c473",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
|
@ -17,7 +17,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||
from langchain.document_loaders.obsidian import ObsidianLoader
|
||||
from langchain.document_loaders.online_pdf import OnlinePDFLoader
|
||||
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
|
||||
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
||||
from langchain.document_loaders.pdf import PDFMinerLoader, UnstructuredPDFLoader
|
||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||
from langchain.document_loaders.roam import RoamLoader
|
||||
@ -58,4 +58,5 @@ __all__ = [
|
||||
"EveryNoteLoader",
|
||||
"AirbyteJSONLoader",
|
||||
"OnlinePDFLoader",
|
||||
"PDFMinerLoader",
|
||||
]
|
||||
|
@ -27,3 +27,26 @@ class UnstructuredPDFLoader(BaseLoader):
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
||||
|
||||
class PDFMinerLoader(BaseLoader):
|
||||
"""Loader that uses PDFMiner to load PDF files."""
|
||||
|
||||
def __init__(self, file_path: str):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
from pdfminer.high_level import extract_text # noqa:F401
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"pdfminer package not found, please install it with "
|
||||
"`pip install pdfminer.six`"
|
||||
)
|
||||
self.file_path = file_path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
from pdfminer.high_level import extract_text
|
||||
|
||||
text = extract_text(self.file_path)
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
Loading…
Reference in New Issue
Block a user