mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
pdfminer (#1003)
This commit is contained in:
parent
0b6aa6a024
commit
bbb06ca4cf
@ -167,10 +167,48 @@
|
|||||||
"data = loader.load()"
|
"data = loader.load()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "21998d18",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Using PDFMiner"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "2f0cc9ff",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import PDFMinerLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "42b531e8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "010d5cdd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "54fb6b62",
|
"id": "7301c473",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
|
@ -17,7 +17,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader
|
|||||||
from langchain.document_loaders.obsidian import ObsidianLoader
|
from langchain.document_loaders.obsidian import ObsidianLoader
|
||||||
from langchain.document_loaders.online_pdf import OnlinePDFLoader
|
from langchain.document_loaders.online_pdf import OnlinePDFLoader
|
||||||
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
|
from langchain.document_loaders.paged_pdf import PagedPDFSplitter
|
||||||
from langchain.document_loaders.pdf import UnstructuredPDFLoader
|
from langchain.document_loaders.pdf import PDFMinerLoader, UnstructuredPDFLoader
|
||||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||||
from langchain.document_loaders.roam import RoamLoader
|
from langchain.document_loaders.roam import RoamLoader
|
||||||
@ -58,4 +58,5 @@ __all__ = [
|
|||||||
"EveryNoteLoader",
|
"EveryNoteLoader",
|
||||||
"AirbyteJSONLoader",
|
"AirbyteJSONLoader",
|
||||||
"OnlinePDFLoader",
|
"OnlinePDFLoader",
|
||||||
|
"PDFMinerLoader",
|
||||||
]
|
]
|
||||||
|
@ -27,3 +27,26 @@ class UnstructuredPDFLoader(BaseLoader):
|
|||||||
text = "\n\n".join([str(el) for el in elements])
|
text = "\n\n".join([str(el) for el in elements])
|
||||||
metadata = {"source": self.file_path}
|
metadata = {"source": self.file_path}
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
return [Document(page_content=text, metadata=metadata)]
|
||||||
|
|
||||||
|
|
||||||
|
class PDFMinerLoader(BaseLoader):
    """Loader that uses PDFMiner to load PDF files."""

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the PDF file that ``load`` passes to PDFMiner.

        Raises:
            ValueError: If the ``pdfminer`` package cannot be imported.
        """
        # Probe the optional dependency up front so a missing package is
        # reported at construction time rather than on the first ``load``.
        try:
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError as err:
            # Chain explicitly so the traceback names the failed import as
            # the direct cause; the exception type and message are unchanged.
            raise ValueError(
                "pdfminer package not found, please install it with "
                "`pip install pdfminer.six`"
            ) from err
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            A single-element list holding one ``Document`` whose
            ``page_content`` is the text extracted from ``file_path`` and
            whose metadata records that path under the ``"source"`` key.
        """
        # Imported here (not at module level) because pdfminer is optional.
        from pdfminer.high_level import extract_text

        text = extract_text(self.file_path)
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
|
||||||
|
Loading…
Reference in New Issue
Block a user