|
|
|
@ -337,75 +337,73 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "05187b33",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": []
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "21998d18",
|
|
|
|
|
"id": "96351714",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Using PDFMiner"
|
|
|
|
|
"## Using PyPDFium2"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"id": "2f0cc9ff",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"id": "003fcc1d",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"from langchain.document_loaders import PDFMinerLoader"
|
|
|
|
|
"from langchain.document_loaders import PyPDFium2Loader"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 8,
|
|
|
|
|
"id": "42b531e8",
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
"id": "46766e29",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")"
|
|
|
|
|
"loader = PyPDFium2Loader(\"example_data/layout-parser-paper.pdf\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
"id": "483720b5",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"data = loader.load()"
|
|
|
|
|
]
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"id": "96351714",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"# Using PyPDFium2"
|
|
|
|
|
]
|
|
|
|
|
"## Using PDFMiner"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"id": "003fcc1d",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"from langchain.document_loaders import PyPDFium2Loader"
|
|
|
|
|
]
|
|
|
|
|
"from langchain.document_loaders import PDFMinerLoader"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
"id": "46766e29",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"execution_count": 8,
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"loader = PyPDFium2Loader(\"example_data/layout-parser-paper.pdf\")"
|
|
|
|
|
]
|
|
|
|
|
"loader = PDFMinerLoader(\"example_data/layout-parser-paper.pdf\")"
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"collapsed": false
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
@ -422,7 +420,7 @@
|
|
|
|
|
"id": "c90a5fe8",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"## Using PDFMiner to generate HTML text"
|
|
|
|
|
"### Using PDFMiner to generate HTML text"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|