You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

157 lines
20 KiB
Plaintext

1 year ago
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"jukit_cell_id": "4yTe29l2Ya"
},
"source": [
"# Document Loaders\n",
"\n",
"- loading text from local sources\n",
"- main driver is `Unstructured` python package\n",
"\n",
"## Key Concepts\n",
"\n",
"### Document\n",
"\n",
"container class for document information. contains:\n",
" - page_content\n",
" - metadata\n",
"\n",
"### Loader\n",
"\n",
"base class to load documents. exposes:\n",
" - load() -> Document\n",
"\n",
"\n",
"## Setup Unstructured\n",
"- host dependencies\n",
" - poppler: PDF rendering library\n",
"- Python deps:\n",
" - Pillow: imaging library"
]
},
{
"cell_type": "code",
"metadata": {
"jukit_cell_id": "srwyN0cVES"
},
"source": [
"# %pip install pillow (already installed)\n",
"%pip install -q unstructured[local-inference]"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Note: you may need to restart the kernel to use updated packages.\n"
}
],
"execution_count": 1
},
{
"cell_type": "code",
"metadata": {
"jukit_cell_id": "cbFv0eSeXq"
},
"source": [
"docs_dir=\"unstructured-examples\"\n",
"!mkdir -p $docs_dir\n",
"!wget https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/example-docs/example-10k.html -P $docs_dir\n",
"!wget https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/example-docs/layout-parser-paper.pdf -P $docs_dir"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"metadata": {
"jukit_cell_id": "U633RkWjYq"
},
"source": [
"[repo link](https://github.com/Unstructured-IO/unstructured#coffee-getting-started)\n",
"The easiest way to parse a document in unstructured is to use the partition brick. If you use partition brick, unstructured will detect the file type and route it to the appropriate file-specific partitioning brick. If you are using the partition brick, ensure you first install libmagic using the instructions outlined here partition will always apply the default arguments. If you need advanced features, use a document-specific brick. The partition brick currently works for .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.\n",
"\n",
"Requires detectonr2 inference (cuda ?)"
]
},
{
"cell_type": "code",
"metadata": {
"jukit_cell_id": "FJaYuFeL0U"
},
"source": [
"docs_dir=\"unstructured-examples\""
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"metadata": {
"jukit_cell_id": "9MKaXz7Bi4"
},
"source": [
"#NOTE: needs inference with facebook's detectron2\n",
"\n",
"# from unstructured.partition.auto import partition\n",
"\n",
"# elements = partition(docs_dir + \"/layout-parser-paper.pdf\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"metadata": {
"jukit_cell_id": "X4mTVZAzcD"
},
"source": [
"## Unstructured Langchain FileLoader\n",
"\n",
"Requires detectron2"
]
},
{
"cell_type": "code",
"metadata": {
"jukit_cell_id": "9k0eAtsfvh"
},
"source": [
"from langchain.document_loaders import UnstructuredFileLoader\n",
"\n",
"loader = UnstructuredFileLoader(\"./unstructured-examples/layout-parser-paper.pdf\")\n",
"\n",
"docs = loader.load()"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)\nCell \u001b[0;32mIn[10], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UnstructuredFileLoader\n\u001b[1;32m 3\u001b[0m loader \u001b[38;5;241m=\u001b[39m UnstructuredFileLoader(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./unstructured-examples/layout-parser-paper.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\nFile \u001b[0;32m/data/source/langchain/langchain/document_loaders/unstructured.py:26\u001b[0m, in \u001b[0;36mUnstructuredFileLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load file.\"\"\"\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01munstructured\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpartition\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mauto\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m partition\n\u001b[0;32m---> 26\u001b[0m elements \u001b[38;5;241m=\u001b[39m \u001b[43mpartition\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([\u001b[38;5;28mstr\u001b[39m(el) \u001b[38;5;28;01mfor\u001b[39;00m el \u001b[38;5;129;01min\u001b[39;00m elements])\n\u001b[1;32m 28\u001b[0m metadata \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msource\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path}\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured/partition/auto.py:44\u001b[0m, in \u001b[0;36mpartition\u001b[0;34m(filename, file, include_page_breaks)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_html(filename\u001b[38;5;241m=\u001b[39mfilename, file\u001b[38;5;241m=\u001b[39mfile, include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m filetype \u001b[38;5;241m==\u001b[39m FileType\u001b[38;5;241m.\u001b[39mPDF:\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpartition_pdf\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 45\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[1;32m 46\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[1;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 48\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 49\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m (filetype \u001b[38;5;241m==\u001b[39m
}
],
"execution_count": 2
},
{
"cell_type": "code",
"metadata": {
"jukit_cell_id": "1lKP9jNDd4"
},
"source": [],
"outputs": [],
"execution_count": null
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "python",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}