docs, langchain-unstructured: update langchain-unstructured docs and update unstructured-client dependency (#25451)

Be more explicit in the docs about when to create an instance of the
UnstructuredClient (to customize the client itself) versus passing SDK
parameters directly to the UnstructuredLoader.

Bump the unstructured-client dependency as discussed
[here](https://github.com/langchain-ai/langchain/discussions/25328#discussioncomment-10350949)

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
pull/25717/head
John 4 weeks ago committed by GitHub
parent 92abf62292
commit 5ce9a716a7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -105,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "79d3e549",
"metadata": {},
"outputs": [],
@ -131,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "8da59ef8",
"metadata": {},
"outputs": [
@ -139,17 +139,16 @@
"name": "stderr",
"output_type": "stream",
"text": [
"INFO: NumExpr defaulting to 12 threads.\n",
"INFO: pikepdf C++ to Python logger bridge initialized\n"
]
},
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
]
},
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -162,7 +161,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "97f7aa1f",
"metadata": {},
"outputs": [
@ -170,7 +169,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}\n"
"{'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}\n"
]
}
],
@ -188,17 +187,17 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "b05604d2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-02-27T15:49:27', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}, page_content='1 2 0 2')"
]
},
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -279,7 +278,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"id": "386eb63c",
"metadata": {},
"outputs": [
@ -299,7 +298,7 @@
"Document(metadata={'source': 'example_data/fake.docx', 'category_depth': 0, 'filename': 'fake.docx', 'languages': ['por', 'cat'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': '56d531394823d81787d77a04462ed096'}, page_content='Lorem ipsum dolor sit amet.')"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -327,7 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "a3d7c846",
"metadata": {},
"outputs": [
@ -375,16 +374,22 @@
"### Unstructured SDK Client\n",
"\n",
"Partitioning with the Unstructured API relies on the [Unstructured SDK\n",
"Client](https://docs.unstructured.io/api-reference/api-services/sdk).\n",
"\n",
"Below is an example showing how you can customize some features of the client and use your own `requests.Session()`, pass in an alternative `server_url`, or customize the `RetryConfig` object for more control over how failed requests are handled.\n",
"Client](https://docs.unstructured.io/api-reference/api-services/accessing-unstructured-api).\n",
"\n",
"Note that the example below may not use the latest version of the UnstructuredClient and there could be breaking changes in future releases. For the latest examples, refer to the [Unstructured Python SDK](https://docs.unstructured.io/api-reference/api-services/sdk-python) docs."
"If you want to customize the client, you will have to pass an `UnstructuredClient` instance to the `UnstructuredLoader`. Below is an example showing how you can customize features of the client such as using your own `requests.Session()`, passing an alternative `server_url`, and customizing the `RetryConfig` object. For more information about customizing the client or what additional parameters the sdk client accepts, refer to the [Unstructured Python SDK](https://docs.unstructured.io/api-reference/api-services/sdk-python) docs and the client section of the [API Parameters](https://docs.unstructured.io/api-reference/api-services/api-parameters) docs. Note that all API Parameters should be passed to the `UnstructuredLoader`."
]
},
{
"cell_type": "markdown",
"id": "ebb69c85",
"metadata": {},
"source": [
"<div class=\"alert alert-block alert-warning\"><b>Warning:</b> The example below may not use the latest version of the UnstructuredClient and there could be breaking changes in future releases. For the latest examples, refer to the <a href=\"https://docs.unstructured.io/api-reference/api-services/sdk-python\">Unstructured Python SDK</a> docs.</div>"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"id": "58e55264",
"metadata": {},
"outputs": [
@ -394,13 +399,15 @@
"text": [
"INFO: Preparing to split document for partition.\n",
"INFO: Concurrency level set to 5\n",
"INFO: Splitting pages 1 to 16 (16 total)\n",
"INFO: Determined optimal split size of 4 pages.\n",
"INFO: Partitioning 4 files with 4 page(s) each.\n",
"INFO: Partitioning set #1 (pages 1-4).\n",
"INFO: Partitioning set #2 (pages 5-8).\n",
"INFO: Partitioning set #3 (pages 9-12).\n",
"INFO: Partitioning set #4 (pages 13-16).\n",
"INFO: Splitting pages 1 to 10 (10 total)\n",
"INFO: Determined optimal split size of 2 pages.\n",
"INFO: Partitioning 5 files with 2 page(s) each.\n",
"INFO: Partitioning set #1 (pages 1-2).\n",
"INFO: Partitioning set #2 (pages 3-4).\n",
"INFO: Partitioning set #3 (pages 5-6).\n",
"INFO: Partitioning set #4 (pages 7-8).\n",
"INFO: Partitioning set #5 (pages 9-10).\n",
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
"INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general \"HTTP/1.1 200 OK\"\n",
@ -408,6 +415,7 @@
"INFO: Successfully partitioned set #2, elements added to the final result.\n",
"INFO: Successfully partitioned set #3, elements added to the final result.\n",
"INFO: Successfully partitioned set #4, elements added to the final result.\n",
"INFO: Successfully partitioned set #5, elements added to the final result.\n",
"INFO: Successfully partitioned the document.\n"
]
},
@ -429,8 +437,8 @@
" api_key_auth=os.getenv(\n",
" \"UNSTRUCTURED_API_KEY\"\n",
" ), # Note: the client API param is \"api_key_auth\" instead of \"api_key\"\n",
" client=requests.Session(),\n",
" server_url=\"https://api.unstructuredapp.io/general/v0/general\",\n",
" client=requests.Session(), # Define your own requests session\n",
" server_url=\"https://api.unstructuredapp.io/general/v0/general\", # Define your own api url\n",
" retry_config=RetryConfig(\n",
" strategy=\"backoff\",\n",
" retry_connection_errors=True,\n",
@ -440,13 +448,15 @@
" exponent=1.5,\n",
" max_elapsed_time=900000,\n",
" ),\n",
" ),\n",
" ), # Define your own retry config\n",
")\n",
"\n",
"loader = UnstructuredLoader(\n",
" \"./example_data/layout-parser-paper.pdf\",\n",
" partition_via_api=True,\n",
" client=client,\n",
" split_pdf_page=True,\n",
" split_pdf_page_range=[1, 10],\n",
")\n",
"\n",
"docs = loader.load()\n",
@ -479,17 +489,10 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"id": "e9f1c20d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Partitioning locally even though api_key is defined since partition_via_api=False.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
@ -542,7 +545,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.10.13"
}
},
"nbformat": 4,

@ -34,14 +34,14 @@ lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test
lint lint_diff lint_package lint_tests:
poetry run ruff .
poetry run ruff check .
poetry run ruff format $(PYTHON_FILES) --diff
poetry run ruff --select I $(PYTHON_FILES)
poetry run ruff check --select I $(PYTHON_FILES)
mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
format format_diff:
poetry run ruff format $(PYTHON_FILES)
poetry run ruff --select I --fix $(PYTHON_FILES)
poetry run ruff check --select I --fix $(PYTHON_FILES)
spell_check:
poetry run codespell --toml pyproject.toml

@ -3465,28 +3465,29 @@ pyasn1 = ">=0.1.3"
[[package]]
name = "ruff"
version = "0.1.15"
version = "0.5.7"
description = "An extremely fast Python linter and code formatter, written in Rust."
optional = false
python-versions = ">=3.7"
files = [
{file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5fe8d54df166ecc24106db7dd6a68d44852d14eb0729ea4672bb4d96c320b7df"},
{file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f0bfbb53c4b4de117ac4d6ddfd33aa5fc31beeaa21d23c45c6dd249faf9126f"},
{file = "ruff-0.1.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0d432aec35bfc0d800d4f70eba26e23a352386be3a6cf157083d18f6f5881c8"},
{file = "ruff-0.1.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9405fa9ac0e97f35aaddf185a1be194a589424b8713e3b97b762336ec79ff807"},
{file = "ruff-0.1.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c66ec24fe36841636e814b8f90f572a8c0cb0e54d8b5c2d0e300d28a0d7bffec"},
{file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:6f8ad828f01e8dd32cc58bc28375150171d198491fc901f6f98d2a39ba8e3ff5"},
{file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86811954eec63e9ea162af0ffa9f8d09088bab51b7438e8b6488b9401863c25e"},
{file = "ruff-0.1.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd4025ac5e87d9b80e1f300207eb2fd099ff8200fa2320d7dc066a3f4622dc6b"},
{file = "ruff-0.1.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b17b93c02cdb6aeb696effecea1095ac93f3884a49a554a9afa76bb125c114c1"},
{file = "ruff-0.1.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ddb87643be40f034e97e97f5bc2ef7ce39de20e34608f3f829db727a93fb82c5"},
{file = "ruff-0.1.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abf4822129ed3a5ce54383d5f0e964e7fef74a41e48eb1dfad404151efc130a2"},
{file = "ruff-0.1.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6c629cf64bacfd136c07c78ac10a54578ec9d1bd2a9d395efbee0935868bf852"},
{file = "ruff-0.1.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1bab866aafb53da39c2cadfb8e1c4550ac5340bb40300083eb8967ba25481447"},
{file = "ruff-0.1.15-py3-none-win32.whl", hash = "sha256:2417e1cb6e2068389b07e6fa74c306b2810fe3ee3476d5b8a96616633f40d14f"},
{file = "ruff-0.1.15-py3-none-win_amd64.whl", hash = "sha256:3837ac73d869efc4182d9036b1405ef4c73d9b1f88da2413875e34e0d6919587"},
{file = "ruff-0.1.15-py3-none-win_arm64.whl", hash = "sha256:9a933dfb1c14ec7a33cceb1e49ec4a16b51ce3c20fd42663198746efc0427360"},
{file = "ruff-0.1.15.tar.gz", hash = "sha256:f6dfa8c1b21c913c326919056c390966648b680966febcb796cc9d1aaab8564e"},
{file = "ruff-0.5.7-py3-none-linux_armv6l.whl", hash = "sha256:548992d342fc404ee2e15a242cdbea4f8e39a52f2e7752d0e4cbe88d2d2f416a"},
{file = "ruff-0.5.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:00cc8872331055ee017c4f1071a8a31ca0809ccc0657da1d154a1d2abac5c0be"},
{file = "ruff-0.5.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eaf3d86a1fdac1aec8a3417a63587d93f906c678bb9ed0b796da7b59c1114a1e"},
{file = "ruff-0.5.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a01c34400097b06cf8a6e61b35d6d456d5bd1ae6961542de18ec81eaf33b4cb8"},
{file = "ruff-0.5.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcc8054f1a717e2213500edaddcf1dbb0abad40d98e1bd9d0ad364f75c763eea"},
{file = "ruff-0.5.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f70284e73f36558ef51602254451e50dd6cc479f8b6f8413a95fcb5db4a55fc"},
{file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:a78ad870ae3c460394fc95437d43deb5c04b5c29297815a2a1de028903f19692"},
{file = "ruff-0.5.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ccd078c66a8e419475174bfe60a69adb36ce04f8d4e91b006f1329d5cd44bcf"},
{file = "ruff-0.5.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e31c9bad4ebf8fdb77b59cae75814440731060a09a0e0077d559a556453acbb"},
{file = "ruff-0.5.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d796327eed8e168164346b769dd9a27a70e0298d667b4ecee6877ce8095ec8e"},
{file = "ruff-0.5.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:4a09ea2c3f7778cc635e7f6edf57d566a8ee8f485f3c4454db7771efb692c499"},
{file = "ruff-0.5.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a36d8dcf55b3a3bc353270d544fb170d75d2dff41eba5df57b4e0b67a95bb64e"},
{file = "ruff-0.5.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9369c218f789eefbd1b8d82a8cf25017b523ac47d96b2f531eba73770971c9e5"},
{file = "ruff-0.5.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b88ca3db7eb377eb24fb7c82840546fb7acef75af4a74bd36e9ceb37a890257e"},
{file = "ruff-0.5.7-py3-none-win32.whl", hash = "sha256:33d61fc0e902198a3e55719f4be6b375b28f860b09c281e4bdbf783c0566576a"},
{file = "ruff-0.5.7-py3-none-win_amd64.whl", hash = "sha256:083bbcbe6fadb93cd86709037acc510f86eed5a314203079df174c40bbbca6b3"},
{file = "ruff-0.5.7-py3-none-win_arm64.whl", hash = "sha256:2dca26154ff9571995107221d0aeaad0e75a77b5a682d6236cf89a58c70b76f4"},
{file = "ruff-0.5.7.tar.gz", hash = "sha256:8dfc0a458797f5d9fb622dd0efc52d796f23f0a1493a9527f4e49a550ae9a7e5"},
]
[[package]]
@ -4256,13 +4257,13 @@ xlsx = ["networkx", "openpyxl", "pandas", "xlrd"]
[[package]]
name = "unstructured-client"
version = "0.24.1"
version = "0.25.5"
description = "Python Client SDK for Unstructured API"
optional = false
python-versions = ">=3.8"
files = [
{file = "unstructured-client-0.24.1.tar.gz", hash = "sha256:1bd82a532497783dd77b30ed4e56837d6abfae8cc6d61442acac0bcacbd568c8"},
{file = "unstructured_client-0.24.1-py3-none-any.whl", hash = "sha256:044dab0c3079f908f6adf7088ad44f0e17476b47e2b04e0de608134a482bd0e3"},
{file = "unstructured-client-0.25.5.tar.gz", hash = "sha256:adb97ea56ce65f8b277d5b05f093e9d13a3320ac8dea7265ffa71f5e13ed5f84"},
{file = "unstructured_client-0.25.5-py3-none-any.whl", hash = "sha256:23537fee984e43d06a75f986a73e420a9659cc92010afb8324fbf67c85962eaf"},
]
[package.dependencies]
@ -4472,4 +4473,4 @@ local = ["unstructured"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<4.0"
content-hash = "d95a01d052e3f6175a45c5a589692274300a88782938ed71f835c5f68842d821"
content-hash = "d4f2a9a001f41c52964628e5d82d74e0938c75f75dbc5a08ff9d829c53fb74b9"

@ -14,7 +14,7 @@ license = "MIT"
[tool.poetry.dependencies]
python = ">=3.9,<4.0"
langchain-core = "^0.2.23"
unstructured-client = { version = "^0.24.1" }
unstructured-client = { version = "^0.25.0" }
unstructured = { version = "^0.15.7", optional = true, python = "<3.13", extras = [
"all-docs",
] }
@ -46,7 +46,7 @@ optional = true
optional = true
[tool.poetry.group.lint.dependencies]
ruff = "^0.1.8"
ruff = "^0.5"
[tool.poetry.group.typing.dependencies]
mypy = "^1.7.1"

Loading…
Cancel
Save