From e4224a396b07b29dab1c03ffa46efcc58ffa68e5 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Sat, 10 Jun 2023 18:24:42 -0500 Subject: [PATCH] feat: Add `UnstructuredXMLLoader` for `.xml` files (#5955) # Unstructured XML Loader Adds an `UnstructuredXMLLoader` class for .xml files. Works with unstructured>=0.6.7. A plain-text representation of the text extracted from the XML tags is available under the `page_content` attribute of the loaded document. ### Testing ```python from langchain.document_loaders import UnstructuredXMLLoader loader = UnstructuredXMLLoader( "example_data/factbook.xml", ) docs = loader.load() ``` ## Who can review? @hwchase17 @eyurtsev --- .../examples/example_data/factbook.xml | 27 +++++++ .../document_loaders/examples/xml.ipynb | 78 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/xml.py | 22 ++++++ .../document_loaders/test_xml.py | 15 ++++ tests/integration_tests/examples/factbook.xml | 27 +++++++ 6 files changed, 171 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/example_data/factbook.xml create mode 100644 docs/modules/indexes/document_loaders/examples/xml.ipynb create mode 100644 langchain/document_loaders/xml.py create mode 100644 tests/integration_tests/document_loaders/test_xml.py create mode 100644 tests/integration_tests/examples/factbook.xml diff --git a/docs/modules/indexes/document_loaders/examples/example_data/factbook.xml b/docs/modules/indexes/document_loaders/examples/example_data/factbook.xml new file mode 100644 index 00000000..d059ee9d --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/example_data/factbook.xml @@ -0,0 +1,27 @@ + + + + United States + Washington, DC + Joe Biden + Baseball + + + Canada + Ottawa + Justin Trudeau + Hockey + + + France + Paris + Emmanuel Macron + Soccer + + + Trinidad & Tobado + Port of Spain + Keith Rowley + Track & Field + + diff --git 
a/docs/modules/indexes/document_loaders/examples/xml.ipynb b/docs/modules/indexes/document_loaders/examples/xml.ipynb new file mode 100644 index 00000000..5c959868 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/xml.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "22a849cc", + "metadata": {}, + "source": [ + "# XML\n", + "\n", + "The `UnstructuredXMLLoader` is used to load `XML` files. The loader works with `.xml` files. The page content will be the text extracted from the XML tags." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e6616e3a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredXMLLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a654e4d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='United States\\n\\nWashington, DC\\n\\nJoe Biden\\n\\nBaseball\\n\\nCanada\\n\\nOttawa\\n\\nJustin Trudeau\\n\\nHockey\\n\\nFrance\\n\\nParis\\n\\nEmmanuel Macron\\n\\nSoccer\\n\\nTrinidad & Tobado\\n\\nPort of Spain\\n\\nKeith Rowley\\n\\nTrack & Field', metadata={'source': 'example_data/factbook.xml'})" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader = UnstructuredXMLLoader(\n", + " \"example_data/factbook.xml\",\n", + ")\n", + "docs = loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a54342bb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index d533159a..87d2335e 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -121,6 +121,7 @@ from langchain.document_loaders.word_document import ( Docx2txtLoader, UnstructuredWordDocumentLoader, ) +from langchain.document_loaders.xml import UnstructuredXMLLoader from langchain.document_loaders.youtube import ( GoogleApiClient, GoogleApiYoutubeLoader, @@ -242,6 +243,7 @@ __all__ = [ "UnstructuredRTFLoader", "UnstructuredURLLoader", "UnstructuredWordDocumentLoader", + "UnstructuredXMLLoader", "WeatherDataLoader", "WebBaseLoader", "WhatsAppChatLoader", diff --git a/langchain/document_loaders/xml.py b/langchain/document_loaders/xml.py new file mode 100644 index 00000000..78156ee2 --- /dev/null +++ b/langchain/document_loaders/xml.py @@ -0,0 +1,22 @@ +"""Loader that loads XML files.""" +from typing import Any, List + +from langchain.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) + + +class UnstructuredXMLLoader(UnstructuredFileLoader): + """Loader that uses unstructured to load XML files.""" + + def __init__( + self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + ): + validate_unstructured_version(min_unstructured_version="0.6.7") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + + def _get_elements(self) -> List: + from unstructured.partition.xml import partition_xml + + return partition_xml(filename=self.file_path, **self.unstructured_kwargs) diff --git a/tests/integration_tests/document_loaders/test_xml.py b/tests/integration_tests/document_loaders/test_xml.py new file mode 100644 index 00000000..a4ea69e7 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_xml.py @@ -0,0 +1,15 @@ +import os +from pathlib import Path + +from langchain.document_loaders import UnstructuredXMLLoader + 
+EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples" + + +def test_unstructured_xml_loader() -> None: + """Test that UnstructuredXMLLoader loads factbook.xml into one document.""" + file_path = os.path.join(EXAMPLE_DIRECTORY, "factbook.xml") + loader = UnstructuredXMLLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 diff --git a/tests/integration_tests/examples/factbook.xml b/tests/integration_tests/examples/factbook.xml new file mode 100644 index 00000000..d059ee9d --- /dev/null +++ b/tests/integration_tests/examples/factbook.xml @@ -0,0 +1,27 @@ + + + + United States + Washington, DC + Joe Biden + Baseball + + + Canada + Ottawa + Justin Trudeau + Hockey + + + France + Paris + Emmanuel Macron + Soccer + + + Trinidad & Tobado + Port of Spain + Keith Rowley + Track & Field + +