From bcab894f4ed4d9cee502b7fac30b8331fa38d6ec Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Mon, 10 Jul 2023 03:07:10 -0400 Subject: [PATCH] feat: Add `UnstructuredTSVLoader` (#7367) ### Summary Adds an `UnstructuredTSVLoader` for TSV files. Also updates the doc strings for `UnstructuredCSV` and `UnstructuredExcel` loaders. ### Testing ```python from langchain.document_loaders.tsv import UnstructuredTSVLoader loader = UnstructuredTSVLoader( file_path="example_data/mlb_teams_2012.csv", mode="elements" ) docs = loader.load() ``` --- .../example_data/stanley-cups.tsv | 5 + .../document_loaders/integrations/tsv.ipynb | 181 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/csv_loader.py | 16 +- langchain/document_loaders/excel.py | 16 +- langchain/document_loaders/tsv.py | 35 ++++ .../document_loaders/test_tsv.py | 15 ++ .../examples/stanley-cups.tsv | 5 + 8 files changed, 273 insertions(+), 2 deletions(-) create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/example_data/stanley-cups.tsv create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/tsv.ipynb create mode 100644 langchain/document_loaders/tsv.py create mode 100644 tests/integration_tests/document_loaders/test_tsv.py create mode 100644 tests/integration_tests/examples/stanley-cups.tsv diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/stanley-cups.tsv b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/stanley-cups.tsv new file mode 100644 index 0000000000..314be466da --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/stanley-cups.tsv @@ -0,0 +1,5 @@ +Stanley Cups +Team Location Stanley Cups +Blues STL 1 +Flyers PHI 2 +Maple Leafs TOR 13 diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/tsv.ipynb 
b/docs/extras/modules/data_connection/document_loaders/integrations/tsv.ipynb new file mode 100644 index 0000000000..f959ab6b74 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/tsv.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TSV\n", + "\n", + ">A [tab-separated values (TSV)](https://en.wikipedia.org/wiki/Tab-separated_values) file is a simple, text-based file format for storing tabular data.[3] Records are separated by newlines, and values within a record are separated by tab characters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `UnstructuredTSVLoader`\n", + "\n", + "You can also load the table using the `UnstructuredTSVLoader`. One advantage of using `UnstructuredTSVLoader` is that if you use it in `\"elements\"` mode, an HTML representation of the table will be available in the metadata." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.tsv import UnstructuredTSVLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredTSVLoader(\n", + " file_path=\"example_data/mlb_teams_2012.csv\", mode=\"elements\"\n", + ")\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Nationals, 81.34, 98
Reds, 82.20, 97
Yankees, 197.96, 95
Giants, 117.62, 94
Braves, 83.31, 94
Athletics, 55.37, 94
Rangers, 120.51, 93
Orioles, 81.43, 93
Rays, 64.17, 90
Angels, 154.49, 89
Tigers, 132.30, 88
Cardinals, 110.30, 88
Dodgers, 95.14, 86
White Sox, 96.92, 85
Brewers, 97.65, 83
Phillies, 174.54, 81
Diamondbacks, 74.28, 81
Pirates, 63.43, 79
Padres, 55.24, 76
Mariners, 81.97, 75
Mets, 93.35, 74
Blue Jays, 75.48, 73
Royals, 60.91, 72
Marlins, 118.07, 69
Red Sox, 173.18, 69
Indians, 78.43, 68
Twins, 94.08, 66
Rockies, 78.06, 64
Cubs, 88.19, 61
Astros, 60.65, 55
\n" + ] + } + ], + "source": [ + "print(docs[0].metadata[\"text_as_html\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index b6cba9d6f9..fd32632354 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -122,6 +122,7 @@ from langchain.document_loaders.text import TextLoader from langchain.document_loaders.tomarkdown import ToMarkdownLoader from langchain.document_loaders.toml import TomlLoader from langchain.document_loaders.trello import TrelloLoader +from langchain.document_loaders.tsv import UnstructuredTSVLoader from langchain.document_loaders.twitter import TwitterTweetLoader from langchain.document_loaders.unstructured import ( UnstructuredAPIFileIOLoader, @@ -278,6 +279,7 @@ __all__ = [ "UnstructuredPowerPointLoader", "UnstructuredRSTLoader", "UnstructuredRTFLoader", + "UnstructuredTSVLoader", "UnstructuredURLLoader", "UnstructuredWordDocumentLoader", "UnstructuredXMLLoader", diff --git a/langchain/document_loaders/csv_loader.py b/langchain/document_loaders/csv_loader.py index 17bb84df26..9a5289966b 100644 --- a/langchain/document_loaders/csv_loader.py +++ b/langchain/document_loaders/csv_loader.py @@ -78,7 +78,21 @@ class CSVLoader(BaseLoader): class UnstructuredCSVLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load CSV files.""" + """Loader that uses unstructured to load CSV files. 
Like other + Unstructured loaders, UnstructuredCSVLoader can be used in both + "single" and "elements" mode. If you use the loader in "elements" + mode, the CSV file will be a single Unstructured Table element. + If you use the loader in "elements" mode, an HTML representation + of the table will be available in the "text_as_html" key in the + document metadata. + + Examples + -------- + from langchain.document_loaders.csv_loader import UnstructuredCSVLoader + + loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements") + docs = loader.load() + """ def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any diff --git a/langchain/document_loaders/excel.py b/langchain/document_loaders/excel.py index 54e96bf269..946430f0c6 100644 --- a/langchain/document_loaders/excel.py +++ b/langchain/document_loaders/excel.py @@ -8,7 +8,21 @@ from langchain.document_loaders.unstructured import ( class UnstructuredExcelLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load Microsoft Excel files.""" + """Loader that uses unstructured to load Excel files. Like other + Unstructured loaders, UnstructuredExcelLoader can be used in both + "single" and "elements" mode. If you use the loader in "elements" + mode, each sheet in the Excel file will be an Unstructured Table + element. If you use the loader in "elements" mode, an + HTML representation of the table will be available in the + "text_as_html" key in the document metadata. 
+ + Examples + -------- + from langchain.document_loaders.excel import UnstructuredExcelLoader + + loader = UnstructuredExcelLoader("stanley-cups.xlsx", mode="elements") + docs = loader.load() + """ def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any diff --git a/langchain/document_loaders/tsv.py b/langchain/document_loaders/tsv.py new file mode 100644 index 0000000000..5a5c7b6d7c --- /dev/null +++ b/langchain/document_loaders/tsv.py @@ -0,0 +1,35 @@ +from typing import Any, List + +from langchain.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) + + +class UnstructuredTSVLoader(UnstructuredFileLoader): + """Loader that uses unstructured to load TSV files. Like other + Unstructured loaders, UnstructuredTSVLoader can be used in both + "single" and "elements" mode. If you use the loader in "elements" + mode, the TSV file will be a single Unstructured Table element. + If you use the loader in "elements" mode, an HTML representation + of the table will be available in the "text_as_html" key in the + document metadata. 
+ + Examples + -------- + from langchain.document_loaders.tsv import UnstructuredTSVLoader + + loader = UnstructuredTSVLoader("stanley-cups.tsv", mode="elements") + docs = loader.load() + """ + + def __init__( + self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + ): + validate_unstructured_version(min_unstructured_version="0.7.6") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + + def _get_elements(self) -> List: + from unstructured.partition.tsv import partition_tsv + + return partition_tsv(filename=self.file_path, **self.unstructured_kwargs) diff --git a/tests/integration_tests/document_loaders/test_tsv.py b/tests/integration_tests/document_loaders/test_tsv.py new file mode 100644 index 0000000000..2834fc61c3 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_tsv.py @@ -0,0 +1,15 @@ +import os +from pathlib import Path + +from langchain.document_loaders import UnstructuredTSVLoader + +EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples" + + +def test_unstructured_tsv_loader() -> None: + """Test unstructured loader.""" + file_path = os.path.join(EXAMPLE_DIRECTORY, "stanley-cups.tsv") + loader = UnstructuredTSVLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 diff --git a/tests/integration_tests/examples/stanley-cups.tsv b/tests/integration_tests/examples/stanley-cups.tsv new file mode 100644 index 0000000000..314be466da --- /dev/null +++ b/tests/integration_tests/examples/stanley-cups.tsv @@ -0,0 +1,5 @@ +Stanley Cups +Team Location Stanley Cups +Blues STL 1 +Flyers PHI 2 +Maple Leafs TOR 13