From 11fec7d4d1c7aeb3c6b6df2e10f8bd862ab5f6c5 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 7 Jun 2023 22:18:01 -0400 Subject: [PATCH] feat: Add `UnstructuredCSVLoader` for CSV files (#5844) ### Summary Adds an `UnstructuredCSVLoader` for loading CSVs. One advantage of using `UnstructuredCSVLoader` relative to the standard `CSVLoader` is that if you use `UnstructuredCSVLoader` in `"elements"` mode, an HTML representation of the table will be available in the metadata. #### Who can review? @hwchase17 @eyurtsev --- .../document_loaders/examples/csv.ipynb | 211 +++++++++++++++++- langchain/document_loaders/__init__.py | 3 +- langchain/document_loaders/csv_loader.py | 21 +- .../document_loaders/test_csv_loader.py | 15 ++ .../examples/stanley-cups.csv | 5 + 5 files changed, 248 insertions(+), 7 deletions(-) create mode 100644 tests/integration_tests/document_loaders/test_csv_loader.py create mode 100644 tests/integration_tests/examples/stanley-cups.csv diff --git a/docs/modules/indexes/document_loaders/examples/csv.ipynb b/docs/modules/indexes/document_loaders/examples/csv.ipynb index 6b62950ba6..e6555437fc 100644 --- a/docs/modules/indexes/document_loaders/examples/csv.ipynb +++ b/docs/modules/indexes/document_loaders/examples/csv.ipynb @@ -29,7 +29,6 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -45,7 +44,6 @@ "cell_type": "code", "execution_count": 27, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -76,7 +74,6 @@ "cell_type": "code", "execution_count": 28, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -96,7 +93,6 @@ "cell_type": "code", "execution_count": 29, "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false } @@ -152,6 +148,211 @@ "source": [ "print(data)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `UnstructuredCSVLoader`\n", + "\n", + "You can also load the 
table using the `UnstructuredCSVLoader`. One advantage of using `UnstructuredCSVLoader` is that if you use it in `\"elements\"` mode, an HTML representation of the table will be available in the metadata." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.csv_loader import UnstructuredCSVLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredCSVLoader(file_path='example_data/mlb_teams_2012.csv', mode=\"elements\")\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
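<tr><td>Nationals</td><td>81.34</td><td>98</td></tr>
<tr><td>Reds</td><td>82.20</td><td>97</td></tr>
<tr><td>Yankees</td><td>197.96</td><td>95</td></tr>
<tr><td>Giants</td><td>117.62</td><td>94</td></tr>
<tr><td>Braves</td><td>83.31</td><td>94</td></tr>
<tr><td>Athletics</td><td>55.37</td><td>94</td></tr>
<tr><td>Rangers</td><td>120.51</td><td>93</td></tr>
<tr><td>Orioles</td><td>81.43</td><td>93</td></tr>
<tr><td>Rays</td><td>64.17</td><td>90</td></tr>
<tr><td>Angels</td><td>154.49</td><td>89</td></tr>
<tr><td>Tigers</td><td>132.30</td><td>88</td></tr>
<tr><td>Cardinals</td><td>110.30</td><td>88</td></tr>
<tr><td>Dodgers</td><td>95.14</td><td>86</td></tr>
<tr><td>White Sox</td><td>96.92</td><td>85</td></tr>
<tr><td>Brewers</td><td>97.65</td><td>83</td></tr>
<tr><td>Phillies</td><td>174.54</td><td>81</td></tr>
<tr><td>Diamondbacks</td><td>74.28</td><td>81</td></tr>
<tr><td>Pirates</td><td>63.43</td><td>79</td></tr>
<tr><td>Padres</td><td>55.24</td><td>76</td></tr>
<tr><td>Mariners</td><td>81.97</td><td>75</td></tr>
<tr><td>Mets</td><td>93.35</td><td>74</td></tr>
<tr><td>Blue Jays</td><td>75.48</td><td>73</td></tr>
<tr><td>Royals</td><td>60.91</td><td>72</td></tr>
<tr><td>Marlins</td><td>118.07</td><td>69</td></tr>
<tr><td>Red Sox</td><td>173.18</td><td>69</td></tr>
<tr><td>Indians</td><td>78.43</td><td>68</td></tr>
<tr><td>Twins</td><td>94.08</td><td>66</td></tr>
<tr><td>Rockies</td><td>78.06</td><td>64</td></tr>
<tr><td>Cubs</td><td>88.19</td><td>61</td></tr>
<tr><td>Astros</td><td>60.65</td><td>55</td></tr>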
\n" + ] + } + ], + "source": [ + "print(docs[0].metadata[\"text_as_html\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -170,7 +371,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3ec4db3534..83b6330adc 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -19,7 +19,7 @@ from langchain.document_loaders.chatgpt import ChatGPTLoader from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.confluence import ConfluenceLoader from langchain.document_loaders.conllu import CoNLLULoader -from langchain.document_loaders.csv_loader import CSVLoader +from langchain.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader from langchain.document_loaders.dataframe import DataFrameLoader from langchain.document_loaders.diffbot import DiffbotLoader from langchain.document_loaders.directory import DirectoryLoader @@ -222,6 +222,7 @@ __all__ = [ "TwitterTweetLoader", "UnstructuredAPIFileIOLoader", "UnstructuredAPIFileLoader", + "UnstructuredCSVLoader", "UnstructuredEPubLoader", "UnstructuredEmailLoader", "UnstructuredExcelLoader", diff --git a/langchain/document_loaders/csv_loader.py b/langchain/document_loaders/csv_loader.py index a844f94b1d..3d5e47b1a6 100644 --- a/langchain/document_loaders/csv_loader.py +++ b/langchain/document_loaders/csv_loader.py @@ -1,8 +1,12 @@ import csv -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, 
+) class CSVLoader(BaseLoader): @@ -61,3 +65,18 @@ class CSVLoader(BaseLoader): docs.append(doc) return docs + + +class UnstructuredCSVLoader(UnstructuredFileLoader): + """Loader that uses unstructured to load CSV files.""" + + def __init__( + self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + ): + validate_unstructured_version(min_unstructured_version="0.6.8") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + + def _get_elements(self) -> List: + from unstructured.partition.csv import partition_csv + + return partition_csv(filename=self.file_path, **self.unstructured_kwargs) diff --git a/tests/integration_tests/document_loaders/test_csv_loader.py b/tests/integration_tests/document_loaders/test_csv_loader.py new file mode 100644 index 0000000000..ffce01cf17 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_csv_loader.py @@ -0,0 +1,15 @@ +import os +from pathlib import Path + +from langchain.document_loaders import UnstructuredCSVLoader + +EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples" + + +def test_unstructured_csv_loader() -> None: + """Test unstructured loader.""" + file_path = os.path.join(EXAMPLE_DIRECTORY, "stanley-cups.csv") + loader = UnstructuredCSVLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 diff --git a/tests/integration_tests/examples/stanley-cups.csv b/tests/integration_tests/examples/stanley-cups.csv new file mode 100644 index 0000000000..4414023f00 --- /dev/null +++ b/tests/integration_tests/examples/stanley-cups.csv @@ -0,0 +1,5 @@ +Stanley Cups,, +Team,Location,Stanley Cups +Blues,STL,1 +Flyers,PHI,2 +Maple Leafs,TOR,13 \ No newline at end of file