feat: Add `UnstructuredTSVLoader` (#7367)

### Summary

Adds an `UnstructuredTSVLoader` for TSV files. Also updates the docstrings for the `UnstructuredCSVLoader` and `UnstructuredExcelLoader` classes.

### Testing

```python
from langchain.document_loaders.tsv import UnstructuredTSVLoader

loader = UnstructuredTSVLoader(
    file_path="example_data/mlb_teams_2012.csv", mode="elements"
)
docs = loader.load()
```
1 year ago · bcab894f4e
parent 490f4a9ff0
commit bcab894f4e
8 changed files with 273 additions and 2 deletions
--- a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/stanley-cups.tsv
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/stanley-cups.tsv
@ -0,0 +1,5 @@
+Stanley Cups		
+Team	Location	Stanley Cups
+Blues	STL	1
+Flyers	PHI	2
+Maple Leafs	TOR	13
--- a/docs/extras/modules/data_connection/document_loaders/integrations/tsv.ipynb
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/tsv.ipynb
@ -0,0 +1,181 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TSV\n",
+    "\n",
+    ">A [tab-separated values (TSV)](https://en.wikipedia.org/wiki/Tab-separated_values) file is a simple, text-based file format for storing tabular data. Records are separated by newlines, and values within a record are separated by tab characters."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## `UnstructuredTSVLoader`\n",
+    "\n",
+    "You can load a TSV file using the `UnstructuredTSVLoader`. If you use it in `\"elements\"` mode, an HTML representation of the table will be available in the `text_as_html` key of the document metadata."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders.tsv import UnstructuredTSVLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredTSVLoader(\n",
+    "    file_path=\"example_data/mlb_teams_2012.csv\", mode=\"elements\"\n",
+    ")\n",
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<table border=\"1\" class=\"dataframe\">\n",
+      "  <tbody>\n",
+      "    <tr>\n",
+      "      <td>Nationals,     81.34, 98</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Reds,          82.20, 97</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Yankees,      197.96, 95</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Giants,       117.62, 94</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Braves,        83.31, 94</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Athletics,     55.37, 94</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Rangers,      120.51, 93</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Orioles,       81.43, 93</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Rays,          64.17, 90</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Angels,       154.49, 89</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Tigers,       132.30, 88</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Cardinals,    110.30, 88</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Dodgers,       95.14, 86</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>White Sox,     96.92, 85</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Brewers,       97.65, 83</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Phillies,     174.54, 81</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Diamondbacks,  74.28, 81</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Pirates,       63.43, 79</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Padres,        55.24, 76</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Mariners,      81.97, 75</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Mets,          93.35, 74</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Blue Jays,     75.48, 73</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Royals,        60.91, 72</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Marlins,      118.07, 69</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Red Sox,      173.18, 69</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Indians,       78.43, 68</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Twins,         94.08, 66</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Rockies,       78.06, 64</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Cubs,          88.19, 61</td>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <td>Astros,        60.65, 55</td>\n",
+      "    </tr>\n",
+      "  </tbody>\n",
+      "</table>\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(docs[0].metadata[\"text_as_html\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -122,6 +122,7 @@ from langchain.document_loaders.text import TextLoader
 from langchain.document_loaders.tomarkdown import ToMarkdownLoader
 from langchain.document_loaders.toml import TomlLoader
 from langchain.document_loaders.trello import TrelloLoader
+from langchain.document_loaders.tsv import UnstructuredTSVLoader
 from langchain.document_loaders.twitter import TwitterTweetLoader
 from langchain.document_loaders.unstructured import (
    UnstructuredAPIFileIOLoader,
@ -278,6 +279,7 @@ __all__ = [
    "UnstructuredPowerPointLoader",
    "UnstructuredRSTLoader",
    "UnstructuredRTFLoader",
+    "UnstructuredTSVLoader",
    "UnstructuredURLLoader",
    "UnstructuredWordDocumentLoader",
    "UnstructuredXMLLoader",
--- a/langchain/document_loaders/csv_loader.py
+++ b/langchain/document_loaders/csv_loader.py
@ -78,7 +78,21 @@ class CSVLoader(BaseLoader):


 class UnstructuredCSVLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load CSV files."""
+    """Loader that uses unstructured to load CSV files.
+
+    Like other Unstructured loaders, UnstructuredCSVLoader can be used in
+    both "single" and "elements" mode. In "elements" mode the CSV file
+    is loaded as a single Unstructured Table element, and an HTML
+    representation of the table is available under the "text_as_html"
+    key in the document metadata.
+
+    Examples
+    --------
+    from langchain.document_loaders.csv_loader import UnstructuredCSVLoader
+
+    loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
+    docs = loader.load()
+    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
--- a/langchain/document_loaders/excel.py
+++ b/langchain/document_loaders/excel.py
@ -8,7 +8,21 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredExcelLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load Microsoft Excel files."""
+    """Loader that uses unstructured to load Excel files.
+
+    Like other Unstructured loaders, UnstructuredExcelLoader can be used in
+    both "single" and "elements" mode. In "elements" mode each sheet in
+    the Excel file is loaded as an Unstructured Table element, and an HTML
+    representation of the table is available under the "text_as_html"
+    key in the document metadata.
+
+    Examples
+    --------
+    from langchain.document_loaders.excel import UnstructuredExcelLoader
+
+    loader = UnstructuredExcelLoader("stanley-cups.xlsx", mode="elements")
+    docs = loader.load()
+    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
--- a/langchain/document_loaders/tsv.py
+++ b/langchain/document_loaders/tsv.py
@ -0,0 +1,35 @@
+from typing import Any, List
+
+from langchain.document_loaders.unstructured import (
+    UnstructuredFileLoader,
+    validate_unstructured_version,
+)
+
+
+class UnstructuredTSVLoader(UnstructuredFileLoader):
+    """Loader that uses unstructured to load TSV files.
+
+    Like other Unstructured loaders, UnstructuredTSVLoader can be used in
+    both "single" and "elements" mode. In "elements" mode the TSV file
+    is loaded as a single Unstructured Table element, and an HTML
+    representation of the table is available under the "text_as_html"
+    key in the document metadata.
+
+    Examples
+    --------
+    from langchain.document_loaders.tsv import UnstructuredTSVLoader
+
+    loader = UnstructuredTSVLoader("stanley-cups.tsv", mode="elements")
+    docs = loader.load()
+    """
+
+    def __init__(
+        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+    ):
+        validate_unstructured_version(min_unstructured_version="0.7.6")
+        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
+
+    def _get_elements(self) -> List:
+        from unstructured.partition.tsv import partition_tsv
+
+        return partition_tsv(filename=self.file_path, **self.unstructured_kwargs)
--- a/tests/integration_tests/document_loaders/test_tsv.py
+++ b/tests/integration_tests/document_loaders/test_tsv.py
@ -0,0 +1,15 @@
+import os
+from pathlib import Path
+
+from langchain.document_loaders import UnstructuredTSVLoader
+
+EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples"
+
+
+def test_unstructured_tsv_loader() -> None:
+    """Test that the example TSV file loads as exactly one document."""
+    file_path = os.path.join(EXAMPLE_DIRECTORY, "stanley-cups.tsv")
+    loader = UnstructuredTSVLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
--- a/tests/integration_tests/examples/stanley-cups.tsv
+++ b/tests/integration_tests/examples/stanley-cups.tsv
@ -0,0 +1,5 @@
+Stanley Cups		
+Team	Location	Stanley Cups
+Blues	STL	1
+Flyers	PHI	2
+Maple Leafs	TOR	13