feat: Add `UnstructuredTSVLoader` (#7367)

### Summary

Adds an `UnstructuredTSVLoader` for TSV files. Also updates the
docstrings for the `UnstructuredCSVLoader` and `UnstructuredExcelLoader` classes.

### Testing

```python
from langchain.document_loaders.tsv import UnstructuredTSVLoader

loader = UnstructuredTSVLoader(
    file_path="example_data/mlb_teams_2012.csv", mode="elements"
)
docs = loader.load()
```
pull/7356/head
Matt Robinson 1 year ago committed by GitHub
parent 490f4a9ff0
commit bcab894f4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,5 @@
Stanley Cups
Team Location Stanley Cups
Blues STL 1
Flyers PHI 2
Maple Leafs TOR 13
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13

@ -0,0 +1,181 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TSV\n",
"\n",
">A [tab-separated values (TSV)](https://en.wikipedia.org/wiki/Tab-separated_values) file is a simple, text-based file format for storing tabular data. Records are separated by newlines, and values within a record are separated by tab characters."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## `UnstructuredTSVLoader`\n",
"\n",
"You can load the table using the `UnstructuredTSVLoader`. One advantage of using `UnstructuredTSVLoader` is that if you use it in `\"elements\"` mode, an HTML representation of the table will be available in the metadata."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.tsv import UnstructuredTSVLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredTSVLoader(\n",
" file_path=\"example_data/mlb_teams_2012.csv\", mode=\"elements\"\n",
")\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<table border=\"1\" class=\"dataframe\">\n",
" <tbody>\n",
" <tr>\n",
" <td>Nationals, 81.34, 98</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Reds, 82.20, 97</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Yankees, 197.96, 95</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Giants, 117.62, 94</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Braves, 83.31, 94</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Athletics, 55.37, 94</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Rangers, 120.51, 93</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Orioles, 81.43, 93</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Rays, 64.17, 90</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Angels, 154.49, 89</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Tigers, 132.30, 88</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Cardinals, 110.30, 88</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Dodgers, 95.14, 86</td>\n",
" </tr>\n",
" <tr>\n",
" <td>White Sox, 96.92, 85</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Brewers, 97.65, 83</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Phillies, 174.54, 81</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Diamondbacks, 74.28, 81</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Pirates, 63.43, 79</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Padres, 55.24, 76</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Mariners, 81.97, 75</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Mets, 93.35, 74</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Blue Jays, 75.48, 73</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Royals, 60.91, 72</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Marlins, 118.07, 69</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Red Sox, 173.18, 69</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Indians, 78.43, 68</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Twins, 94.08, 66</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Rockies, 78.06, 64</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Cubs, 88.19, 61</td>\n",
" </tr>\n",
" <tr>\n",
" <td>Astros, 60.65, 55</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
]
}
],
"source": [
"print(docs[0].metadata[\"text_as_html\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -122,6 +122,7 @@ from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.tomarkdown import ToMarkdownLoader
from langchain.document_loaders.toml import TomlLoader
from langchain.document_loaders.trello import TrelloLoader
from langchain.document_loaders.tsv import UnstructuredTSVLoader
from langchain.document_loaders.twitter import TwitterTweetLoader
from langchain.document_loaders.unstructured import (
UnstructuredAPIFileIOLoader,
@ -278,6 +279,7 @@ __all__ = [
"UnstructuredPowerPointLoader",
"UnstructuredRSTLoader",
"UnstructuredRTFLoader",
"UnstructuredTSVLoader",
"UnstructuredURLLoader",
"UnstructuredWordDocumentLoader",
"UnstructuredXMLLoader",

@ -78,7 +78,21 @@ class CSVLoader(BaseLoader):
class UnstructuredCSVLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load CSV files."""
"""Loader that uses unstructured to load CSV files. Like other
Unstructured loaders, UnstructuredCSVLoader can be used in both
"single" and "elements" mode. If you use the loader in "elements"
mode, the CSV file will be a single Unstructured Table element.
If you use the loader in "elements" mode, an HTML representation
of the table will be available in the "text_as_html" key in the
document metadata.
Examples
--------
from langchain.document_loaders.csv_loader import UnstructuredCSVLoader
loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
docs = loader.load()
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any

@ -8,7 +8,21 @@ from langchain.document_loaders.unstructured import (
class UnstructuredExcelLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load Microsoft Excel files."""
"""Loader that uses unstructured to load Excel files. Like other
Unstructured loaders, UnstructuredExcelLoader can be used in both
"single" and "elements" mode. If you use the loader in "elements"
mode, each sheet in the Excel file will be an Unstructured Table
element. If you use the loader in "elements" mode, an
HTML representation of the table will be available in the
"text_as_html" key in the document metadata.
Examples
--------
from langchain.document_loaders.excel import UnstructuredExcelLoader
loader = UnstructuredExcelLoader("stanley-cups.xlsx", mode="elements")
docs = loader.load()
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any

@ -0,0 +1,35 @@
from typing import Any, List
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)
class UnstructuredTSVLoader(UnstructuredFileLoader):
    """Load tab-separated values (TSV) files with ``unstructured``.

    As with the other Unstructured loaders, two modes are available:
    "single" and "elements". In "elements" mode the TSV file comes back
    as a single Unstructured Table element, and an HTML rendering of the
    table is exposed under the "text_as_html" key of the document
    metadata.

    Examples
    --------
    from langchain.document_loaders.tsv import UnstructuredTSVLoader

    loader = UnstructuredTSVLoader("stanley-cups.tsv", mode="elements")
    docs = loader.load()
    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """Validate the ``unstructured`` version, then defer to the base loader.

        Parameters
        ----------
        file_path
            Path of the TSV file to load.
        mode
            Loader mode; defaults to "single".
        **unstructured_kwargs
            Extra keyword arguments passed through to the base class
            (presumably kept there as ``self.unstructured_kwargs`` --
            confirm against ``UnstructuredFileLoader``).
        """
        # Enforce the minimum unstructured version (0.7.6) before anything else.
        validate_unstructured_version(min_unstructured_version="0.7.6")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the TSV file and return the resulting elements."""
        # Imported lazily so the module can be imported without the
        # optional ``unstructured`` dependency being installed.
        from unstructured.partition.tsv import partition_tsv

        elements = partition_tsv(
            filename=self.file_path, **self.unstructured_kwargs
        )
        return elements

@ -0,0 +1,15 @@
import os
from pathlib import Path
from langchain.document_loaders import UnstructuredTSVLoader
# Location of the sample files used by this test: the "examples" folder that
# is a sibling of the directory containing this file.
# NOTE(review): the chained assignment also binds a module-level name
# `file_path` to the same path; it looks unintentional -- confirm before
# removing.
EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples"
def test_unstructured_tsv_loader() -> None:
    """Load the sample TSV with default settings and expect one document."""
    tsv_path = str(os.path.join(EXAMPLE_DIRECTORY, "stanley-cups.tsv"))
    documents = UnstructuredTSVLoader(tsv_path).load()

    assert len(documents) == 1

@ -0,0 +1,5 @@
Stanley Cups
Team Location Stanley Cups
Blues STL 1
Flyers PHI 2
Maple Leafs TOR 13
1 Stanley Cups
2 Team Location Stanley Cups
3 Blues STL 1
4 Flyers PHI 2
5 Maple Leafs TOR 13
Loading…
Cancel
Save