From 30383abb127d7687a82df6593dd74329d00db730 Mon Sep 17 00:00:00 2001 From: Tim Asp <707699+timothyasp@users.noreply.github.com> Date: Thu, 9 Mar 2023 16:35:18 -0800 Subject: [PATCH] Add CSVLoader document loader (#1573) Simple CSV document loader which wraps `csv` reader, and preps the file with a single `Document` per row. The column header is prepended to each value for context which is useful for context with embedding and semantic search --- .../document_loaders/examples/csv.ipynb | 126 ++++++++++++++++++ .../examples/example_data/mlb_teams_2012.csv | 32 +++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/csv.py | 47 +++++++ 4 files changed, 207 insertions(+) create mode 100644 docs/modules/document_loaders/examples/csv.ipynb create mode 100644 docs/modules/document_loaders/examples/example_data/mlb_teams_2012.csv create mode 100644 langchain/document_loaders/csv.py diff --git a/docs/modules/document_loaders/examples/csv.ipynb b/docs/modules/document_loaders/examples/csv.ipynb new file mode 100644 index 0000000000..72ff46f864 --- /dev/null +++ b/docs/modules/document_loaders/examples/csv.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# CSV Loader\n", + "\n", + "Load csv files with a single row per document." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from langchain.document_loaders.csv import CSVLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "loader = CSVLoader(file_path='./example_data/mlb_teams_2012.csv')\n", + "\n", + "data = loader.load()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals\\n\"Payroll (millions)\": 81.34\\n\"Wins\": 98', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Reds\\n\"Payroll (millions)\": 82.20\\n\"Wins\": 97', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Yankees\\n\"Payroll (millions)\": 197.96\\n\"Wins\": 95', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Giants\\n\"Payroll (millions)\": 117.62\\n\"Wins\": 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Braves\\n\"Payroll (millions)\": 83.31\\n\"Wins\": 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Athletics\\n\"Payroll (millions)\": 55.37\\n\"Wins\": 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Rangers\\n\"Payroll (millions)\": 120.51\\n\"Wins\": 93', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Orioles\\n\"Payroll (millions)\": 81.43\\n\"Wins\": 93', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Rays\\n\"Payroll (millions)\": 64.17\\n\"Wins\": 90', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Angels\\n\"Payroll (millions)\": 154.49\\n\"Wins\": 89', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Tigers\\n\"Payroll (millions)\": 132.30\\n\"Wins\": 88', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Cardinals\\n\"Payroll (millions)\": 110.30\\n\"Wins\": 88', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Dodgers\\n\"Payroll (millions)\": 95.14\\n\"Wins\": 86', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: White Sox\\n\"Payroll (millions)\": 96.92\\n\"Wins\": 85', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Brewers\\n\"Payroll (millions)\": 97.65\\n\"Wins\": 83', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Phillies\\n\"Payroll (millions)\": 174.54\\n\"Wins\": 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Diamondbacks\\n\"Payroll (millions)\": 74.28\\n\"Wins\": 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Pirates\\n\"Payroll (millions)\": 63.43\\n\"Wins\": 79', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Padres\\n\"Payroll (millions)\": 55.24\\n\"Wins\": 76', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Mariners\\n\"Payroll (millions)\": 81.97\\n\"Wins\": 75', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Mets\\n\"Payroll (millions)\": 93.35\\n\"Wins\": 74', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Blue Jays\\n\"Payroll (millions)\": 75.48\\n\"Wins\": 73', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Royals\\n\"Payroll (millions)\": 60.91\\n\"Wins\": 72', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Marlins\\n\"Payroll (millions)\": 118.07\\n\"Wins\": 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Red Sox\\n\"Payroll (millions)\": 173.18\\n\"Wins\": 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Indians\\n\"Payroll (millions)\": 78.43\\n\"Wins\": 68', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Twins\\n\"Payroll (millions)\": 94.08\\n\"Wins\": 66', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Rockies\\n\"Payroll (millions)\": 78.06\\n\"Wins\": 64', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Cubs\\n\"Payroll (millions)\": 88.19\\n\"Wins\": 61', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='Team: Astros\\n\"Payroll (millions)\": 60.65\\n\"Wins\": 55', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0)]\n" + ] + } + ], + "source": [ + "print(data)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Customizing the csv parsing and loading\n", + "\n", + "See the [csv module](https://docs.python.org/3/library/csv.html) documentation for more information of what csv args are supported." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [], + "source": [ + "loader = CSVLoader(file_path='./example_data/mlb_teams_2012.csv', csv_args={\n", + " 'delimiter': ',',\n", + " 'quotechar': '\"',\n", + " 'fieldnames': ['MLB Team', 'Payroll in millions', 'Wins']\n", + "})\n", + "\n", + "data = loader.load()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='MLB Team: Team\\nPayroll in millions: \"Payroll (millions)\"\\nWins: \"Wins\"', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Nationals\\nPayroll in millions: 81.34\\nWins: 98', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Reds\\nPayroll in millions: 82.20\\nWins: 97', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Yankees\\nPayroll in millions: 197.96\\nWins: 95', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Giants\\nPayroll in millions: 117.62\\nWins: 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Braves\\nPayroll in millions: 83.31\\nWins: 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Athletics\\nPayroll in millions: 55.37\\nWins: 94', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Rangers\\nPayroll in millions: 120.51\\nWins: 93', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Orioles\\nPayroll in millions: 81.43\\nWins: 93', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Rays\\nPayroll in millions: 64.17\\nWins: 90', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Angels\\nPayroll in millions: 154.49\\nWins: 89', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Tigers\\nPayroll in millions: 132.30\\nWins: 88', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Cardinals\\nPayroll in millions: 110.30\\nWins: 88', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Dodgers\\nPayroll in millions: 95.14\\nWins: 86', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: White Sox\\nPayroll in millions: 96.92\\nWins: 85', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Brewers\\nPayroll in millions: 97.65\\nWins: 83', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Phillies\\nPayroll in millions: 174.54\\nWins: 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Diamondbacks\\nPayroll in millions: 74.28\\nWins: 81', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Pirates\\nPayroll in millions: 63.43\\nWins: 79', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Padres\\nPayroll in millions: 55.24\\nWins: 76', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Mariners\\nPayroll in millions: 81.97\\nWins: 75', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Mets\\nPayroll in millions: 93.35\\nWins: 74', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Blue Jays\\nPayroll in millions: 75.48\\nWins: 73', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Royals\\nPayroll in millions: 60.91\\nWins: 72', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Marlins\\nPayroll in millions: 118.07\\nWins: 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Red Sox\\nPayroll in millions: 173.18\\nWins: 69', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Indians\\nPayroll in millions: 78.43\\nWins: 68', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Twins\\nPayroll in millions: 94.08\\nWins: 66', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Rockies\\nPayroll in millions: 78.06\\nWins: 64', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Cubs\\nPayroll in millions: 88.19\\nWins: 61', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0), Document(page_content='MLB Team: Astros\\nPayroll in millions: 60.65\\nWins: 55', lookup_str='', metadata={'source': './example_data/mlb_teams_2012.csv'}, lookup_index=0)]\n" + ] + } + ], + "source": [ + "print(data)" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/modules/document_loaders/examples/example_data/mlb_teams_2012.csv b/docs/modules/document_loaders/examples/example_data/mlb_teams_2012.csv new file mode 100644 index 0000000000..b22ae961a1 --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/mlb_teams_2012.csv @@ -0,0 +1,32 @@ +"Team", "Payroll (millions)", "Wins" +"Nationals", 81.34, 98 +"Reds", 82.20, 97 +"Yankees", 197.96, 95 +"Giants", 117.62, 94 +"Braves", 83.31, 94 +"Athletics", 55.37, 94 +"Rangers", 120.51, 93 +"Orioles", 81.43, 93 +"Rays", 64.17, 90 +"Angels", 154.49, 89 +"Tigers", 132.30, 88 +"Cardinals", 110.30, 88 +"Dodgers", 95.14, 86 +"White Sox", 96.92, 85 +"Brewers", 97.65, 83 +"Phillies", 174.54, 81 +"Diamondbacks", 74.28, 81 +"Pirates", 63.43, 79 +"Padres", 55.24, 76 +"Mariners", 81.97, 75 +"Mets", 93.35, 74 +"Blue Jays", 75.48, 73 +"Royals", 60.91, 72 +"Marlins", 118.07, 69 +"Red Sox", 173.18, 69 +"Indians", 78.43, 68 +"Twins", 94.08, 66 +"Rockies", 78.06, 64 +"Cubs", 88.19, 61 +"Astros", 60.65, 55 + diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index e6d6cc9b9b..c985acfe7b 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -4,6 +4,7 @@ from langchain.document_loaders.airbyte_json import AirbyteJSONLoader from langchain.document_loaders.azlyrics import AZLyricsLoader from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.conllu import CoNLLULoader +from langchain.document_loaders.csv import CSVLoader from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.docx import UnstructuredDocxLoader from langchain.document_loaders.email import UnstructuredEmailLoader @@ -96,4 +97,5 @@ __all__ = [ "CoNLLULoader", "GoogleApiYoutubeLoader", "GoogleApiClient", + "CSVLoader", ] diff --git a/langchain/document_loaders/csv.py b/langchain/document_loaders/csv.py new file mode 100644 index 0000000000..af98db4652 --- /dev/null +++ b/langchain/document_loaders/csv.py @@ -0,0 +1,47 @@ +from csv import DictReader +from typing import Dict, List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class CSVLoader(BaseLoader): + """Loads a CSV file into a list of documents. + + Each document represents one row of the CSV file. Every row is converted into a + key/value pair and outputted to a new line in the document's page_content. + + Output Example: + .. code-block:: txt + + column1: value1 + column2: value2 + column3: value3 + """ + + def __init__(self, file_path: str, csv_args: Optional[Dict] = None): + self.file_path = file_path + if csv_args is None: + self.csv_args = { + "delimiter": ",", + "quotechar": '"', + } + else: + self.csv_args = csv_args + + def load(self) -> List[Document]: + docs = [] + + with open(self.file_path, newline="") as csvfile: + csv = DictReader(csvfile, **self.csv_args) # type: ignore + for row in csv: + docs.append( + Document( + page_content="\n".join( + f"{k.strip()}: {v.strip()}" for k, v in row.items() + ), + metadata={"source": self.file_path}, + ) + ) + + return docs