mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add CSVLoader document loader (#1573)
Simple CSV document loader that wraps the `csv` reader and produces a single `Document` per row. The column header is prepended to each value, which gives useful context for embedding and semantic search.
This commit is contained in:
parent
cdb97f3dfb
commit
30383abb12
126
docs/modules/document_loaders/examples/csv.ipynb
Normal file
126
docs/modules/document_loaders/examples/csv.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -0,0 +1,32 @@
|
||||
"Team", "Payroll (millions)", "Wins"
|
||||
"Nationals", 81.34, 98
|
||||
"Reds", 82.20, 97
|
||||
"Yankees", 197.96, 95
|
||||
"Giants", 117.62, 94
|
||||
"Braves", 83.31, 94
|
||||
"Athletics", 55.37, 94
|
||||
"Rangers", 120.51, 93
|
||||
"Orioles", 81.43, 93
|
||||
"Rays", 64.17, 90
|
||||
"Angels", 154.49, 89
|
||||
"Tigers", 132.30, 88
|
||||
"Cardinals", 110.30, 88
|
||||
"Dodgers", 95.14, 86
|
||||
"White Sox", 96.92, 85
|
||||
"Brewers", 97.65, 83
|
||||
"Phillies", 174.54, 81
|
||||
"Diamondbacks", 74.28, 81
|
||||
"Pirates", 63.43, 79
|
||||
"Padres", 55.24, 76
|
||||
"Mariners", 81.97, 75
|
||||
"Mets", 93.35, 74
|
||||
"Blue Jays", 75.48, 73
|
||||
"Royals", 60.91, 72
|
||||
"Marlins", 118.07, 69
|
||||
"Red Sox", 173.18, 69
|
||||
"Indians", 78.43, 68
|
||||
"Twins", 94.08, 66
|
||||
"Rockies", 78.06, 64
|
||||
"Cubs", 88.19, 61
|
||||
"Astros", 60.65, 55
|
||||
|
|
@ -4,6 +4,7 @@ from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
|
||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||
from langchain.document_loaders.conllu import CoNLLULoader
|
||||
from langchain.document_loaders.csv import CSVLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||
@ -96,4 +97,5 @@ __all__ = [
|
||||
"CoNLLULoader",
|
||||
"GoogleApiYoutubeLoader",
|
||||
"GoogleApiClient",
|
||||
"CSVLoader",
|
||||
]
|
||||
|
47
langchain/document_loaders/csv.py
Normal file
47
langchain/document_loaders/csv.py
Normal file
@ -0,0 +1,47 @@
|
||||
from csv import DictReader
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class CSVLoader(BaseLoader):
    """Load a CSV file into a list of documents, one document per row.

    Each row is rendered as ``column: value`` pairs, one pair per line, in the
    document's page_content. The file path is stored under the ``source``
    metadata key of every document.

    Output Example:
        .. code-block:: txt

            column1: value1
            column2: value2
            column3: value3
    """

    def __init__(
        self,
        file_path: str,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """Store the file location and the CSV parsing options.

        Args:
            file_path: Path of the CSV file to read.
            csv_args: Keyword arguments forwarded to ``csv.DictReader``.
                When omitted, a comma delimiter and a double-quote quotechar
                are used.
            encoding: Text encoding passed to ``open``. ``None`` keeps the
                platform default, which is what happened before this
                parameter existed.
        """
        self.file_path = file_path
        self.encoding = encoding
        if csv_args is None:
            self.csv_args = {
                "delimiter": ",",
                "quotechar": '"',
            }
        else:
            self.csv_args = csv_args

    @staticmethod
    def _as_text(cell: object) -> str:
        """Render a DictReader key or value as stripped text.

        ``csv.DictReader`` does not only yield strings: a row with too few
        cells gets ``None`` for the missing values, and a row with too many
        cells gets the surplus as a list stored under the ``None`` key (or
        under ``restkey`` when one is configured). ``None`` becomes an empty
        string and a list is joined with ``", "`` so such rows no longer
        raise ``AttributeError``.
        """
        if cell is None:
            return ""
        if isinstance(cell, (list, tuple)):
            return ", ".join(str(item).strip() for item in cell)
        return str(cell).strip()

    def load(self) -> List[Document]:
        """Read the CSV file and return one ``Document`` per data row.

        Returns:
            A list of documents in file order. Each page_content holds the
            row's ``column: value`` lines joined by newlines, and the metadata
            is ``{"source": file_path}``.
        """
        docs = []

        # newline="" lets the csv module handle line endings embedded in
        # quoted fields itself, as the csv documentation recommends.
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            reader = DictReader(csvfile, **self.csv_args)  # type: ignore
            for row in reader:
                content = "\n".join(
                    f"{self._as_text(k)}: {self._as_text(v)}" for k, v in row.items()
                )
                docs.append(
                    Document(
                        page_content=content,
                        metadata={"source": self.file_path},
                    )
                )

        return docs
|
Loading…
Reference in New Issue
Block a user