mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add CSVLoader document loader (#1573)
A simple CSV document loader that wraps the standard-library `csv` reader and produces a single `Document` per row. The column header is prepended to each value, which provides useful context for embedding and semantic search.
This commit is contained in:
parent
cdb97f3dfb
commit
30383abb12
126
docs/modules/document_loaders/examples/csv.ipynb
Normal file
126
docs/modules/document_loaders/examples/csv.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -0,0 +1,32 @@
|
|||||||
|
"Team", "Payroll (millions)", "Wins"
|
||||||
|
"Nationals", 81.34, 98
|
||||||
|
"Reds", 82.20, 97
|
||||||
|
"Yankees", 197.96, 95
|
||||||
|
"Giants", 117.62, 94
|
||||||
|
"Braves", 83.31, 94
|
||||||
|
"Athletics", 55.37, 94
|
||||||
|
"Rangers", 120.51, 93
|
||||||
|
"Orioles", 81.43, 93
|
||||||
|
"Rays", 64.17, 90
|
||||||
|
"Angels", 154.49, 89
|
||||||
|
"Tigers", 132.30, 88
|
||||||
|
"Cardinals", 110.30, 88
|
||||||
|
"Dodgers", 95.14, 86
|
||||||
|
"White Sox", 96.92, 85
|
||||||
|
"Brewers", 97.65, 83
|
||||||
|
"Phillies", 174.54, 81
|
||||||
|
"Diamondbacks", 74.28, 81
|
||||||
|
"Pirates", 63.43, 79
|
||||||
|
"Padres", 55.24, 76
|
||||||
|
"Mariners", 81.97, 75
|
||||||
|
"Mets", 93.35, 74
|
||||||
|
"Blue Jays", 75.48, 73
|
||||||
|
"Royals", 60.91, 72
|
||||||
|
"Marlins", 118.07, 69
|
||||||
|
"Red Sox", 173.18, 69
|
||||||
|
"Indians", 78.43, 68
|
||||||
|
"Twins", 94.08, 66
|
||||||
|
"Rockies", 78.06, 64
|
||||||
|
"Cubs", 88.19, 61
|
||||||
|
"Astros", 60.65, 55
|
||||||
|
|
|
@ -4,6 +4,7 @@ from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
|
|||||||
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
from langchain.document_loaders.azlyrics import AZLyricsLoader
|
||||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||||
from langchain.document_loaders.conllu import CoNLLULoader
|
from langchain.document_loaders.conllu import CoNLLULoader
|
||||||
|
from langchain.document_loaders.csv import CSVLoader
|
||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||||
@ -96,4 +97,5 @@ __all__ = [
|
|||||||
"CoNLLULoader",
|
"CoNLLULoader",
|
||||||
"GoogleApiYoutubeLoader",
|
"GoogleApiYoutubeLoader",
|
||||||
"GoogleApiClient",
|
"GoogleApiClient",
|
||||||
|
"CSVLoader",
|
||||||
]
|
]
|
||||||
|
47
langchain/document_loaders/csv.py
Normal file
47
langchain/document_loaders/csv.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
from csv import DictReader
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
|
||||||
|
class CSVLoader(BaseLoader):
    """Loads a CSV file into a list of documents.

    Each document represents one row of the CSV file. Every row is converted into
    key/value pairs, and each pair is written on its own line of the document's
    page_content.

    Output Example:
        .. code-block:: txt

            column1: value1
            column2: value2
            column3: value3
    """

    def __init__(
        self,
        file_path: str,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """Configure the loader.

        Args:
            file_path: Path of the CSV file to read.
            csv_args: Keyword arguments forwarded to ``csv.DictReader``. When
                omitted, a comma delimiter and a double-quote quotechar are used.
            encoding: Text encoding passed to ``open``. ``None`` keeps the
                platform default, which is what the loader used before this
                parameter existed.
        """
        self.file_path = file_path
        self.encoding = encoding
        if csv_args is None:
            self.csv_args = {
                "delimiter": ",",
                "quotechar": '"',
            }
        else:
            self.csv_args = csv_args

    @staticmethod
    def _to_text(field: object) -> str:
        """Return a stripped string for a DictReader key or value.

        ``csv.DictReader`` does not always hand back strings: a row with fewer
        fields than the header gets ``None`` values (default ``restval``), and a
        row with more fields stores the surplus as a list under the ``None`` key
        (default ``restkey``). Both cases are mapped to text here so that ragged
        rows do not raise ``AttributeError``.
        """
        if field is None:
            return ""
        if isinstance(field, (list, tuple)):
            return ", ".join(str(item).strip() for item in field)
        return str(field).strip()

    def load(self) -> List[Document]:
        """Read the CSV file and return one ``Document`` per data row.

        Returns:
            A list of documents whose page_content holds one ``column: value``
            line per field and whose metadata records the source file path.
        """
        to_text = self._to_text
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            # newline="" lets the csv module handle embedded newlines itself.
            reader = DictReader(csvfile, **self.csv_args)  # type: ignore
            return [
                Document(
                    page_content="\n".join(
                        f"{to_text(k)}: {to_text(v)}" for k, v in row.items()
                    ),
                    metadata={"source": self.file_path},
                )
                for row in reader
            ]
|
Loading…
Reference in New Issue
Block a user