mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
fc93bed8c4
- **Bug code**: In langchain_community/document_loaders/csv_loader.py:100 - **Description**: currently, when 'CSVLoader' reads the column as None in the 'csv' file, it will report an error because the 'CSVLoader' does not verify whether the column is of str type and does not consider how to handle the corresponding 'row_data' when the column is' None 'in the csv. This pr provides a solution. - **Issue:** Fix #20699 - **thinking:** 1. Refer to the processing method for 'langchain_community/document_loaders/csv_loader.py:100' when **'v'** equals'None', and apply the same method to '**k**'. (Reference`csv.DictReader` ,**'k'** will only be None when ` len(columns) < len(number_row_data)` is established) 2. **‘k’** equals None only holds when it is the last column, and its corresponding **'v'** type is a list. Therefore, I referred to the data format in 'Document' and used ',' to concatenated the elements in the list.(But I'm not sure if you accept this form, if you have any other ideas, communicate) --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
152 lines
5.8 KiB
Python
152 lines
5.8 KiB
Python
import csv
|
|
from io import TextIOWrapper
|
|
from pathlib import Path
|
|
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
from langchain_community.document_loaders.helpers import detect_file_encodings
|
|
from langchain_community.document_loaders.unstructured import (
|
|
UnstructuredFileLoader,
|
|
validate_unstructured_version,
|
|
)
|
|
|
|
|
|
class CSVLoader(BaseLoader):
|
|
"""Load a `CSV` file into a list of Documents.
|
|
|
|
Each document represents one row of the CSV file. Every row is converted into a
|
|
key/value pair and outputted to a new line in the document's page_content.
|
|
|
|
The source for each document loaded from csv is set to the value of the
|
|
`file_path` argument for all documents by default.
|
|
You can override this by setting the `source_column` argument to the
|
|
name of a column in the CSV file.
|
|
The source of each document will then be set to the value of the column
|
|
with the name specified in `source_column`.
|
|
|
|
Output Example:
|
|
.. code-block:: txt
|
|
|
|
column1: value1
|
|
column2: value2
|
|
column3: value3
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
file_path: Union[str, Path],
|
|
source_column: Optional[str] = None,
|
|
metadata_columns: Sequence[str] = (),
|
|
csv_args: Optional[Dict] = None,
|
|
encoding: Optional[str] = None,
|
|
autodetect_encoding: bool = False,
|
|
):
|
|
"""
|
|
|
|
Args:
|
|
file_path: The path to the CSV file.
|
|
source_column: The name of the column in the CSV file to use as the source.
|
|
Optional. Defaults to None.
|
|
metadata_columns: A sequence of column names to use as metadata. Optional.
|
|
csv_args: A dictionary of arguments to pass to the csv.DictReader.
|
|
Optional. Defaults to None.
|
|
encoding: The encoding of the CSV file. Optional. Defaults to None.
|
|
autodetect_encoding: Whether to try to autodetect the file encoding.
|
|
"""
|
|
self.file_path = file_path
|
|
self.source_column = source_column
|
|
self.metadata_columns = metadata_columns
|
|
self.encoding = encoding
|
|
self.csv_args = csv_args or {}
|
|
self.autodetect_encoding = autodetect_encoding
|
|
|
|
def lazy_load(self) -> Iterator[Document]:
|
|
try:
|
|
with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
|
|
yield from self.__read_file(csvfile)
|
|
except UnicodeDecodeError as e:
|
|
if self.autodetect_encoding:
|
|
detected_encodings = detect_file_encodings(self.file_path)
|
|
for encoding in detected_encodings:
|
|
try:
|
|
with open(
|
|
self.file_path, newline="", encoding=encoding.encoding
|
|
) as csvfile:
|
|
yield from self.__read_file(csvfile)
|
|
break
|
|
except UnicodeDecodeError:
|
|
continue
|
|
else:
|
|
raise RuntimeError(f"Error loading {self.file_path}") from e
|
|
except Exception as e:
|
|
raise RuntimeError(f"Error loading {self.file_path}") from e
|
|
|
|
def __read_file(self, csvfile: TextIOWrapper) -> Iterator[Document]:
|
|
csv_reader = csv.DictReader(csvfile, **self.csv_args)
|
|
for i, row in enumerate(csv_reader):
|
|
try:
|
|
source = (
|
|
row[self.source_column]
|
|
if self.source_column is not None
|
|
else str(self.file_path)
|
|
)
|
|
except KeyError:
|
|
raise ValueError(
|
|
f"Source column '{self.source_column}' not found in CSV file."
|
|
)
|
|
content = "\n".join(
|
|
f"""{k.strip() if k is not None else k}: {v.strip()
|
|
if isinstance(v, str) else ','.join(map(str.strip, v))
|
|
if isinstance(v, list) else v}"""
|
|
for k, v in row.items()
|
|
if k not in self.metadata_columns
|
|
)
|
|
metadata = {"source": source, "row": i}
|
|
for col in self.metadata_columns:
|
|
try:
|
|
metadata[col] = row[col]
|
|
except KeyError:
|
|
raise ValueError(f"Metadata column '{col}' not found in CSV file.")
|
|
yield Document(page_content=content, metadata=metadata)
|
|
|
|
|
|
class UnstructuredCSVLoader(UnstructuredFileLoader):
|
|
"""Load `CSV` files using `Unstructured`.
|
|
|
|
Like other
|
|
Unstructured loaders, UnstructuredCSVLoader can be used in both
|
|
"single" and "elements" mode. If you use the loader in "elements"
|
|
mode, the CSV file will be a single Unstructured Table element.
|
|
If you use the loader in "elements" mode, an HTML representation
|
|
of the table will be available in the "text_as_html" key in the
|
|
document metadata.
|
|
|
|
Examples
|
|
--------
|
|
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader
|
|
|
|
loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
|
|
docs = loader.load()
|
|
"""
|
|
|
|
def __init__(
|
|
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
|
):
|
|
"""
|
|
|
|
Args:
|
|
file_path: The path to the CSV file.
|
|
mode: The mode to use when loading the CSV file.
|
|
Optional. Defaults to "single".
|
|
**unstructured_kwargs: Keyword arguments to pass to unstructured.
|
|
"""
|
|
validate_unstructured_version(min_unstructured_version="0.6.8")
|
|
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
|
|
|
def _get_elements(self) -> List:
|
|
from unstructured.partition.csv import partition_csv
|
|
|
|
return partition_csv(filename=self.file_path, **self.unstructured_kwargs)
|