Added `autodetect_encoding` option to `CSVLoader` (#11327)

1 year ago · eec53fa294
parent 09c66fe04f
commit eec53fa294
4 changed files with 107 additions and 20 deletions
--- a/libs/langchain/langchain/document_loaders/csv_loader.py
+++ b/libs/langchain/langchain/document_loaders/csv_loader.py
@ -1,8 +1,10 @@
 import csv
+from io import TextIOWrapper
 from typing import Any, Dict, List, Optional

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.document_loaders.unstructured import (
    UnstructuredFileLoader,
    validate_unstructured_version,
@ -36,6 +38,7 @@ class CSVLoader(BaseLoader):
        source_column: Optional[str] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
+        autodetect_encoding: bool = False,
    ):
        """

@ -46,33 +49,58 @@ class CSVLoader(BaseLoader):
            csv_args: A dictionary of arguments to pass to the csv.DictReader.
              Optional. Defaults to None.
            encoding: The encoding of the CSV file. Optional. Defaults to None.
+            autodetect_encoding: Whether to try to autodetect the file encoding.
        """
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}
+        self.autodetect_encoding = autodetect_encoding

    def load(self) -> List[Document]:
        """Load data into document objects."""

        docs = []
-        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
-            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
-            for i, row in enumerate(csv_reader):
-                content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
-                try:
-                    source = (
-                        row[self.source_column]
-                        if self.source_column is not None
-                        else self.file_path
-                    )
-                except KeyError:
-                    raise ValueError(
-                        f"Source column '{self.source_column}' not found in CSV file."
-                    )
-                metadata = {"source": source, "row": i}
-                doc = Document(page_content=content, metadata=metadata)
-                docs.append(doc)
+        try:
+            # First attempt: use the encoding given by the caller (None lets
+            # open() pick the platform default).
+            with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
+                docs = self.__read_file(csvfile)
+        except UnicodeDecodeError as e:
+            if not self.autodetect_encoding:
+                raise RuntimeError(f"Error loading {self.file_path}") from e
+            # The configured encoding failed: retry with each candidate
+            # returned by detect_file_encodings until one decodes the file.
+            detected_encodings = detect_file_encodings(self.file_path)
+            for encoding in detected_encodings:
+                try:
+                    with open(
+                        self.file_path, newline="", encoding=encoding.encoding
+                    ) as csvfile:
+                        docs = self.__read_file(csvfile)
+                        break
+                except UnicodeDecodeError:
+                    continue
+            else:
+                # No candidate could decode the file (or none was detected):
+                # fail loudly instead of silently returning an empty list.
+                raise RuntimeError(f"Error loading {self.file_path}") from e
+        except Exception as e:
+            # Any other failure (missing file, malformed CSV, missing source
+            # column, ...) is reported as a RuntimeError naming the file.
+            raise RuntimeError(f"Error loading {self.file_path}") from e
+
+        return docs
+
+    def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
+        """Convert every row of an open CSV file into a Document.
+
+        Args:
+            csvfile: Open text stream containing the CSV data.
+
+        Returns:
+            One Document per data row. The page content lists each
+            "column: value" pair on its own line; the metadata holds the
+            row's source and its zero-based row index.
+
+        Raises:
+            ValueError: If source_column is set but missing from a row.
+        """
+
+        def _clean(value: Any) -> Any:
+            # csv.DictReader fills missing cells with restval (None by
+            # default) and stores surplus cells as a list under restkey
+            # (None by default), so only real strings can be stripped.
+            return value.strip() if isinstance(value, str) else value
+
+        docs = []
+        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
+        for i, row in enumerate(csv_reader):
+            content = "\n".join(f"{_clean(k)}: {_clean(v)}" for k, v in row.items())
+            try:
+                source = (
+                    row[self.source_column]
+                    if self.source_column is not None
+                    else self.file_path
+                )
+            except KeyError:
+                raise ValueError(
+                    f"Source column '{self.source_column}' not found in CSV file."
+                )
+            metadata = {"source": source, "row": i}
+            doc = Document(page_content=content, metadata=metadata)
+            docs.append(doc)

        return docs

--- a/libs/langchain/tests/unit_tests/document_loaders/test_detect_encoding.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_detect_encoding.py
@ -2,12 +2,12 @@ from pathlib import Path

 import pytest

-from langchain.document_loaders import DirectoryLoader, TextLoader
+from langchain.document_loaders import CSVLoader, DirectoryLoader, TextLoader
 from langchain.document_loaders.helpers import detect_file_encodings


@pytest.mark.requires("chardet")
-def test_loader_detect_encoding() -> None:
+def test_loader_detect_encoding_text() -> None:
    """Test text loader."""
    path = Path(__file__).parent.parent / "examples"
    files = path.glob("**/*.txt")
@ -16,7 +16,7 @@ def test_loader_detect_encoding() -> None:
        str(path),
        glob="**/*.txt",
        loader_kwargs={"autodetect_encoding": True},
-        loader_cls=TextLoader,
+        loader_cls=TextLoader,  # type: ignore
    )

    with pytest.raises((UnicodeDecodeError, RuntimeError)):
@ -26,6 +26,43 @@ def test_loader_detect_encoding() -> None:
    assert len(docs) == len(list(files))


+@pytest.mark.requires("chardet")
+def test_loader_detect_encoding_csv() -> None:
+    """Test that CSVLoader can fall back to an autodetected encoding."""
+    path = Path(__file__).parent.parent / "examples"
+    files = path.glob("**/*.csv")
+
+    # Count the expected number of data rows across all example CSV files.
+    row_count = 0
+    for file in files:
+        encodings = detect_file_encodings(str(file))
+        for encoding in encodings:
+            try:
+                # Use a context manager so the handle is closed even when
+                # decoding fails part-way through the file.
+                with open(file, encoding=encoding.encoding) as f:
+                    row_count += sum(1 for _ in f)
+                break
+            except UnicodeDecodeError:
+                continue
+        # CSVLoader uses DictReader, which consumes the first line of each
+        # file as the header, so subtract one line per file.
+        row_count -= 1
+
+    loader = DirectoryLoader(
+        str(path), glob="**/*.csv", loader_cls=CSVLoader  # type: ignore
+    )
+    loader_detect_encoding = DirectoryLoader(
+        str(path),
+        glob="**/*.csv",
+        loader_kwargs={"autodetect_encoding": True},
+        loader_cls=CSVLoader,  # type: ignore
+    )
+
+    # Without autodetection the non-UTF-8 fixture must fail to load.
+    with pytest.raises((UnicodeDecodeError, RuntimeError)):
+        loader.load()
+
+    docs = loader_detect_encoding.load()
+    assert len(docs) == row_count
+
+
@pytest.mark.skip(reason="slow test")
@pytest.mark.requires("chardet")
 def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
--- a/libs/langchain/tests/unit_tests/examples/example-non-utf8.csv
+++ b/libs/langchain/tests/unit_tests/examples/example-non-utf8.csv
@ -0,0 +1,11 @@
+行ID,製品名,顧客名,顧客ID,売上,価格,送料,都道府県,製品カテゴリ,割引
+1,"Eldon スタッカブル収納棚用ベース、プラチナ",モハメド・マッキンタイア,3,-213.25,38.94,35,ヌナブット準州,保管と整理,0.8
+2,"1.7立方フィートのコンパクト「キューブ」オフィス冷蔵庫",バリー・フレンチ,293,457.81,208.16,68.02,ヌナブット準州,家電製品,0.58
+3,"Cardinal Slant-D? リング バインダー、ヘビーゲージ ビニール",バリー・フレンチ,293,46.71,8.69,2.99,ヌナブット準州,バインダーおよびバインダー付属品,0.39
+4,"R380",クレイ・ロゼンダル,483,1198.97,195.99,3.99,ヌナブット準州,電話と通信,0.58
+5,"ホームズ HEPA 空気清浄機",カルロス・ソルテロ,515,30.94,21.78,5.94,ヌナブット準州,家電製品,0.5
+6,"GE 長寿命の屋内埋込型投光器電球",カルロス・ソルテロ,515,4.43,6.64,4.95,ヌナブット準州,オフィス家具,0.37
+7,"ロックリング付きアングルDバインダー、ラベルホルダー",カール・ジャクソン,613,-54.04,7.3,7.72,ヌナブット準州,バインダーおよびバインダー付属品,0.38
+8,"SAFCO モバイルデスクサイドファイル ワイヤーフレーム",カール・ジャクソン,613,127.70,42.76,6.22,ヌナブット準州,保管と整理,
+9,"SAFCO 業務用ワイヤーシェルフ ブラック",モニカ・フェデル,643,-695.26,138.14,35,ヌナブット準州,保管と整理,
+10,"ゼロックス 198",ドロシー・バッダーズ,678,-226.36,4.98,8.33,ヌナブット準州,紙,0.38
--- a/libs/langchain/tests/unit_tests/examples/example-utf8.csv
+++ b/libs/langchain/tests/unit_tests/examples/example-utf8.csv
@ -0,0 +1,11 @@
+"Row ID","Product Name","Customer Name","Customer ID","Sales","Price","Shipping Cost","Province","Product Category","Discount"
+1,"Eldon Base for stackable storage shelf, platinum",Muhammed MacIntyre,3,-213.25,38.94,35,Nunavut,Storage & Organization,0.8
+2,"1.7 Cubic Foot Compact ""Cube"" Office Refrigerators",Barry French,293,457.81,208.16,68.02,Nunavut,Appliances,0.58
+3,"Cardinal Slant-D® Ring Binder, Heavy Gauge Vinyl",Barry French,293,46.71,8.69,2.99,Nunavut,Binders and Binder Accessories,0.39
+4,R380,Clay Rozendal,483,1198.97,195.99,3.99,Nunavut,Telephones and Communication,0.58
+5,Holmes HEPA Air Purifier,Carlos Soltero,515,30.94,21.78,5.94,Nunavut,Appliances,0.5
+6,G.E. Longer-Life Indoor Recessed Floodlight Bulbs,Carlos Soltero,515,4.43,6.64,4.95,Nunavut,Office Furnishings,0.37
+7,"Angle-D Binders with Locking Rings, Label Holders",Carl Jackson,613,-54.04,7.3,7.72,Nunavut,Binders and Binder Accessories,0.38
+8,"SAFCO Mobile Desk Side File, Wire Frame",Carl Jackson,613,127.70,42.76,6.22,Nunavut,Storage & Organization,
+9,"SAFCO Commercial Wire Shelving, Black",Monica Federle,643,-695.26,138.14,35,Nunavut,Storage & Organization,
+10,Xerox 198,Dorothy Badders,678,-226.36,4.98,8.33,Nunavut,Paper,0.38