From 4a8b3d230c6fee8ceb537045ff9ef41b0440bf2b Mon Sep 17 00:00:00 2001
From: blob42
Date: Fri, 12 May 2023 20:31:57 +0200
Subject: [PATCH] move encoding detection to own file (#4479)

- encoding detection returns meaningful encodings schema
- pass the candidate encoding to logger.debug through a %s placeholder
---
Reviewer notes (text between "---" and the diffstat is ignored by git am):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify context-line whitespace against the tree
  (or apply with --ignore-whitespace) before relying on it.
- The only content change versus the collapsed copy is the added
  logger.debug line in langchain/document_loaders/text.py.

 langchain/document_loaders/helpers.py         | 23 +++++++++++++
 langchain/document_loaders/text.py            | 20 +++--------
 .../document_loader/test_text_loader.py       | 33 +++++++++----------
 3 files changed, 43 insertions(+), 33 deletions(-)
 create mode 100644 langchain/document_loaders/helpers.py

diff --git a/langchain/document_loaders/helpers.py b/langchain/document_loaders/helpers.py
new file mode 100644
index 00000000..4e0ed6d3
--- /dev/null
+++ b/langchain/document_loaders/helpers.py
@@ -0,0 +1,23 @@
+"""Document loader helpers."""
+
+from typing import List, NamedTuple, Optional
+
+
+class FileEncoding(NamedTuple):
+    encoding: Optional[str]
+    confidence: float
+    language: Optional[str]
+
+
+def detect_file_encodings(file_path: str) -> List[FileEncoding]:
+    """Try to detect the file encoding."""
+    import chardet
+
+    with open(file_path, "rb") as f:
+        rawdata = f.read()
+    encodings = chardet.detect_all(rawdata)
+
+    if all(encoding["encoding"] is None for encoding in encodings):
+        raise RuntimeError(f"Could not detect encoding for {file_path}")
+    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
+
diff --git a/langchain/document_loaders/text.py b/langchain/document_loaders/text.py
index d947d765..441d6ea4 100644
--- a/langchain/document_loaders/text.py
+++ b/langchain/document_loaders/text.py
@@ -1,8 +1,9 @@
 import logging
-from typing import List, Optional, cast
+from typing import List, Optional
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
 
 logger = logging.getLogger(__name__)
 
@@ -42,10 +43,10 @@ class TextLoader(BaseLoader):
             if self.autodetect_encoding:
                 detected_encodings = detect_file_encodings(self.file_path)
                 for encoding in detected_encodings:
-                    logger.debug("Trying encoding: ", encoding["encoding"])
+                    logger.debug("Trying encoding: %s", encoding.encoding)
                     try:
                         with open(
-                            self.file_path, encoding=encoding["encoding"]
+                            self.file_path, encoding=encoding.encoding
                         ) as f:
                             text = f.read()
                         break
@@ -59,17 +60,4 @@ class TextLoader(BaseLoader):
         metadata = {"source": self.file_path}
         return [Document(page_content=text, metadata=metadata)]
 
-def detect_file_encodings(file_path: str) -> List[dict]:
-    """Try to detect the file encoding."""
-    import chardet
-
-    with open(file_path, "rb") as f:
-        rawdata = f.read()
-    encodings = chardet.detect_all(rawdata)
-
-
-    if all(encoding["encoding"] is None for encoding in encodings):
-        raise RuntimeError(f"Could not detect encoding for {file_path}")
-    res = [encoding for encoding in encodings if encoding["encoding"] is not None]
-    return cast(List[dict], res)
 
diff --git a/tests/unit_tests/document_loader/test_text_loader.py b/tests/unit_tests/document_loader/test_text_loader.py
index 79187ed9..1363da06 100644
--- a/tests/unit_tests/document_loader/test_text_loader.py
+++ b/tests/unit_tests/document_loader/test_text_loader.py
@@ -4,23 +4,22 @@ import pytest
 
 from langchain.document_loaders import DirectoryLoader, TextLoader
 
-class TestTextLoader:
-    @pytest.mark.requires("chardet")
-    def test_load_directory(self) -> None:
-        """Test text loader."""
-        path = Path(__file__).parent.parent / "examples"
-        files = path.glob("**/*.txt")
-        loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
-        loader_detect_encoding = DirectoryLoader(
-            str(path),
-            glob="**/*.txt",
-            loader_kwargs={"autodetect_encoding": True},
-            loader_cls=TextLoader,
-        )
+@pytest.mark.requires("chardet")
+def test_text_loader_detect_encodings() -> None:
+    """Test text loader."""
+    path = Path(__file__).parent.parent / "examples"
+    files = path.glob("**/*.txt")
+    loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
+    loader_detect_encoding = DirectoryLoader(
+        str(path),
+        glob="**/*.txt",
+        loader_kwargs={"autodetect_encoding": True},
+        loader_cls=TextLoader,
+    )
 
-        with pytest.raises((UnicodeDecodeError, RuntimeError)):
-            loader.load()
+    with pytest.raises((UnicodeDecodeError, RuntimeError)):
+        loader.load()
 
-        docs = loader_detect_encoding.load()
-        assert len(docs) == len(list(files))
+    docs = loader_detect_encoding.load()
+    assert len(docs) == len(list(files))
 