Add timeout to file encoding detection (#4479)

parallel_dir_loader_back
blob42 1 year ago
parent 4a8b3d230c
commit 1a5f68d103

@ -1,6 +1,7 @@
"""Document loader helpers."""
from typing import List, NamedTuple, Optional
import concurrent.futures
from typing import List, NamedTuple, Optional, cast
class FileEncoding(NamedTuple):
@ -9,15 +10,28 @@ class FileEncoding(NamedTuple):
language: Optional[str]
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
    """Try to detect the file encoding.

    Runs the (potentially slow) chardet detection in a worker thread so the
    caller can give up after ``timeout`` seconds.

    Args:
        file_path: Path of the file whose encoding should be detected.
        timeout: Maximum number of seconds to wait for detection.

    Returns:
        A list of `FileEncoding` tuples with the detected encodings ordered
        by confidence.

    Raises:
        TimeoutError: If detection did not finish within ``timeout`` seconds.
        RuntimeError: If no encoding could be detected at all.
    """
    import chardet

    def read_and_detect(file_path: str) -> List[dict]:
        # chardet needs the raw bytes of the whole file.
        with open(file_path, "rb") as f:
            rawdata = f.read()
        return cast(List[dict], chardet.detect_all(rawdata))

    # BUG FIX: do not use the executor as a context manager here. Its
    # __exit__ calls shutdown(wait=True), which blocks until the worker
    # thread finishes even after future.result() has timed out — so the
    # TimeoutError would only propagate once the slow detection completed,
    # defeating the timeout entirely. Shut down without waiting instead;
    # the worker thread still runs to completion in the background (Python
    # threads cannot be killed), but the caller returns promptly.
    executor = concurrent.futures.ThreadPoolExecutor()
    try:
        future = executor.submit(read_and_detect, file_path)
        try:
            encodings = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            raise TimeoutError(
                f"Timeout reached while detecting encoding for {file_path}"
            )
    finally:
        executor.shutdown(wait=False)

    if all(encoding["encoding"] is None for encoding in encodings):
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]

@ -45,9 +45,7 @@ class TextLoader(BaseLoader):
for encoding in detected_encodings:
logger.debug("Trying encoding: ", encoding.encoding)
try:
with open(
self.file_path, encoding=encoding.encoding
) as f:
with open(self.file_path, encoding=encoding.encoding) as f:
text = f.read()
break
except UnicodeDecodeError:
@ -59,5 +57,3 @@ class TextLoader(BaseLoader):
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]

4
poetry.lock generated

@ -10024,7 +10024,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
cohere = ["cohere"]
embeddings = ["sentence-transformers"]
extended-testing = ["pdfminer-six", "pypdf", "tqdm"]
extended-testing = ["chardet", "pdfminer-six", "pypdf", "tqdm"]
hnswlib = ["docarray", "hnswlib", "protobuf"]
in-memory-store = ["docarray"]
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
@ -10035,4 +10035,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "5cba09d8e3153c466aced4763e838df2a81f39a87e9578fc56404088fb1e1cb1"
content-hash = "6aa8b1e18b690223f337de8b345023edd3a9cacc923ea5773baec1eae67b4c44"

@ -3,10 +3,11 @@ from pathlib import Path
import pytest
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.document_loaders.helpers import detect_file_encodings
@pytest.mark.requires("chardet")
def test_text_loader_detect_encodings() -> None:
def test_loader_detect_encoding() -> None:
"""Test text loader."""
path = Path(__file__).parent.parent / "examples"
files = path.glob("**/*.txt")
@ -23,3 +24,18 @@ def test_text_loader_detect_encodings() -> None:
docs = loader_detect_encoding.load()
assert len(docs) == len(list(files))
@pytest.mark.skip(reason="slow test")
@pytest.mark.requires("chardet")
def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
    """Encoding detection on a large binary blob must honor the timeout."""
    blob_path = str(Path(tmpdir) / "blob.txt")

    # Write a 2 MB all-zero file: chardet has to scan every byte, which is
    # slow enough to trip a one-second timeout.
    with open(blob_path, "wb") as blob_file:
        blob_file.write(b"\x00" * 2_000_000)

    # A tight deadline should abort with TimeoutError...
    with pytest.raises(TimeoutError):
        detect_file_encodings(blob_path, timeout=1)

    # ...while a generous one lets detection run to completion.
    detect_file_encodings(blob_path, timeout=10)
Loading…
Cancel
Save