diff --git a/langchain/document_loaders/helpers.py b/langchain/document_loaders/helpers.py
index 4e0ed6d3..3ccf4f7d 100644
--- a/langchain/document_loaders/helpers.py
+++ b/langchain/document_loaders/helpers.py
@@ -1,6 +1,7 @@
 """Document loader helpers."""
 
-from typing import List, NamedTuple, Optional
+import concurrent.futures
+from typing import List, NamedTuple, Optional, cast
 
 
 class FileEncoding(NamedTuple):
@@ -9,15 +10,41 @@ class FileEncoding(NamedTuple):
     language: Optional[str]
 
 
-def detect_file_encodings(file_path: str) -> List[FileEncoding]:
-    """Try to detect the file encoding."""
+def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
+    """Try to detect the file encoding.
+
+    Returns a list of `FileEncoding` tuples with the detected encodings ordered
+    by confidence.
+
+    Args:
+        file_path: Path of the file whose encoding should be detected.
+        timeout: Maximum number of seconds to wait for the detection.
+
+    Raises:
+        TimeoutError: If detection does not finish within `timeout` seconds.
+        RuntimeError: If no encoding could be detected.
+    """
     import chardet
 
-    with open(file_path, "rb") as f:
-        rawdata = f.read()
-    encodings = chardet.detect_all(rawdata)
+    def read_and_detect(file_path: str) -> List[dict]:
+        with open(file_path, "rb") as f:
+            rawdata = f.read()
+        return cast(List[dict], chardet.detect_all(rawdata))
+
+    # Deliberately not a `with` block: leaving one calls shutdown(wait=True),
+    # which would block until detection finishes and defeat the timeout.
+    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+    try:
+        future = executor.submit(read_and_detect, file_path)
+        try:
+            encodings = future.result(timeout=timeout)
+        except concurrent.futures.TimeoutError:
+            raise TimeoutError(
+                f"Timeout reached while detecting encoding for {file_path}"
+            )
+    finally:
+        executor.shutdown(wait=False)
 
     if all(encoding["encoding"] is None for encoding in encodings):
         raise RuntimeError(f"Could not detect encoding for {file_path}")
     return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
-
diff --git a/langchain/document_loaders/text.py b/langchain/document_loaders/text.py
index 441d6ea4..8a888b9e 100644
--- a/langchain/document_loaders/text.py
+++ b/langchain/document_loaders/text.py
@@ -45,9 +45,7 @@ class TextLoader(BaseLoader):
                 for encoding in detected_encodings:
-                    logger.debug("Trying encoding: ", encoding.encoding)
+                    logger.debug("Trying encoding: %s", encoding.encoding)
                     try:
-                        with open(
-                            self.file_path, encoding=encoding.encoding
-                        ) as f:
+                        with open(self.file_path, encoding=encoding.encoding) as f:
                             text = f.read()
                         break
                     except UnicodeDecodeError:
@@ -59,5 +57,3 @@ class TextLoader(BaseLoader):
 
         metadata = {"source": self.file_path}
         return [Document(page_content=text, metadata=metadata)]
-
-
diff --git a/poetry.lock b/poetry.lock
index d84f3f91..8648fc46 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -10024,7 +10024,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
 azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
 cohere = ["cohere"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["pdfminer-six", "pypdf", "tqdm"]
+extended-testing = ["chardet", "pdfminer-six", "pypdf", "tqdm"]
 hnswlib = ["docarray", "hnswlib", "protobuf"]
 in-memory-store = ["docarray"]
 llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
@@ -10035,4 +10035,4 @@ text-helpers = ["chardet"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "5cba09d8e3153c466aced4763e838df2a81f39a87e9578fc56404088fb1e1cb1"
+content-hash = "6aa8b1e18b690223f337de8b345023edd3a9cacc923ea5773baec1eae67b4c44"
diff --git a/tests/unit_tests/document_loader/test_text_loader.py b/tests/unit_tests/document_loader/test_detect_encoding.py
similarity index 55%
rename from tests/unit_tests/document_loader/test_text_loader.py
rename to tests/unit_tests/document_loader/test_detect_encoding.py
index 1363da06..5cee5cd8 100644
--- a/tests/unit_tests/document_loader/test_text_loader.py
+++ b/tests/unit_tests/document_loader/test_detect_encoding.py
@@ -3,10 +3,11 @@ from pathlib import Path
 import pytest
 
 from langchain.document_loaders import DirectoryLoader, TextLoader
+from langchain.document_loaders.helpers import detect_file_encodings
 
 
 @pytest.mark.requires("chardet")
-def test_text_loader_detect_encodings() -> None:
+def test_loader_detect_encoding() -> None:
     """Test text loader."""
     path = Path(__file__).parent.parent / "examples"
     files = path.glob("**/*.txt")
@@ -23,3 +24,18 @@ def test_text_loader_detect_encodings() -> None:
 
     docs = loader_detect_encoding.load()
     assert len(docs) == len(list(files))
+
+
+@pytest.mark.skip(reason="slow test")
+@pytest.mark.requires("chardet")
+def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
+    path = Path(tmpdir)
+    file_path = str(path / "blob.txt")
+    # 2mb binary blob
+    with open(file_path, "wb") as f:
+        f.write(b"\x00" * 2_000_000)
+
+    with pytest.raises(TimeoutError):
+        detect_file_encodings(file_path, timeout=1)
+
+    detect_file_encodings(file_path, timeout=10)