forked from Archives/langchain
timeout on file encoding detection (#4481)
This commit is contained in:
parent
5837d18cbe
commit
3805aef263
@ -1,6 +1,7 @@
|
||||
"""Document loader helpers."""
|
||||
|
||||
from typing import List, NamedTuple, Optional
|
||||
import concurrent.futures
|
||||
from typing import List, NamedTuple, Optional, cast
|
||||
|
||||
|
||||
class FileEncoding(NamedTuple):
|
||||
@ -9,15 +10,28 @@ class FileEncoding(NamedTuple):
|
||||
language: Optional[str]
|
||||
|
||||
|
||||
def detect_file_encodings(file_path: str) -> List[FileEncoding]:
|
||||
"""Try to detect the file encoding."""
|
||||
def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
    """Try to detect the file encoding.

    Returns a list of `FileEncoding` tuples with the detected encodings ordered
    by confidence.

    Args:
        file_path: Path of the file whose encoding should be detected.
        timeout: Maximum number of seconds to wait for the detection to finish.

    Returns:
        One `FileEncoding` per candidate reported by ``chardet.detect_all``
        whose ``encoding`` entry is not ``None``.

    Raises:
        TimeoutError: If detection does not finish within ``timeout`` seconds.
        RuntimeError: If no candidate with a non-``None`` encoding is found.
    """
    import chardet

    def read_and_detect(file_path: str) -> List[dict]:
        # Read the whole file as bytes; chardet works on raw data.
        with open(file_path, "rb") as f:
            rawdata = f.read()
        return cast(List[dict], chardet.detect_all(rawdata))

    # Do NOT use the executor as a context manager: its __exit__ calls
    # shutdown(wait=True), which would block until the worker finishes and
    # make the timeout below ineffective.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(read_and_detect, file_path)
        try:
            encodings = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError as exc:
            raise TimeoutError(
                f"Timeout reached while detecting encoding for {file_path}"
            ) from exc
    finally:
        # Return control to the caller immediately. A timed-out worker cannot
        # be cancelled and keeps running in the background until it is done.
        executor.shutdown(wait=False)

    # Keep only usable candidates; an empty result means detection failed.
    detected = [
        FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None
    ]
    if not detected:
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return detected
|
||||
|
||||
|
@ -45,9 +45,7 @@ class TextLoader(BaseLoader):
|
||||
for encoding in detected_encodings:
|
||||
logger.debug("Trying encoding: ", encoding.encoding)
|
||||
try:
|
||||
with open(
|
||||
self.file_path, encoding=encoding.encoding
|
||||
) as f:
|
||||
with open(self.file_path, encoding=encoding.encoding) as f:
|
||||
text = f.read()
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
@ -59,5 +57,3 @@ class TextLoader(BaseLoader):
|
||||
|
||||
metadata = {"source": self.file_path}
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
||||
|
||||
|
4
poetry.lock
generated
4
poetry.lock
generated
@ -10024,7 +10024,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
|
||||
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
|
||||
cohere = ["cohere"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["pdfminer-six", "pypdf", "tqdm"]
|
||||
extended-testing = ["chardet", "pdfminer-six", "pypdf", "tqdm"]
|
||||
hnswlib = ["docarray", "hnswlib", "protobuf"]
|
||||
in-memory-store = ["docarray"]
|
||||
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
|
||||
@ -10035,4 +10035,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "5cba09d8e3153c466aced4763e838df2a81f39a87e9578fc56404088fb1e1cb1"
|
||||
content-hash = "6aa8b1e18b690223f337de8b345023edd3a9cacc923ea5773baec1eae67b4c44"
|
||||
|
@ -3,10 +3,11 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import DirectoryLoader, TextLoader
|
||||
from langchain.document_loaders.helpers import detect_file_encodings
|
||||
|
||||
|
||||
@pytest.mark.requires("chardet")
|
||||
def test_text_loader_detect_encodings() -> None:
|
||||
def test_loader_detect_encoding() -> None:
|
||||
"""Test text loader."""
|
||||
path = Path(__file__).parent.parent / "examples"
|
||||
files = path.glob("**/*.txt")
|
||||
@ -23,3 +24,18 @@ def test_text_loader_detect_encodings() -> None:
|
||||
|
||||
docs = loader_detect_encoding.load()
|
||||
assert len(docs) == len(list(files))
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="slow test")
@pytest.mark.requires("chardet")
def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
    """Check that encoding detection honours its ``timeout`` argument.

    A large all-zero file is written to the temporary directory; detection is
    expected to raise `TimeoutError` with a 1 second budget and to finish
    without a timeout when given 10 seconds.
    """
    blob = Path(tmpdir) / "blob.txt"
    # 2 MB binary blob
    blob.write_bytes(b"\x00" * 2_000_000)
    file_path = str(blob)

    # Too little time: the call must give up.
    with pytest.raises(TimeoutError):
        detect_file_encodings(file_path, timeout=1)

    # Generous budget: the same call must not time out.
    detect_file_encodings(file_path, timeout=10)
|
Loading…
Reference in New Issue
Block a user