langchain/tests/unit_tests/document_loaders/test_detect_encoding.py
Eugene Yurtsev e46202829f
feat #4479: TextLoader auto detect encoding and improved exceptions (#4927)
# TextLoader auto detect encoding and enhanced exception handling

- Add an option to enable encoding detection on `TextLoader`. 
- The detection is done using `chardet`
- The loading is done by trying all detected encodings by order of
confidence or raise an exception otherwise.

### New Dependencies:
- `chardet`

Fixes #4479 

## Before submitting

<!-- If you're adding a new integration, include an integration test and
an example notebook showing its use! -->

## Who can review?

Community members can review the PR once tests pass. Tag
maintainers/contributors who might be interested:

- @eyurtsev

---------

Co-authored-by: blob42 <spike@w530>
2023-05-18 09:55:14 -04:00

42 lines
1.2 KiB
Python

from pathlib import Path
import pytest
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.document_loaders.helpers import detect_file_encodings
@pytest.mark.requires("chardet")
def test_loader_detect_encoding() -> None:
"""Test text loader."""
path = Path(__file__).parent.parent / "examples"
files = path.glob("**/*.txt")
loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
loader_detect_encoding = DirectoryLoader(
str(path),
glob="**/*.txt",
loader_kwargs={"autodetect_encoding": True},
loader_cls=TextLoader,
)
with pytest.raises((UnicodeDecodeError, RuntimeError)):
loader.load()
docs = loader_detect_encoding.load()
assert len(docs) == len(list(files))
@pytest.mark.skip(reason="slow test")
@pytest.mark.requires("chardet")
def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
path = Path(tmpdir)
file_path = str(path / "blob.txt")
# 2mb binary blob
with open(file_path, "wb") as f:
f.write(b"\x00" * 2_000_000)
with pytest.raises(TimeoutError):
detect_file_encodings(file_path, timeout=1)
detect_file_encodings(file_path, timeout=10)