langchain/libs/community/tests/unit_tests/document_loaders/test_detect_encoding.py

from pathlib import Path
import pytest

from langchain_community.document_loaders import (
    CSVLoader,
    DirectoryLoader,
    TextLoader,
)
from langchain_community.document_loaders.helpers import detect_file_encodings


@pytest.mark.requires("chardet")
def test_loader_detect_encoding_text() -> None:
    """Test text loader."""
    path = Path(__file__).parent.parent / "examples"
    files = path.glob("**/*.txt")
    loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
    loader_detect_encoding = DirectoryLoader(
        str(path),
        glob="**/*.txt",
        loader_kwargs={"autodetect_encoding": True},
        loader_cls=TextLoader,  # type: ignore
    )

    with pytest.raises((UnicodeDecodeError, RuntimeError)):
        loader.load()

    docs = loader_detect_encoding.load()
    assert len(docs) == len(list(files))
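

# Illustrative sketch (not part of the original test file): DirectoryLoader
# forwards ``loader_kwargs`` to each TextLoader it creates, so loading a single
# non-UTF-8 file with encoding auto-detection looks roughly like the helper
# below. The file name is hypothetical.
def _demo_autodetect_single_file() -> None:
    example = Path(__file__).parent.parent / "examples" / "some-non-utf8.txt"
    docs = TextLoader(str(example), autodetect_encoding=True).load()
    assert len(docs) == 1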


@pytest.mark.requires("chardet")
def test_loader_detect_encoding_csv() -> None:
    """Test csv loader."""
    path = Path(__file__).parent.parent / "examples"
    files = path.glob("**/*.csv")

    # Count the number of lines.
    row_count = 0
    for file in files:
        encodings = detect_file_encodings(str(file))
        for encoding in encodings:
            try:
                row_count += sum(
                    1 for line in open(file, encoding=encoding.encoding)
                )
                break
            except UnicodeDecodeError:
                continue
    # CSVLoader uses DictReader, and one line per file is a header,
    # so subtract the number of files.
    row_count -= 1

    loader = DirectoryLoader(
        str(path),
        glob="**/*.csv",
        loader_cls=CSVLoader,  # type: ignore
    )
    loader_detect_encoding = DirectoryLoader(
        str(path),
        glob="**/*.csv",
        loader_kwargs={"autodetect_encoding": True},
        loader_cls=CSVLoader,  # type: ignore
    )

    with pytest.raises((UnicodeDecodeError, RuntimeError)):
        loader.load()

    docs = loader_detect_encoding.load()
    assert len(docs) == row_count


@pytest.mark.skip(reason="slow test")
@pytest.mark.requires("chardet")
def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
    path = Path(tmpdir)
    file_path = str(path / "blob.txt")
    # 2mb binary blob
    with open(file_path, "wb") as f:
        f.write(b"\x00" * 2_000_000)

    with pytest.raises(TimeoutError):
        detect_file_encodings(file_path, timeout=1)

    detect_file_encodings(file_path, timeout=10)
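

# Illustrative sketch (not part of the original tests): detect_file_encodings
# can also be used directly to find a working encoding for a file, mirroring
# the loop in test_loader_detect_encoding_csv above. The default path is
# hypothetical.
def _demo_detect_and_read(file_path: str = "examples/example.csv") -> str:
    for candidate in detect_file_encodings(file_path):
        try:
            with open(file_path, encoding=candidate.encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    raise RuntimeError(f"Could not decode {file_path} with detected encodings")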