move encoding detection to own file (#4479)

- encoding detection returns meaningful encodings schema
parallel_dir_loader_back
blob42 1 year ago
parent 760a4f72f4
commit 4a8b3d230c

@ -0,0 +1,23 @@
"""Document loader helpers."""
from typing import List, NamedTuple, Optional
class FileEncoding(NamedTuple):
    """One encoding candidate reported for a file.

    The field names mirror the keys of the dicts that
    ``detect_file_encodings`` unpacks into this type.
    """

    # Name of the candidate encoding; typed as optional, so it may be None.
    encoding: Optional[str]
    # Confidence value attached to this candidate by the detector.
    confidence: float
    # Language associated with the candidate; may be None.
    language: Optional[str]
def detect_file_encodings(file_path: str) -> List[FileEncoding]:
    """Detect the candidate encodings of a file using ``chardet``.

    Args:
        file_path: Path of the file to inspect. The file is read in full,
            in binary mode.

    Returns:
        One ``FileEncoding`` per candidate whose ``"encoding"`` entry is not
        ``None``, in the order ``chardet.detect_all`` returned them.

    Raises:
        RuntimeError: If no candidate carries an encoding.
    """
    # Function-scope import: chardet is only needed when this function runs.
    import chardet

    with open(file_path, "rb") as f:
        candidates = chardet.detect_all(f.read())

    # Drop candidates without an encoding; each remaining dict is unpacked
    # by keyword into the FileEncoding fields.
    detected = [
        FileEncoding(**candidate)
        for candidate in candidates
        if candidate["encoding"] is not None
    ]
    if not detected:
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return detected

@ -1,8 +1,9 @@
import logging
from typing import List, Optional, cast
from typing import List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
logger = logging.getLogger(__name__)
@ -42,10 +43,10 @@ class TextLoader(BaseLoader):
if self.autodetect_encoding:
detected_encodings = detect_file_encodings(self.file_path)
for encoding in detected_encodings:
logger.debug("Trying encoding: ", encoding["encoding"])
logger.debug("Trying encoding: ", encoding.encoding)
try:
with open(
self.file_path, encoding=encoding["encoding"]
self.file_path, encoding=encoding.encoding
) as f:
text = f.read()
break
@ -59,17 +60,4 @@ class TextLoader(BaseLoader):
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]
def detect_file_encodings(file_path: str) -> List[dict]:
    """Detect the candidate encodings of a file using ``chardet``.

    Args:
        file_path: Path of the file to inspect. The file is read in full,
            in binary mode.

    Returns:
        The dicts returned by ``chardet.detect_all`` whose ``"encoding"``
        entry is not ``None``, in their original order.

    Raises:
        RuntimeError: If no candidate carries an encoding.
    """
    # Function-scope import: chardet is only needed when this function runs.
    import chardet

    with open(file_path, "rb") as f:
        candidates = chardet.detect_all(f.read())

    # Keep only the candidates that actually name an encoding.
    usable = [
        candidate for candidate in candidates if candidate["encoding"] is not None
    ]
    if not usable:
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    # The cast only informs the type checker; it does nothing at runtime.
    return cast(List[dict], usable)

@ -4,23 +4,22 @@ import pytest
from langchain.document_loaders import DirectoryLoader, TextLoader
class TestTextLoader:
@pytest.mark.requires("chardet")
def test_load_directory(self) -> None:
"""Test text loader."""
path = Path(__file__).parent.parent / "examples"
files = path.glob("**/*.txt")
loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
loader_detect_encoding = DirectoryLoader(
str(path),
glob="**/*.txt",
loader_kwargs={"autodetect_encoding": True},
loader_cls=TextLoader,
)
@pytest.mark.requires("chardet")
def test_text_loader_detect_encodings() -> None:
"""Test text loader."""
path = Path(__file__).parent.parent / "examples"
files = path.glob("**/*.txt")
loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
loader_detect_encoding = DirectoryLoader(
str(path),
glob="**/*.txt",
loader_kwargs={"autodetect_encoding": True},
loader_cls=TextLoader,
)
with pytest.raises((UnicodeDecodeError, RuntimeError)):
loader.load()
with pytest.raises((UnicodeDecodeError, RuntimeError)):
loader.load()
docs = loader_detect_encoding.load()
assert len(docs) == len(list(files))
docs = loader_detect_encoding.load()
assert len(docs) == len(list(files))

Loading…
Cancel
Save