You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
langchain/langchain/document_loaders/helpers.py

24 lines
656 B
Python

"""Document loader helpers."""
from typing import List, NamedTuple, Optional
class FileEncoding(NamedTuple):
    """A single encoding candidate: its name, a confidence score and a language."""

    # Name of the candidate encoding; typed Optional, so it may be None.
    encoding: Optional[str]
    # Score attached to this candidate.
    confidence: float
    # Language associated with the candidate; typed Optional, so it may be None.
    language: Optional[str]
def detect_file_encodings(file_path: str) -> List[FileEncoding]:
    """Return the encoding candidates chardet reports for a file.

    The file is read in full in binary mode and passed to
    ``chardet.detect_all``; candidates whose ``encoding`` entry is ``None``
    are left out of the result.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        One ``FileEncoding`` per candidate that names an encoding, in the
        order ``chardet.detect_all`` returned them.

    Raises:
        RuntimeError: If no candidate names an encoding.
    """
    # Imported inside the function, so chardet is only required when called.
    import chardet

    with open(file_path, "rb") as handle:
        payload = handle.read()

    # Keep only the guesses that actually name an encoding.
    candidates = [
        FileEncoding(**guess)
        for guess in chardet.detect_all(payload)
        if guess["encoding"] is not None
    ]
    # An empty list means every guess (if there were any) had encoding None.
    if not candidates:
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return candidates