Add encoding parameter to TextLoader (#2250)

This merge request proposes changes to the TextLoader class to make it
more flexible and robust when handling text files with different
encodings. The current implementation of TextLoader always opens the
file as UTF-8 and does not provide a way to specify a different encoding.
As a result, files stored in other encodings may fail to decode or may be
loaded with corrupted content.

Benefits:
- The proposed changes will make the TextLoader class more flexible,
allowing it to handle text files with different encodings.
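- The changes maintain backward compatibility at the API level, as the
encoding parameter is optional. Note, however, that the default changes
from the previously hard-coded "utf-8" to None, in which case open() uses
the platform's default encoding.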
This commit is contained in:
Travis Hammond 2023-04-01 08:57:17 -07:00 committed by GitHub
parent 67dde7d893
commit e49284acde
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -1,5 +1,4 @@
"""Load text files.""" from typing import List, Optional
from typing import List
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@@ -8,13 +7,14 @@ from langchain.document_loaders.base import BaseLoader
class TextLoader(BaseLoader): class TextLoader(BaseLoader):
"""Load text files.""" """Load text files."""
def __init__(self, file_path: str): def __init__(self, file_path: str, encoding: Optional[str] = None):
"""Initialize with file path.""" """Initialize with file path."""
self.file_path = file_path self.file_path = file_path
self.encoding = encoding
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load from file path.""" """Load from file path."""
with open(self.file_path, encoding="utf-8") as f: with open(self.file_path, encoding=self.encoding) as f:
text = f.read() text = f.read()
metadata = {"source": self.file_path} metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)] return [Document(page_content=text, metadata=metadata)]