diff --git a/docs/modules/indexes/document_loaders/examples/file_directory.ipynb b/docs/modules/indexes/document_loaders/examples/file_directory.ipynb
index 996f8f9db7..93b70aad73 100644
--- a/docs/modules/indexes/document_loaders/examples/file_directory.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/file_directory.ipynb
@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"id": "019d8520",
"metadata": {},
"outputs": [],
@@ -151,7 +151,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 15,
"id": "81c92da3",
"metadata": {},
"outputs": [],
@@ -210,7 +210,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 14,
"id": "c558bd73",
"metadata": {},
"outputs": [],
@@ -259,13 +259,292 @@
"len(docs)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "6411a0cb",
+ "metadata": {},
+ "source": [
+ "## Auto detect file encodings with TextLoader\n",
+ "\n",
+    "In this example we will see some strategies that can be useful when loading a large number of arbitrary files from a directory using the `TextLoader` class.\n",
+ "\n",
+    "First, to illustrate the problem, let's try to load multiple text files with arbitrary encodings."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
- "id": "6a91a0bc",
+ "execution_count": 16,
+ "id": "2c787a69",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path = '../../../../../tests/integration_tests/examples'\n",
+ "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9001e12",
+ "metadata": {},
+ "source": [
+ "### A. Default Behavior"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "b1e88c31",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+       "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+ "│ /data/source/langchain/langchain/document_loaders/text.py:29 in load │\n",
+ "│ │\n",
+ "│ 26 │ │ text = \"\" │\n",
+ "│ 27 │ │ with open(self.file_path, encoding=self.encoding) as f: │\n",
+ "│ 28 │ │ │ try: │\n",
+ "│ ❱ 29 │ │ │ │ text = f.read() │\n",
+ "│ 30 │ │ │ except UnicodeDecodeError as e: │\n",
+ "│ 31 │ │ │ │ if self.autodetect_encoding: │\n",
+ "│ 32 │ │ │ │ │ detected_encodings = self.detect_file_encodings() │\n",
+ "│ │\n",
+ "│ /home/spike/.pyenv/versions/3.9.11/lib/python3.9/codecs.py:322 in decode │\n",
+ "│ │\n",
+ "│ 319 │ def decode(self, input, final=False): │\n",
+ "│ 320 │ │ # decode input (taking the buffer into account) │\n",
+ "│ 321 │ │ data = self.buffer + input │\n",
+ "│ ❱ 322 │ │ (result, consumed) = self._buffer_decode(data, self.errors, final) │\n",
+ "│ 323 │ │ # keep undecoded input until the next call │\n",
+ "│ 324 │ │ self.buffer = data[consumed:] │\n",
+ "│ 325 │ │ return result │\n",
+ "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+ "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xca in position 0: invalid continuation byte\n",
+ "\n",
+ "The above exception was the direct cause of the following exception:\n",
+ "\n",
+ "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+ "│ in <module>:1 │\n",
+ "│ │\n",
+ "│ ❱ 1 loader.load() │\n",
+ "│ 2 │\n",
+ "│ │\n",
+ "│ /data/source/langchain/langchain/document_loaders/directory.py:84 in load │\n",
+ "│ │\n",
+ "│ 81 │ │ │ │ │ │ if self.silent_errors: │\n",
+ "│ 82 │ │ │ │ │ │ │ logger.warning(e) │\n",
+ "│ 83 │ │ │ │ │ │ else: │\n",
+ "│ ❱ 84 │ │ │ │ │ │ │ raise e │\n",
+ "│ 85 │ │ │ │ │ finally: │\n",
+ "│ 86 │ │ │ │ │ │ if pbar: │\n",
+ "│ 87 │ │ │ │ │ │ │ pbar.update(1) │\n",
+ "│ │\n",
+ "│ /data/source/langchain/langchain/document_loaders/directory.py:78 in load │\n",
+ "│ │\n",
+ "│ 75 │ │ │ if i.is_file(): │\n",
+ "│ 76 │ │ │ │ if _is_visible(i.relative_to(p)) or self.load_hidden: │\n",
+ "│ 77 │ │ │ │ │ try: │\n",
+ "│ ❱ 78 │ │ │ │ │ │ sub_docs = self.loader_cls(str(i), **self.loader_kwargs).load() │\n",
+ "│ 79 │ │ │ │ │ │ docs.extend(sub_docs) │\n",
+ "│ 80 │ │ │ │ │ except Exception as e: │\n",
+ "│ 81 │ │ │ │ │ │ if self.silent_errors: │\n",
+ "│ │\n",
+ "│ /data/source/langchain/langchain/document_loaders/text.py:44 in load │\n",
+ "│ │\n",
+ "│ 41 │ │ │ │ │ │ except UnicodeDecodeError: │\n",
+ "│ 42 │ │ │ │ │ │ │ continue │\n",
+ "│ 43 │ │ │ │ else: │\n",
+ "│ ❱ 44 │ │ │ │ │ raise RuntimeError(f\"Error loading {self.file_path}\") from e │\n",
+ "│ 45 │ │ │ except Exception as e: │\n",
+ "│ 46 │ │ │ │ raise RuntimeError(f\"Error loading {self.file_path}\") from e │\n",
+ "│ 47 │\n",
+ "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+ "RuntimeError: Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n",
+       "\n"
+ ],
+ "text/plain": [
+ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m29\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m26 \u001b[0m\u001b[2m│ │ \u001b[0mtext = \u001b[33m\"\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m27 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mopen\u001b[0m(\u001b[96mself\u001b[0m.file_path, encoding=\u001b[96mself\u001b[0m.encoding) \u001b[94mas\u001b[0m f: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m28 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m29 \u001b[2m│ │ │ │ \u001b[0mtext = f.read() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m30 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m31 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.autodetect_encoding: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m32 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mdetected_encodings = \u001b[96mself\u001b[0m.detect_file_encodings() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/home/spike/.pyenv/versions/3.9.11/lib/python3.9/\u001b[0m\u001b[1;33mcodecs.py\u001b[0m:\u001b[94m322\u001b[0m in \u001b[92mdecode\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 319 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mdecode\u001b[0m(\u001b[96mself\u001b[0m, \u001b[96minput\u001b[0m, final=\u001b[94mFalse\u001b[0m): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 320 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# decode input (taking the buffer into account)\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 321 \u001b[0m\u001b[2m│ │ \u001b[0mdata = \u001b[96mself\u001b[0m.buffer + \u001b[96minput\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 322 \u001b[2m│ │ \u001b[0m(result, consumed) = \u001b[96mself\u001b[0m._buffer_decode(data, \u001b[96mself\u001b[0m.errors, final) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 323 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# keep undecoded input until the next call\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 324 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.buffer = data[consumed:] \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m 325 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m result \u001b[31m│\u001b[0m\n",
+ "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
+ "\u001b[1;91mUnicodeDecodeError: \u001b[0m\u001b[32m'utf-8'\u001b[0m codec can't decode byte \u001b[1;36m0xca\u001b[0m in position \u001b[1;36m0\u001b[0m: invalid continuation byte\n",
+ "\n",
+ "\u001b[3mThe above exception was the direct cause of the following exception:\u001b[0m\n",
+ "\n",
+ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+       "\u001b[31m│\u001b[0m in \u001b[92m<module>\u001b[0m:\u001b[94m1\u001b[0m                                                                                      \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 loader.load() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m84\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m82 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mlogger.warning(e) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m83 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m84 \u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m e \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m85 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mfinally\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m86 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m pbar: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m87 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mpbar.update(\u001b[94m1\u001b[0m) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m78\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m75 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m i.is_file(): \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m76 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m _is_visible(i.relative_to(p)) \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m.load_hidden: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m77 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m78 \u001b[2m│ │ │ │ │ │ \u001b[0msub_docs = \u001b[96mself\u001b[0m.loader_cls(\u001b[96mstr\u001b[0m(i), **\u001b[96mself\u001b[0m.loader_kwargs).load() \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m79 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mdocs.extend(sub_docs) \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m80 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m44\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m41 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m42 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mcontinue\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m43 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m44 \u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m45 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m46 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m│\u001b[0m \u001b[2m47 \u001b[0m \u001b[31m│\u001b[0m\n",
+ "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
+ "\u001b[1;91mRuntimeError: \u001b[0mError loading ..\u001b[35m/../../../../tests/integration_tests/examples/\u001b[0m\u001b[95mexample-non-utf8.txt\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "loader.load()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "da554f9a",
+ "metadata": {},
+ "source": [
+    "The file `example-non-utf8.txt` uses a different encoding, so the `load()` function fails with a helpful message indicating which file failed decoding.\n",
+    "\n",
+    "With the default behavior of `TextLoader`, any failure to load one of the documents will fail the whole loading process and no documents are loaded."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eb844b7e",
+ "metadata": {},
+ "source": [
+ "### B. Silent fail\n",
+ "\n",
+    "We can pass the parameter `silent_errors` to the `DirectoryLoader` to skip the files that could not be loaded and continue the loading process."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "5314ec39",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n"
+ ]
+ }
+ ],
+ "source": [
+ "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, silent_errors=True)\n",
+ "docs = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "216337e5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n",
+ " '../../../../../tests/integration_tests/examples/example-utf8.txt']"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "doc_sources = [doc.metadata['source'] for doc in docs]\n",
+ "doc_sources"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cba0e53",
+ "metadata": {},
+ "source": [
+ "### C. Auto detect encodings\n",
+ "\n",
+    "We can also ask `TextLoader` to auto-detect the file encoding before it fails, by passing the `autodetect_encoding` parameter to the loader class."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "96d527d2",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "text_loader_kwargs={'autodetect_encoding': True}\n",
+ "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n",
+ "docs = loader.load()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "f1a136a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['../../../../../tests/integration_tests/examples/example-non-utf8.txt',\n",
+ " '../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n",
+ " '../../../../../tests/integration_tests/examples/example-utf8.txt']"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "doc_sources = [doc.metadata['source'] for doc in docs]\n",
+ "doc_sources"
+ ]
}
],
"metadata": {
@@ -284,7 +563,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.9.11"
}
},
"nbformat": 4,
diff --git a/langchain/document_loaders/helpers.py b/langchain/document_loaders/helpers.py
new file mode 100644
index 0000000000..3ccf4f7d77
--- /dev/null
+++ b/langchain/document_loaders/helpers.py
@@ -0,0 +1,37 @@
+"""Document loader helpers."""
+
+import concurrent.futures
+from typing import List, NamedTuple, Optional, cast
+
+
+class FileEncoding(NamedTuple):
+    """A file encoding detected by `chardet`, with its confidence and language."""
+
+    encoding: Optional[str]
+    confidence: float
+    language: Optional[str]
+
+
+def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
+ """Try to detect the file encoding.
+
+ Returns a list of `FileEncoding` tuples with the detected encodings ordered
+ by confidence.
+ """
+ import chardet
+
+ def read_and_detect(file_path: str) -> List[dict]:
+ with open(file_path, "rb") as f:
+ rawdata = f.read()
+ return cast(List[dict], chardet.detect_all(rawdata))
+
+ with concurrent.futures.ThreadPoolExecutor() as executor:
+ future = executor.submit(read_and_detect, file_path)
+ try:
+ encodings = future.result(timeout=timeout)
+ except concurrent.futures.TimeoutError:
+ raise TimeoutError(
+ f"Timeout reached while detecting encoding for {file_path}"
+ )
+
+ if all(encoding["encoding"] is None for encoding in encodings):
+ raise RuntimeError(f"Could not detect encoding for {file_path}")
+ return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]
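For orientation, the helper can also be used on its own, outside of `TextLoader`. The snippet below is an illustrative sketch only (not part of the patch); it assumes `chardet` is installed and that `some_file.txt` is a hypothetical file on disk.

```python
from langchain.document_loaders.helpers import detect_file_encodings

# Candidate encodings come back ordered by chardet's confidence,
# so the first entry is the best guess.
encodings = detect_file_encodings("some_file.txt", timeout=5)
for enc in encodings:
    print(enc.encoding, enc.confidence, enc.language)

# Use the best guess to decode the file.
with open("some_file.txt", encoding=encodings[0].encoding) as f:
    text = f.read()
```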
diff --git a/langchain/document_loaders/text.py b/langchain/document_loaders/text.py
index ce7913d6d4..2b48115d89 100644
--- a/langchain/document_loaders/text.py
+++ b/langchain/document_loaders/text.py
@@ -1,20 +1,59 @@
+import logging
from typing import List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+
+logger = logging.getLogger(__name__)
class TextLoader(BaseLoader):
- """Load text files."""
+ """Load text files.
+
+
+ Args:
+ file_path: Path to the file to load.
- def __init__(self, file_path: str, encoding: Optional[str] = None):
+ encoding: File encoding to use. If `None`, the file will be loaded
+            with the default system encoding.
+
+ autodetect_encoding: Whether to try to autodetect the file encoding
+ if the specified encoding fails.
+ """
+
+ def __init__(
+ self,
+ file_path: str,
+ encoding: Optional[str] = None,
+ autodetect_encoding: bool = False,
+ ):
"""Initialize with file path."""
self.file_path = file_path
self.encoding = encoding
+ self.autodetect_encoding = autodetect_encoding
def load(self) -> List[Document]:
"""Load from file path."""
- with open(self.file_path, encoding=self.encoding) as f:
- text = f.read()
+ text = ""
+ try:
+ with open(self.file_path, encoding=self.encoding) as f:
+ text = f.read()
+ except UnicodeDecodeError as e:
+ if self.autodetect_encoding:
+ detected_encodings = detect_file_encodings(self.file_path)
+ for encoding in detected_encodings:
+                    logger.debug("Trying encoding: %s", encoding.encoding)
+ try:
+ with open(self.file_path, encoding=encoding.encoding) as f:
+ text = f.read()
+ break
+ except UnicodeDecodeError:
+ continue
+ else:
+ raise RuntimeError(f"Error loading {self.file_path}") from e
+ except Exception as e:
+ raise RuntimeError(f"Error loading {self.file_path}") from e
+
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]
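A minimal usage sketch of the new `autodetect_encoding` flag on a single file (illustrative only; it assumes `chardet` is installed and that `example-non-utf8.txt` is a non-UTF-8 text file in the working directory):

```python
from langchain.document_loaders import TextLoader

# Without autodetect_encoding=True this load would raise
# RuntimeError("Error loading example-non-utf8.txt").
loader = TextLoader("example-non-utf8.txt", autodetect_encoding=True)
docs = loader.load()
print(docs[0].metadata["source"])
```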
diff --git a/poetry.lock b/poetry.lock
index cb324da198..afef457d9c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1022,6 +1022,18 @@ files = [
[package.dependencies]
pycparser = "*"
+[[package]]
+name = "chardet"
+version = "5.1.0"
+description = "Universal encoding detector for Python 3"
+category = "main"
+optional = true
+python-versions = ">=3.7"
+files = [
+ {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"},
+ {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"},
+]
+
[[package]]
name = "charset-normalizer"
version = "3.1.0"
@@ -1510,13 +1522,13 @@ files = [
[[package]]
name = "deeplake"
-version = "3.4.4"
+version = "3.5.0"
description = "Activeloop Deep Lake"
category = "main"
optional = false
python-versions = "*"
files = [
- {file = "deeplake-3.4.4.tar.gz", hash = "sha256:7d044a89862fe6aa4a93abc7db90090be7ecfc611169916e3975e0cd187be1ac"},
+ {file = "deeplake-3.5.0.tar.gz", hash = "sha256:ae640f75b1fec4eed9598a4e8d6b80e7243af87d9f39f0d34f01ce2c6f7c194f"},
]
[package.dependencies]
@@ -3138,13 +3150,13 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec
[[package]]
name = "jcloud"
-version = "0.2.8"
+version = "0.2.9"
description = "Simplify deploying and managing Jina projects on Jina Cloud"
category = "main"
optional = true
python-versions = "*"
files = [
- {file = "jcloud-0.2.8.tar.gz", hash = "sha256:cdd30c85c0a857573651ebc329f52a8de9e43c3e0f276dc85975914006295639"},
+ {file = "jcloud-0.2.9.tar.gz", hash = "sha256:eaf8da685f8907e153ff752e6a4b945aeff548b8d94dfd650a035d8450ed547e"},
]
[package.dependencies]
@@ -3297,14 +3309,14 @@ websockets = ["websockets"]
[[package]]
name = "jina-hubble-sdk"
-version = "0.36.0"
+version = "0.37.1"
description = "SDK for Hubble API at Jina AI."
category = "main"
optional = true
python-versions = ">=3.7.0"
files = [
- {file = "jina-hubble-sdk-0.36.0.tar.gz", hash = "sha256:ba1a72c7a5c14963fdad9af1ff4c3bba26a03ddcced08111bd11a95b153249ec"},
- {file = "jina_hubble_sdk-0.36.0-py3-none-any.whl", hash = "sha256:56db142147d7c72142ed1b6505020c3b4d8070b1603d07ff796e56d491dde294"},
+ {file = "jina-hubble-sdk-0.37.1.tar.gz", hash = "sha256:5b6bd9e13f97c8c77be822e9ae49f87a0f16ef8195011f25db2552006a5ca2a0"},
+ {file = "jina_hubble_sdk-0.37.1-py3-none-any.whl", hash = "sha256:bc54a60ed120508e231fbed28b0fff394c288312d2ffa865c7865c03dbbbb502"},
]
[package.dependencies]
@@ -6608,6 +6620,7 @@ files = [
{file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"},
{file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"},
{file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"},
+ {file = "pylance-0.4.12-cp38-abi3-win_amd64.whl", hash = "sha256:9616931c5300030adb9626d22515710a127d1e46a46737a7a0f980b52f13627c"},
]
[package.dependencies]
@@ -9367,6 +9380,18 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
+[[package]]
+name = "types-chardet"
+version = "5.0.4.6"
+description = "Typing stubs for chardet"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+ {file = "types-chardet-5.0.4.6.tar.gz", hash = "sha256:caf4c74cd13ccfd8b3313c314aba943b159de562a2573ed03137402b2bb37818"},
+ {file = "types_chardet-5.0.4.6-py3-none-any.whl", hash = "sha256:ea832d87e798abf1e4dfc73767807c2b7fee35d0003ae90348aea4ae00fb004d"},
+]
+
[[package]]
name = "types-pyopenssl"
version = "23.1.0.3"
@@ -9762,14 +9787,14 @@ files = [
[[package]]
name = "weaviate-client"
-version = "3.18.0"
+version = "3.19.1"
description = "A python native weaviate client"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
- {file = "weaviate-client-3.18.0.tar.gz", hash = "sha256:423a526518a32505c5293328e5f252e6cbbf20e4b3124733f70d10fc0d6823c9"},
- {file = "weaviate_client-3.18.0-py3-none-any.whl", hash = "sha256:42b324286a4b4436317e5d2c6ba48c07da6cf01518efdd47ee097e7a8cc7584c"},
+ {file = "weaviate-client-3.19.1.tar.gz", hash = "sha256:8528926b0b545225ab75583481d67cccf9494d2dc01cb62f1165a8f187b41ebb"},
+ {file = "weaviate_client-3.19.1-py3-none-any.whl", hash = "sha256:6b4aae86cd955543fcc6328bc1462fbbae053dc50f7b6822287b05b98fec0d27"},
]
[package.dependencies]
@@ -10309,14 +10334,15 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
cohere = ["cohere"]
embeddings = ["sentence-transformers"]
-extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"]
+extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "chardet", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"]
hnswlib = ["docarray", "hnswlib", "protobuf"]
in-memory-store = ["docarray"]
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
openai = ["openai", "tiktoken"]
qdrant = ["qdrant-client"]
+text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
-content-hash = "055d65314e800e0731b086471d357e5fb6bbd265ee9fa2bd3762470152cc3b85"
+content-hash = "2b19b9deca7f83ca14af1f7bc7808bbe7873a91ce4c95381eaad8ea84fe04c0b"
diff --git a/pyproject.toml b/pyproject.toml
index 7c86d733e6..0164205464 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,6 +89,8 @@ gql = {version = "^3.4.1", optional = true}
pandas = {version = "^2.0.1", optional = true}
telethon = {version = "^1.28.5", optional = true}
zep-python = {version="^0.25", optional=true}
+chardet = {version="^5.1.0", optional=true}
+
[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
@@ -156,6 +158,7 @@ ruff = "^0.0.249"
types-toml = "^0.10.8.1"
types-redis = "^4.3.21.6"
black = "^23.1.0"
+types-chardet = "^5.0.4.6"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
@@ -174,6 +177,7 @@ setuptools = "^67.6.1"
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
qdrant = ["qdrant-client"]
openai = ["openai", "tiktoken"]
+text_helpers = ["chardet"]
cohere = ["cohere"]
in_memory_store = ["docarray"]
hnswlib = ["docarray", "protobuf", "hnswlib"]
@@ -185,6 +189,7 @@ all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "
# merge-conflicts
extended_testing = [
"beautifulsoup4",
+ "chardet",
"jq",
"pdfminer.six",
"pypdf",
diff --git a/tests/unit_tests/document_loaders/test_detect_encoding.py b/tests/unit_tests/document_loaders/test_detect_encoding.py
new file mode 100644
index 0000000000..5cee5cd875
--- /dev/null
+++ b/tests/unit_tests/document_loaders/test_detect_encoding.py
@@ -0,0 +1,41 @@
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders import DirectoryLoader, TextLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+
+
+@pytest.mark.requires("chardet")
+def test_loader_detect_encoding() -> None:
+ """Test text loader."""
+ path = Path(__file__).parent.parent / "examples"
+ files = path.glob("**/*.txt")
+ loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader)
+ loader_detect_encoding = DirectoryLoader(
+ str(path),
+ glob="**/*.txt",
+ loader_kwargs={"autodetect_encoding": True},
+ loader_cls=TextLoader,
+ )
+
+ with pytest.raises((UnicodeDecodeError, RuntimeError)):
+ loader.load()
+
+ docs = loader_detect_encoding.load()
+ assert len(docs) == len(list(files))
+
+
+@pytest.mark.skip(reason="slow test")
+@pytest.mark.requires("chardet")
+def test_loader_detect_encoding_timeout(tmpdir: str) -> None:
+ path = Path(tmpdir)
+ file_path = str(path / "blob.txt")
+ # 2mb binary blob
+ with open(file_path, "wb") as f:
+ f.write(b"\x00" * 2_000_000)
+
+ with pytest.raises(TimeoutError):
+ detect_file_encodings(file_path, timeout=1)
+
+ detect_file_encodings(file_path, timeout=10)
diff --git a/tests/unit_tests/examples/example-non-utf8.txt b/tests/unit_tests/examples/example-non-utf8.txt
new file mode 100644
index 0000000000..60cbb2073e
--- /dev/null
+++ b/tests/unit_tests/examples/example-non-utf8.txt
@@ -0,0 +1 @@
+-
diff --git a/tests/unit_tests/examples/example-utf8.txt b/tests/unit_tests/examples/example-utf8.txt
new file mode 100644
index 0000000000..1bb51996cd
--- /dev/null
+++ b/tests/unit_tests/examples/example-utf8.txt
@@ -0,0 +1,6 @@
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
+incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
+nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
+fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
+culpa qui officia deserunt mollit anim id est laborum.