From e46202829f30cf03ff25254adccef06184ffdcba Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 18 May 2023 09:55:14 -0400 Subject: [PATCH] feat #4479: TextLoader auto detect encoding and improved exceptions (#4927) # TextLoader auto detect encoding and enhanced exception handling - Add an option to enable encoding detection on `TextLoader`. - The detection is done using `chardet`. - The loading is done by trying all detected encodings in order of confidence, raising an exception if none of them succeeds. ### New Dependencies: - `chardet` Fixes #4479 ## Before submitting ## Who can review? Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: - @eyurtsev --------- Co-authored-by: blob42 --- .../examples/file_directory.ipynb | 293 +++++++++++++++++- langchain/document_loaders/helpers.py | 37 +++ langchain/document_loaders/text.py | 47 ++- poetry.lock | 50 ++- pyproject.toml | 5 + .../document_loaders/test_detect_encoding.py | 41 +++ .../unit_tests/examples/example-non-utf8.txt | 1 + tests/unit_tests/examples/example-utf8.txt | 6 + 8 files changed, 457 insertions(+), 23 deletions(-) create mode 100644 langchain/document_loaders/helpers.py create mode 100644 tests/unit_tests/document_loaders/test_detect_encoding.py create mode 100644 tests/unit_tests/examples/example-non-utf8.txt create mode 100644 tests/unit_tests/examples/example-utf8.txt diff --git a/docs/modules/indexes/document_loaders/examples/file_directory.ipynb b/docs/modules/indexes/document_loaders/examples/file_directory.ipynb index 996f8f9d..93b70aad 100644 --- a/docs/modules/indexes/document_loaders/examples/file_directory.ipynb +++ b/docs/modules/indexes/document_loaders/examples/file_directory.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "019d8520", "metadata": {}, "outputs": [], @@ -151,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "id": "81c92da3", "metadata": {}, "outputs": [], @@ -210,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "id": "c558bd73", "metadata": {}, "outputs": [], @@ -259,13 +259,292 @@ "len(docs)" ] }, + { + "cell_type": "markdown", + "id": "6411a0cb", + "metadata": {}, + "source": [ + "## Auto detect file encodings with TextLoader\n", + "\n", + "In this example we will see some strategies that can be useful when loading a large number of arbitrary files from a directory using the `TextLoader` class.\n", + "\n", + "First, to illustrate the problem, let's try to load multiple text files with arbitrary encodings." ] }, { "cell_type": "code", - "execution_count": null, - "id": "6a91a0bc", + "execution_count": 16, + "id": "2c787a69", + "metadata": {}, + "outputs": [], + "source": [ + "path = '../../../../../tests/integration_tests/examples'\n", + "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader)" ] }, + { + "cell_type": "markdown", + "id": "e9001e12", + "metadata": {}, + "source": [ + "### A. Default Behavior" ] }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b1e88c31", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " /data/source/langchain/langchain/document_loaders/text.py:29 in load                             \n",
+       "                                                                                                  \n",
+       "   26 │   │   text = \"\"                                                                           \n",
+       "   27 │   │   with open(self.file_path, encoding=self.encoding) as f:                             \n",
+       "   28 │   │   │   try:                                                                            \n",
+       " 29 │   │   │   │   text = f.read()                                                             \n",
+       "   30 │   │   │   except UnicodeDecodeError as e:                                                 \n",
+       "   31 │   │   │   │   if self.autodetect_encoding:                                                \n",
+       "   32 │   │   │   │   │   detected_encodings = self.detect_file_encodings()                       \n",
+       "                                                                                                  \n",
+       " /home/spike/.pyenv/versions/3.9.11/lib/python3.9/codecs.py:322 in decode                         \n",
+       "                                                                                                  \n",
+       "    319 def decode(self, input, final=False):                                                 \n",
+       "    320 │   │   # decode input (taking the buffer into account)                                   \n",
+       "    321 │   │   data = self.buffer + input                                                        \n",
+       "  322 │   │   (result, consumed) = self._buffer_decode(data, self.errors, final)                \n",
+       "    323 │   │   # keep undecoded input until the next call                                        \n",
+       "    324 │   │   self.buffer = data[consumed:]                                                     \n",
+       "    325 │   │   return result                                                                     \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xca in position 0: invalid continuation byte\n",
+       "\n",
+       "The above exception was the direct cause of the following exception:\n",
+       "\n",
+       "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:1                                                                                    \n",
+       "                                                                                                  \n",
+       " 1 loader.load()                                                                                \n",
+       "   2                                                                                              \n",
+       "                                                                                                  \n",
+       " /data/source/langchain/langchain/document_loaders/directory.py:84 in load                        \n",
+       "                                                                                                  \n",
+       "   81 │   │   │   │   │   │   if self.silent_errors:                                              \n",
+       "   82 │   │   │   │   │   │   │   logger.warning(e)                                               \n",
+       "   83 │   │   │   │   │   │   else:                                                               \n",
+       " 84 │   │   │   │   │   │   │   raise e                                                         \n",
+       "   85 │   │   │   │   │   finally:                                                                \n",
+       "   86 │   │   │   │   │   │   if pbar:                                                            \n",
+       "   87 │   │   │   │   │   │   │   pbar.update(1)                                                  \n",
+       "                                                                                                  \n",
+       " /data/source/langchain/langchain/document_loaders/directory.py:78 in load                        \n",
+       "                                                                                                  \n",
+       "   75 │   │   │   if i.is_file():                                                                 \n",
+       "   76 │   │   │   │   if _is_visible(i.relative_to(p)) or self.load_hidden:                       \n",
+       "   77 │   │   │   │   │   try:                                                                    \n",
+       " 78 │   │   │   │   │   │   sub_docs = self.loader_cls(str(i), **self.loader_kwargs).load()     \n",
+       "   79 │   │   │   │   │   │   docs.extend(sub_docs)                                               \n",
+       "   80 │   │   │   │   │   except Exception as e:                                                  \n",
+       "   81 │   │   │   │   │   │   if self.silent_errors:                                              \n",
+       "                                                                                                  \n",
+       " /data/source/langchain/langchain/document_loaders/text.py:44 in load                             \n",
+       "                                                                                                  \n",
+       "   41 │   │   │   │   │   │   except UnicodeDecodeError:                                          \n",
+       "   42 │   │   │   │   │   │   │   continue                                                        \n",
+       "   43 │   │   │   │   else:                                                                       \n",
+       " 44 │   │   │   │   │   raise RuntimeError(f\"Error loading {self.file_path}\") from e            \n",
+       "   45 │   │   │   except Exception as e:                                                          \n",
+       "   46 │   │   │   │   raise RuntimeError(f\"Error loading {self.file_path}\") from e                \n",
+       "   47                                                                                             \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "RuntimeError: Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m29\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m26 \u001b[0m\u001b[2m│ │ \u001b[0mtext = \u001b[33m\"\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m27 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mopen\u001b[0m(\u001b[96mself\u001b[0m.file_path, encoding=\u001b[96mself\u001b[0m.encoding) \u001b[94mas\u001b[0m f: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m28 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m29 \u001b[2m│ │ │ │ \u001b[0mtext = f.read() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m30 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m31 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.autodetect_encoding: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m32 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mdetected_encodings = \u001b[96mself\u001b[0m.detect_file_encodings() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/home/spike/.pyenv/versions/3.9.11/lib/python3.9/\u001b[0m\u001b[1;33mcodecs.py\u001b[0m:\u001b[94m322\u001b[0m in \u001b[92mdecode\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 319 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mdecode\u001b[0m(\u001b[96mself\u001b[0m, \u001b[96minput\u001b[0m, final=\u001b[94mFalse\u001b[0m): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 320 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# decode input (taking the buffer into account)\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 321 \u001b[0m\u001b[2m│ │ \u001b[0mdata = \u001b[96mself\u001b[0m.buffer + \u001b[96minput\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 322 \u001b[2m│ │ \u001b[0m(result, consumed) = \u001b[96mself\u001b[0m._buffer_decode(data, \u001b[96mself\u001b[0m.errors, final) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 323 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# keep undecoded input until the next call\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 324 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.buffer = data[consumed:] \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 325 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m result \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mUnicodeDecodeError: \u001b[0m\u001b[32m'utf-8'\u001b[0m codec can't decode byte \u001b[1;36m0xca\u001b[0m in position \u001b[1;36m0\u001b[0m: invalid continuation byte\n", + "\n", + "\u001b[3mThe above exception was the direct cause of the following 
exception:\u001b[0m\n", + "\n", + "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m1\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 loader.load() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m84\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m82 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mlogger.warning(e) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m83 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m84 \u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m e \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m85 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mfinally\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m86 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m pbar: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m87 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mpbar.update(\u001b[94m1\u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m78\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m75 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m i.is_file(): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m76 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m _is_visible(i.relative_to(p)) \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m.load_hidden: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m77 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m78 \u001b[2m│ │ │ │ │ │ \u001b[0msub_docs = \u001b[96mself\u001b[0m.loader_cls(\u001b[96mstr\u001b[0m(i), **\u001b[96mself\u001b[0m.loader_kwargs).load() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m79 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mdocs.extend(sub_docs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m80 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m44\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m 
\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m41 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m42 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mcontinue\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m43 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m44 \u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m45 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m46 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m47 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mRuntimeError: \u001b[0mError loading ..\u001b[35m/../../../../tests/integration_tests/examples/\u001b[0m\u001b[95mexample-non-utf8.txt\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "loader.load()" ] }, + { + "cell_type": "markdown", + "id": "da554f9a", + "metadata": {}, + "source": [ + "The file `example-non-utf8.txt` uses a different encoding, so the `load()` function fails with a helpful message indicating which file failed decoding. \n", + "\n", + "With the default behavior of `TextLoader`, a failure to load any one of the documents fails the whole loading process, and no documents are loaded. " ] }, + { + "cell_type": "markdown", + "id": "eb844b7e", + "metadata": {}, + "source": [ + "### B. Silent fail\n", + "\n", + "We can pass the parameter `silent_errors` to the `DirectoryLoader` to skip the files that could not be loaded and continue the load process." ] }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5314ec39", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n" ] } ], + "source": [ + "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, silent_errors=True)\n", + "docs = loader.load()" ] }, + { + "cell_type": "code", + "execution_count": 35, + "id": "216337e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n", + " '../../../../../tests/integration_tests/examples/example-utf8.txt']" ] }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" } ], + "source": [ + "doc_sources = [doc.metadata['source'] for doc in docs]\n", + "doc_sources" ] }, + { + "cell_type": "markdown", + "id": "4cba0e53", + "metadata": {}, + "source": [ + "### C. 
Auto detect encodings\n", + "\n", + "We can also ask `TextLoader` to auto detect the file encoding before failing, by passing the `autodetect_encoding` parameter to the loader class." ] }, { "cell_type": "code", + "execution_count": 37, + "id": "96d527d2", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "text_loader_kwargs={'autodetect_encoding': True}\n", + "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n", + "docs = loader.load()\n" ] }, + { + "cell_type": "code", + "execution_count": 38, + "id": "f1a136a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../../../../../tests/integration_tests/examples/example-non-utf8.txt',\n", + " '../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n", + " '../../../../../tests/integration_tests/examples/example-utf8.txt']" ] }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" } ], + "source": [ + "doc_sources = [doc.metadata['source'] for doc in docs]\n", + "doc_sources" ] } ], "metadata": { @@ -284,7 +563,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.11" } }, "nbformat": 4, diff --git a/langchain/document_loaders/helpers.py b/langchain/document_loaders/helpers.py new file mode 100644 index 00000000..3ccf4f7d --- /dev/null +++ b/langchain/document_loaders/helpers.py @@ -0,0 +1,37 @@ +"""Document loader helpers.""" + +import concurrent.futures +from typing import List, NamedTuple, Optional, cast + + +class FileEncoding(NamedTuple): + encoding: Optional[str] + confidence: float + language: Optional[str] + + +def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]: + """Try to detect the file encoding. + + Returns a list of `FileEncoding` tuples with the detected encodings ordered + by confidence. + """ + import chardet + + def read_and_detect(file_path: str) -> List[dict]: + with open(file_path, "rb") as f: + rawdata = f.read() + return cast(List[dict], chardet.detect_all(rawdata)) + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(read_and_detect, file_path) + try: + encodings = future.result(timeout=timeout) + except concurrent.futures.TimeoutError: + raise TimeoutError( + f"Timeout reached while detecting encoding for {file_path}" + ) + + if all(encoding["encoding"] is None for encoding in encodings): + raise RuntimeError(f"Could not detect encoding for {file_path}") + return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None] diff --git a/langchain/document_loaders/text.py b/langchain/document_loaders/text.py index ce7913d6..2b48115d 100644 --- a/langchain/document_loaders/text.py +++ b/langchain/document_loaders/text.py @@ -1,20 +1,59 @@ +import logging from typing import List, Optional from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings + +logger = logging.getLogger(__name__) class TextLoader(BaseLoader): - """Load text files.""" + """Load text files. + + + Args: + file_path: Path to the file to load. - def __init__(self, file_path: str, encoding: Optional[str] = None): + encoding: File encoding to use. If `None`, the file will be loaded + with the default system encoding. + + autodetect_encoding: Whether to try to autodetect the file encoding + if the specified encoding fails. 
+ """ + + def __init__( + self, + file_path: str, + encoding: Optional[str] = None, + autodetect_encoding: bool = False, + ): """Initialize with file path.""" self.file_path = file_path self.encoding = encoding + self.autodetect_encoding = autodetect_encoding def load(self) -> List[Document]: """Load from file path.""" - with open(self.file_path, encoding=self.encoding) as f: - text = f.read() + text = "" + try: + with open(self.file_path, encoding=self.encoding) as f: + text = f.read() + except UnicodeDecodeError as e: + if self.autodetect_encoding: + detected_encodings = detect_file_encodings(self.file_path) + for encoding in detected_encodings: + logger.debug("Trying encoding: %s", encoding.encoding) + try: + with open(self.file_path, encoding=encoding.encoding) as f: + text = f.read() + break + except UnicodeDecodeError: + continue + else: + raise RuntimeError(f"Error loading {self.file_path}") from e + except Exception as e: + raise RuntimeError(f"Error loading {self.file_path}") from e + metadata = {"source": self.file_path} return [Document(page_content=text, metadata=metadata)] diff --git a/poetry.lock b/poetry.lock index cb324da1..afef457d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1022,6 +1022,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "chardet" +version = "5.1.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"}, + {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"}, +] [[package]] name = "charset-normalizer" version = "3.1.0" @@ -1510,13 +1522,13 @@ files = [ ] [[package]] name = "deeplake" -version = "3.4.4" +version = "3.5.0" description = "Activeloop Deep Lake" category = "main" optional = false python-versions = "*" files = [ - {file = "deeplake-3.4.4.tar.gz", hash = "sha256:7d044a89862fe6aa4a93abc7db90090be7ecfc611169916e3975e0cd187be1ac"}, + {file = "deeplake-3.5.0.tar.gz", hash = "sha256:ae640f75b1fec4eed9598a4e8d6b80e7243af87d9f39f0d34f01ce2c6f7c194f"}, ] [package.dependencies] @@ -3138,13 +3150,13 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec [[package]] name = "jcloud" -version = "0.2.8" +version = "0.2.9" description = "Simplify deploying and managing Jina projects on Jina Cloud" category = "main" optional = true python-versions = "*" files = [ - {file = "jcloud-0.2.8.tar.gz", hash = "sha256:cdd30c85c0a857573651ebc329f52a8de9e43c3e0f276dc85975914006295639"}, + {file = "jcloud-0.2.9.tar.gz", hash = "sha256:eaf8da685f8907e153ff752e6a4b945aeff548b8d94dfd650a035d8450ed547e"}, ] [package.dependencies] @@ -3297,14 +3309,14 @@ websockets = ["websockets"] [[package]] name = "jina-hubble-sdk" -version = "0.36.0" +version = "0.37.1" description = "SDK for Hubble API at Jina AI." 
category = "main" optional = true python-versions = ">=3.7.0" files = [ - {file = "jina-hubble-sdk-0.36.0.tar.gz", hash = "sha256:ba1a72c7a5c14963fdad9af1ff4c3bba26a03ddcced08111bd11a95b153249ec"}, - {file = "jina_hubble_sdk-0.36.0-py3-none-any.whl", hash = "sha256:56db142147d7c72142ed1b6505020c3b4d8070b1603d07ff796e56d491dde294"}, + {file = "jina-hubble-sdk-0.37.1.tar.gz", hash = "sha256:5b6bd9e13f97c8c77be822e9ae49f87a0f16ef8195011f25db2552006a5ca2a0"}, + {file = "jina_hubble_sdk-0.37.1-py3-none-any.whl", hash = "sha256:bc54a60ed120508e231fbed28b0fff394c288312d2ffa865c7865c03dbbbb502"}, ] [package.dependencies] @@ -6608,6 +6620,7 @@ files = [ {file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"}, {file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"}, {file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"}, + {file = "pylance-0.4.12-cp38-abi3-win_amd64.whl", hash = "sha256:9616931c5300030adb9626d22515710a127d1e46a46737a7a0f980b52f13627c"}, ] [package.dependencies] @@ -9367,6 +9380,18 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2 doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +[[package]] +name = "types-chardet" +version = "5.0.4.6" +description = "Typing stubs for chardet" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "types-chardet-5.0.4.6.tar.gz", hash = "sha256:caf4c74cd13ccfd8b3313c314aba943b159de562a2573ed03137402b2bb37818"}, + {file = "types_chardet-5.0.4.6-py3-none-any.whl", hash = "sha256:ea832d87e798abf1e4dfc73767807c2b7fee35d0003ae90348aea4ae00fb004d"}, +] + [[package]] name = "types-pyopenssl" version = "23.1.0.3" @@ -9762,14 +9787,14 @@ files = [ [[package]] name = "weaviate-client" -version = "3.18.0" +version = "3.19.1" description = "A python native weaviate client" category = "main" optional = false python-versions = ">=3.8" files = [ - {file = "weaviate-client-3.18.0.tar.gz", hash = "sha256:423a526518a32505c5293328e5f252e6cbbf20e4b3124733f70d10fc0d6823c9"}, - {file = "weaviate_client-3.18.0-py3-none-any.whl", hash = "sha256:42b324286a4b4436317e5d2c6ba48c07da6cf01518efdd47ee097e7a8cc7584c"}, + {file = "weaviate-client-3.19.1.tar.gz", hash = "sha256:8528926b0b545225ab75583481d67cccf9494d2dc01cb62f1165a8f187b41ebb"}, + {file = "weaviate_client-3.19.1-py3-none-any.whl", hash = "sha256:6b4aae86cd955543fcc6328bc1462fbbae053dc50f7b6822287b05b98fec0d27"}, ] [package.dependencies] @@ -10309,14 +10334,15 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"] +extended-testing = 
["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "chardet", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"] hnswlib = ["docarray", "hnswlib", "protobuf"] in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] +text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "055d65314e800e0731b086471d357e5fb6bbd265ee9fa2bd3762470152cc3b85" +content-hash = "2b19b9deca7f83ca14af1f7bc7808bbe7873a91ce4c95381eaad8ea84fe04c0b" diff --git a/pyproject.toml b/pyproject.toml index 7c86d733..01642054 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,8 @@ gql = {version = "^3.4.1", optional = true} pandas = {version = "^2.0.1", optional = true} telethon = {version = "^1.28.5", optional = true} zep-python = {version="^0.25", optional=true} +chardet = {version="^5.1.0", optional=true} + [tool.poetry.group.docs.dependencies] autodoc_pydantic = "^1.8.0" @@ -156,6 +158,7 @@ ruff = "^0.0.249" types-toml = "^0.10.8.1" types-redis = "^4.3.21.6" black = "^23.1.0" +types-chardet = "^5.0.4.6" [tool.poetry.group.typing.dependencies] mypy = "^0.991" @@ -174,6 +177,7 @@ setuptools = "^67.6.1" llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] qdrant = ["qdrant-client"] openai = ["openai", "tiktoken"] +text_helpers = ["chardet"] cohere = ["cohere"] in_memory_store = ["docarray"] hnswlib = ["docarray", "protobuf", "hnswlib"] @@ -185,6 +189,7 @@ all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", " # merge-conflicts extended_testing = [ "beautifulsoup4", + "chardet", "jq", "pdfminer.six", "pypdf", diff --git a/tests/unit_tests/document_loaders/test_detect_encoding.py b/tests/unit_tests/document_loaders/test_detect_encoding.py new file mode 100644 index 00000000..5cee5cd8 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_detect_encoding.py @@ -0,0 +1,41 @@ +from pathlib import Path + +import pytest + +from langchain.document_loaders import DirectoryLoader, TextLoader +from langchain.document_loaders.helpers import detect_file_encodings + + +@pytest.mark.requires("chardet") +def test_loader_detect_encoding() -> None: + """Test text loader.""" + path = Path(__file__).parent.parent / "examples" + files = path.glob("**/*.txt") + loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader) + loader_detect_encoding = DirectoryLoader( + str(path), + glob="**/*.txt", + loader_kwargs={"autodetect_encoding": True}, + loader_cls=TextLoader, + ) + + with pytest.raises((UnicodeDecodeError, RuntimeError)): + loader.load() + + docs = loader_detect_encoding.load() + assert len(docs) == len(list(files)) + + +@pytest.mark.skip(reason="slow test") +@pytest.mark.requires("chardet") +def test_loader_detect_encoding_timeout(tmpdir: str) -> None: + path = Path(tmpdir) + file_path = str(path / "blob.txt") + # 2mb binary blob + with open(file_path, "wb") as f: + f.write(b"\x00" * 2_000_000) + + with pytest.raises(TimeoutError): + detect_file_encodings(file_path, timeout=1) + + detect_file_encodings(file_path, timeout=10) diff --git a/tests/unit_tests/examples/example-non-utf8.txt b/tests/unit_tests/examples/example-non-utf8.txt new file mode 100644 index 00000000..60cbb207 --- /dev/null +++ b/tests/unit_tests/examples/example-non-utf8.txt @@ -0,0 +1 @@ +- diff --git 
a/tests/unit_tests/examples/example-utf8.txt b/tests/unit_tests/examples/example-utf8.txt new file mode 100644 index 00000000..1bb51996 --- /dev/null +++ b/tests/unit_tests/examples/example-utf8.txt @@ -0,0 +1,6 @@ +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum.