From e46202829f30cf03ff25254adccef06184ffdcba Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 18 May 2023 09:55:14 -0400 Subject: [PATCH] feat #4479: TextLoader auto detect encoding and improved exceptions (#4927) # TextLoader auto detect encoding and enhanced exception handling - Add an option to enable encoding detection on `TextLoader`. - The detection is done using `chardet`. - The loading is done by trying all detected encodings in order of confidence, raising an exception if none of them succeeds. ### New Dependencies: - `chardet` Fixes #4479 ## Before submitting ## Who can review? Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: - @eyurtsev --------- Co-authored-by: blob42 --- .../examples/file_directory.ipynb | 293 +++++++++++++++++- langchain/document_loaders/helpers.py | 37 +++ langchain/document_loaders/text.py | 47 ++- poetry.lock | 50 ++- pyproject.toml | 5 + .../document_loaders/test_detect_encoding.py | 41 +++ .../unit_tests/examples/example-non-utf8.txt | 1 + tests/unit_tests/examples/example-utf8.txt | 6 + 8 files changed, 457 insertions(+), 23 deletions(-) create mode 100644 langchain/document_loaders/helpers.py create mode 100644 tests/unit_tests/document_loaders/test_detect_encoding.py create mode 100644 tests/unit_tests/examples/example-non-utf8.txt create mode 100644 tests/unit_tests/examples/example-utf8.txt diff --git a/docs/modules/indexes/document_loaders/examples/file_directory.ipynb b/docs/modules/indexes/document_loaders/examples/file_directory.ipynb index 996f8f9d..93b70aad 100644 --- a/docs/modules/indexes/document_loaders/examples/file_directory.ipynb +++ b/docs/modules/indexes/document_loaders/examples/file_directory.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "019d8520", "metadata": {}, "outputs": [], @@ -151,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "id": "81c92da3", "metadata": {}, "outputs": [], @@ -210,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "id": "c558bd73", "metadata": {}, "outputs": [], @@ -259,13 +259,292 @@ "len(docs)" ] }, + { + "cell_type": "markdown", + "id": "6411a0cb", + "metadata": {}, + "source": [ + "## Auto detect file encodings with TextLoader\n", + "\n", + "In this example we will see some strategies that can be useful when loading a large number of arbitrary files from a directory using the `TextLoader` class.\n", + "\n", + "First, to illustrate the problem, let's try to load multiple text files with arbitrary encodings." ] }, { "cell_type": "code", - "execution_count": null, - "id": "6a91a0bc", + "execution_count": 16, + "id": "2c787a69", + "metadata": {}, + "outputs": [], + "source": [ + "path = '../../../../../tests/integration_tests/examples'\n", + "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader)" ] }, + { + "cell_type": "markdown", + "id": "e9001e12", + "metadata": {}, + "source": [ + "### A. Default Behavior" ] }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b1e88c31", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " /data/source/langchain/langchain/document_loaders/text.py:29 in load                             \n",
+       "                                                                                                  \n",
+       "   26 │   │   text = \"\"                                                                           \n",
+       "   27 │   │   with open(self.file_path, encoding=self.encoding) as f:                             \n",
+       "   28 │   │   │   try:                                                                            \n",
+       " 29 │   │   │   │   text = f.read()                                                             \n",
+       "   30 │   │   │   except UnicodeDecodeError as e:                                                 \n",
+       "   31 │   │   │   │   if self.autodetect_encoding:                                                \n",
+       "   32 │   │   │   │   │   detected_encodings = self.detect_file_encodings()                       \n",
+       "                                                                                                  \n",
+       " /home/spike/.pyenv/versions/3.9.11/lib/python3.9/codecs.py:322 in decode                         \n",
+       "                                                                                                  \n",
+       "    319 def decode(self, input, final=False):                                                 \n",
+       "    320 │   │   # decode input (taking the buffer into account)                                   \n",
+       "    321 │   │   data = self.buffer + input                                                        \n",
+       "  322 │   │   (result, consumed) = self._buffer_decode(data, self.errors, final)                \n",
+       "    323 │   │   # keep undecoded input until the next call                                        \n",
+       "    324 │   │   self.buffer = data[consumed:]                                                     \n",
+       "    325 │   │   return result                                                                     \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xca in position 0: invalid continuation byte\n",
+       "\n",
+       "The above exception was the direct cause of the following exception:\n",
+       "\n",
+       "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:1                                                                                    \n",
+       "                                                                                                  \n",
+       " 1 loader.load()                                                                                \n",
+       "   2                                                                                              \n",
+       "                                                                                                  \n",
+       " /data/source/langchain/langchain/document_loaders/directory.py:84 in load                        \n",
+       "                                                                                                  \n",
+       "   81 │   │   │   │   │   │   if self.silent_errors:                                              \n",
+       "   82 │   │   │   │   │   │   │   logger.warning(e)                                               \n",
+       "   83 │   │   │   │   │   │   else:                                                               \n",
+       " 84 │   │   │   │   │   │   │   raise e                                                         \n",
+       "   85 │   │   │   │   │   finally:                                                                \n",
+       "   86 │   │   │   │   │   │   if pbar:                                                            \n",
+       "   87 │   │   │   │   │   │   │   pbar.update(1)                                                  \n",
+       "                                                                                                  \n",
+       " /data/source/langchain/langchain/document_loaders/directory.py:78 in load                        \n",
+       "                                                                                                  \n",
+       "   75 │   │   │   if i.is_file():                                                                 \n",
+       "   76 │   │   │   │   if _is_visible(i.relative_to(p)) or self.load_hidden:                       \n",
+       "   77 │   │   │   │   │   try:                                                                    \n",
+       " 78 │   │   │   │   │   │   sub_docs = self.loader_cls(str(i), **self.loader_kwargs).load()     \n",
+       "   79 │   │   │   │   │   │   docs.extend(sub_docs)                                               \n",
+       "   80 │   │   │   │   │   except Exception as e:                                                  \n",
+       "   81 │   │   │   │   │   │   if self.silent_errors:                                              \n",
+       "                                                                                                  \n",
+       " /data/source/langchain/langchain/document_loaders/text.py:44 in load                             \n",
+       "                                                                                                  \n",
+       "   41 │   │   │   │   │   │   except UnicodeDecodeError:                                          \n",
+       "   42 │   │   │   │   │   │   │   continue                                                        \n",
+       "   43 │   │   │   │   else:                                                                       \n",
+       " 44 │   │   │   │   │   raise RuntimeError(f\"Error loading {self.file_path}\") from e            \n",
+       "   45 │   │   │   except Exception as e:                                                          \n",
+       "   46 │   │   │   │   raise RuntimeError(f\"Error loading {self.file_path}\") from e                \n",
+       "   47                                                                                             \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "RuntimeError: Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m29\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m26 \u001b[0m\u001b[2m│ │ \u001b[0mtext = \u001b[33m\"\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m27 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mopen\u001b[0m(\u001b[96mself\u001b[0m.file_path, encoding=\u001b[96mself\u001b[0m.encoding) \u001b[94mas\u001b[0m f: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m28 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m29 \u001b[2m│ │ │ │ \u001b[0mtext = f.read() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m30 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m31 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.autodetect_encoding: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m32 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mdetected_encodings = \u001b[96mself\u001b[0m.detect_file_encodings() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/home/spike/.pyenv/versions/3.9.11/lib/python3.9/\u001b[0m\u001b[1;33mcodecs.py\u001b[0m:\u001b[94m322\u001b[0m in \u001b[92mdecode\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 319 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mdecode\u001b[0m(\u001b[96mself\u001b[0m, \u001b[96minput\u001b[0m, final=\u001b[94mFalse\u001b[0m): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 320 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# decode input (taking the buffer into account)\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 321 \u001b[0m\u001b[2m│ │ \u001b[0mdata = \u001b[96mself\u001b[0m.buffer + \u001b[96minput\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 322 \u001b[2m│ │ \u001b[0m(result, consumed) = \u001b[96mself\u001b[0m._buffer_decode(data, \u001b[96mself\u001b[0m.errors, final) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 323 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# keep undecoded input until the next call\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 324 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.buffer = data[consumed:] \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 325 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m result \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mUnicodeDecodeError: \u001b[0m\u001b[32m'utf-8'\u001b[0m codec can't decode byte \u001b[1;36m0xca\u001b[0m in position \u001b[1;36m0\u001b[0m: invalid continuation byte\n", + "\n", + "\u001b[3mThe above exception was the direct cause of the following 
exception:\u001b[0m\n", + "\n", + "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m1\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 loader.load() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m84\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m82 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mlogger.warning(e) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m83 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m84 \u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m e \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m85 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mfinally\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m86 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m pbar: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m87 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0mpbar.update(\u001b[94m1\u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mdirectory.py\u001b[0m:\u001b[94m78\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m75 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m i.is_file(): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m76 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m _is_visible(i.relative_to(p)) \u001b[95mor\u001b[0m \u001b[96mself\u001b[0m.load_hidden: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m77 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m78 \u001b[2m│ │ │ │ │ │ \u001b[0msub_docs = \u001b[96mself\u001b[0m.loader_cls(\u001b[96mstr\u001b[0m(i), **\u001b[96mself\u001b[0m.loader_kwargs).load() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m79 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mdocs.extend(sub_docs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m80 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m81 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.silent_errors: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/data/source/langchain/langchain/document_loaders/\u001b[0m\u001b[1;33mtext.py\u001b[0m:\u001b[94m44\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m 
\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m41 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mUnicodeDecodeError\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m42 \u001b[0m\u001b[2m│ │ │ │ │ │ │ \u001b[0m\u001b[94mcontinue\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m43 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m44 \u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m45 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m46 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mError loading \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.file_path\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m \u001b[4;96me\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m47 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mRuntimeError: \u001b[0mError loading ..\u001b[35m/../../../../tests/integration_tests/examples/\u001b[0m\u001b[95mexample-non-utf8.txt\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "loader.load()" ] }, + { + "cell_type": "markdown", + "id": "da554f9a", + "metadata": {}, + "source": [ + "The file `example-non-utf8.txt` uses a different encoding, so the `load()` function fails with a helpful message indicating which file failed decoding. \n", + "\n", + "With the default behavior of `TextLoader`, a failure to load any one of the documents fails the whole loading process, and no documents are loaded. " ] }, + { + "cell_type": "markdown", + "id": "eb844b7e", + "metadata": {}, + "source": [ + "### B. Silent fail\n", + "\n", + "We can pass the parameter `silent_errors` to the `DirectoryLoader` to skip the files that could not be loaded and continue the load process." ] }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5314ec39", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error loading ../../../../../tests/integration_tests/examples/example-non-utf8.txt\n" ] } ], + "source": [ + "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, silent_errors=True)\n", + "docs = loader.load()" ] }, + { + "cell_type": "code", + "execution_count": 35, + "id": "216337e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n", + " '../../../../../tests/integration_tests/examples/example-utf8.txt']" ] }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" } ], + "source": [ + "doc_sources = [doc.metadata['source'] for doc in docs]\n", + "doc_sources" ] }, + { + "cell_type": "markdown", + "id": "4cba0e53", + "metadata": {}, + "source": [ + "### C. 
Auto detect encodings\n", + "\n", + "We can also ask `TextLoader` to auto detect the file encoding before failing, by passing the `autodetect_encoding` parameter to the loader class." ] }, { "cell_type": "code", + "execution_count": 37, + "id": "96d527d2", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "text_loader_kwargs={'autodetect_encoding': True}\n", + "loader = DirectoryLoader(path, glob=\"**/*.txt\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n", + "docs = loader.load()\n" ] }, + { + "cell_type": "code", + "execution_count": 38, + "id": "f1a136a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../../../../../tests/integration_tests/examples/example-non-utf8.txt',\n", + " '../../../../../tests/integration_tests/examples/whatsapp_chat.txt',\n", + " '../../../../../tests/integration_tests/examples/example-utf8.txt']" ] }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" } ], + "source": [ + "doc_sources = [doc.metadata['source'] for doc in docs]\n", + "doc_sources" ] } ], "metadata": { @@ -284,7 +563,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.11" } }, "nbformat": 4, diff --git a/langchain/document_loaders/helpers.py b/langchain/document_loaders/helpers.py new file mode 100644 index 00000000..3ccf4f7d --- /dev/null +++ b/langchain/document_loaders/helpers.py @@ -0,0 +1,37 @@ +"""Document loader helpers.""" + +import concurrent.futures +from typing import List, NamedTuple, Optional, cast + + +class FileEncoding(NamedTuple): + encoding: Optional[str] + confidence: float + language: Optional[str] + + +def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]: + """Try to detect the file encoding. + + Returns a list of `FileEncoding` tuples with the detected encodings ordered + by confidence. + """ + import chardet + + def read_and_detect(file_path: str) -> List[dict]: + with open(file_path, "rb") as f: + rawdata = f.read() + return cast(List[dict], chardet.detect_all(rawdata)) + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(read_and_detect, file_path) + try: + encodings = future.result(timeout=timeout) + except concurrent.futures.TimeoutError: + raise TimeoutError( + f"Timeout reached while detecting encoding for {file_path}" + ) + + if all(encoding["encoding"] is None for encoding in encodings): + raise RuntimeError(f"Could not detect encoding for {file_path}") + return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None] diff --git a/langchain/document_loaders/text.py b/langchain/document_loaders/text.py index ce7913d6..2b48115d 100644 --- a/langchain/document_loaders/text.py +++ b/langchain/document_loaders/text.py @@ -1,20 +1,59 @@ +import logging from typing import List, Optional from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.helpers import detect_file_encodings + +logger = logging.getLogger(__name__) class TextLoader(BaseLoader): - """Load text files.""" + """Load text files. + + + Args: + file_path: Path to the file to load. - def __init__(self, file_path: str, encoding: Optional[str] = None): + encoding: File encoding to use. If `None`, the file will be loaded + with the default system encoding. + + autodetect_encoding: Whether to try to autodetect the file encoding + if the specified encoding fails. 
+ """ + + def __init__( + self, + file_path: str, + encoding: Optional[str] = None, + autodetect_encoding: bool = False, + ): """Initialize with file path.""" self.file_path = file_path self.encoding = encoding + self.autodetect_encoding = autodetect_encoding def load(self) -> List[Document]: """Load from file path.""" - with open(self.file_path, encoding=self.encoding) as f: - text = f.read() + text = "" + try: + with open(self.file_path, encoding=self.encoding) as f: + text = f.read() + except UnicodeDecodeError as e: + if self.autodetect_encoding: + detected_encodings = detect_file_encodings(self.file_path) + for encoding in detected_encodings: + logger.debug("Trying encoding: %s", encoding.encoding) + try: + with open(self.file_path, encoding=encoding.encoding) as f: + text = f.read() + break + except UnicodeDecodeError: + continue + else: + raise RuntimeError(f"Error loading {self.file_path}") from e + except Exception as e: + raise RuntimeError(f"Error loading {self.file_path}") from e + metadata = {"source": self.file_path} return [Document(page_content=text, metadata=metadata)] diff --git a/poetry.lock b/poetry.lock index cb324da1..afef457d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1022,6 +1022,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "chardet" +version = "5.1.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = true +python-versions = ">=3.7" +files = [ + {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"}, + {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"}, +] [[package]] name = "charset-normalizer" version = "3.1.0" @@ -1510,13 +1522,13 @@ files = [ ] [[package]] name = "deeplake" -version = "3.4.4" +version = "3.5.0" description = "Activeloop Deep Lake" category = "main" optional = false python-versions = "*" files = [ - {file = "deeplake-3.4.4.tar.gz", hash = "sha256:7d044a89862fe6aa4a93abc7db90090be7ecfc611169916e3975e0cd187be1ac"}, + {file = "deeplake-3.5.0.tar.gz", hash = "sha256:ae640f75b1fec4eed9598a4e8d6b80e7243af87d9f39f0d34f01ce2c6f7c194f"}, ] [package.dependencies] @@ -3138,13 +3150,13 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec [[package]] name = "jcloud" -version = "0.2.8" +version = "0.2.9" description = "Simplify deploying and managing Jina projects on Jina Cloud" category = "main" optional = true python-versions = "*" files = [ - {file = "jcloud-0.2.8.tar.gz", hash = "sha256:cdd30c85c0a857573651ebc329f52a8de9e43c3e0f276dc85975914006295639"}, + {file = "jcloud-0.2.9.tar.gz", hash = "sha256:eaf8da685f8907e153ff752e6a4b945aeff548b8d94dfd650a035d8450ed547e"}, ] [package.dependencies] @@ -3297,14 +3309,14 @@ websockets = ["websockets"] [[package]] name = "jina-hubble-sdk" -version = "0.36.0" +version = "0.37.1" description = "SDK for Hubble API at Jina AI." 
category = "main" optional = true python-versions = ">=3.7.0" files = [ - {file = "jina-hubble-sdk-0.36.0.tar.gz", hash = "sha256:ba1a72c7a5c14963fdad9af1ff4c3bba26a03ddcced08111bd11a95b153249ec"}, - {file = "jina_hubble_sdk-0.36.0-py3-none-any.whl", hash = "sha256:56db142147d7c72142ed1b6505020c3b4d8070b1603d07ff796e56d491dde294"}, + {file = "jina-hubble-sdk-0.37.1.tar.gz", hash = "sha256:5b6bd9e13f97c8c77be822e9ae49f87a0f16ef8195011f25db2552006a5ca2a0"}, + {file = "jina_hubble_sdk-0.37.1-py3-none-any.whl", hash = "sha256:bc54a60ed120508e231fbed28b0fff394c288312d2ffa865c7865c03dbbbb502"}, ] [package.dependencies] @@ -6608,6 +6620,7 @@ files = [ {file = "pylance-0.4.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:2b86fb8dccc03094c0db37bef0d91bda60e8eb0d1eddf245c6971450c8d8a53f"}, {file = "pylance-0.4.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bc82914b13204187d673b5f3d45f93219c38a0e9d0542ba251074f639669789"}, {file = "pylance-0.4.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a4bcce77f99ecd4cbebbadb01e58d5d8138d40eb56bdcdbc3b20b0475e7a472"}, + {file = "pylance-0.4.12-cp38-abi3-win_amd64.whl", hash = "sha256:9616931c5300030adb9626d22515710a127d1e46a46737a7a0f980b52f13627c"}, ] [package.dependencies] @@ -9367,6 +9380,18 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2 doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +[[package]] +name = "types-chardet" +version = "5.0.4.6" +description = "Typing stubs for chardet" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "types-chardet-5.0.4.6.tar.gz", hash = "sha256:caf4c74cd13ccfd8b3313c314aba943b159de562a2573ed03137402b2bb37818"}, + {file = "types_chardet-5.0.4.6-py3-none-any.whl", hash = "sha256:ea832d87e798abf1e4dfc73767807c2b7fee35d0003ae90348aea4ae00fb004d"}, +] + [[package]] name = "types-pyopenssl" version = "23.1.0.3" @@ -9762,14 +9787,14 @@ files = [ [[package]] name = "weaviate-client" -version = "3.18.0" +version = "3.19.1" description = "A python native weaviate client" category = "main" optional = false python-versions = ">=3.8" files = [ - {file = "weaviate-client-3.18.0.tar.gz", hash = "sha256:423a526518a32505c5293328e5f252e6cbbf20e4b3124733f70d10fc0d6823c9"}, - {file = "weaviate_client-3.18.0-py3-none-any.whl", hash = "sha256:42b324286a4b4436317e5d2c6ba48c07da6cf01518efdd47ee097e7a8cc7584c"}, + {file = "weaviate-client-3.19.1.tar.gz", hash = "sha256:8528926b0b545225ab75583481d67cccf9494d2dc01cb62f1165a8f187b41ebb"}, + {file = "weaviate_client-3.19.1-py3-none-any.whl", hash = "sha256:6b4aae86cd955543fcc6328bc1462fbbae053dc50f7b6822287b05b98fec0d27"}, ] [package.dependencies] @@ -10309,14 +10334,15 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"] +extended-testing = 
["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "chardet", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm", "zep-python"] hnswlib = ["docarray", "hnswlib", "protobuf"] in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] +text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "055d65314e800e0731b086471d357e5fb6bbd265ee9fa2bd3762470152cc3b85" +content-hash = "2b19b9deca7f83ca14af1f7bc7808bbe7873a91ce4c95381eaad8ea84fe04c0b" diff --git a/pyproject.toml b/pyproject.toml index 7c86d733..01642054 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,8 @@ gql = {version = "^3.4.1", optional = true} pandas = {version = "^2.0.1", optional = true} telethon = {version = "^1.28.5", optional = true} zep-python = {version="^0.25", optional=true} +chardet = {version="^5.1.0", optional=true} + [tool.poetry.group.docs.dependencies] autodoc_pydantic = "^1.8.0" @@ -156,6 +158,7 @@ ruff = "^0.0.249" types-toml = "^0.10.8.1" types-redis = "^4.3.21.6" black = "^23.1.0" +types-chardet = "^5.0.4.6" [tool.poetry.group.typing.dependencies] mypy = "^0.991" @@ -174,6 +177,7 @@ setuptools = "^67.6.1" llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] qdrant = ["qdrant-client"] openai = ["openai", "tiktoken"] +text_helpers = ["chardet"] cohere = ["cohere"] in_memory_store = ["docarray"] hnswlib = ["docarray", "protobuf", "hnswlib"] @@ -185,6 +189,7 @@ all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", " # merge-conflicts extended_testing = [ "beautifulsoup4", + "chardet", "jq", "pdfminer.six", "pypdf", diff --git a/tests/unit_tests/document_loaders/test_detect_encoding.py b/tests/unit_tests/document_loaders/test_detect_encoding.py new file mode 100644 index 00000000..5cee5cd8 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_detect_encoding.py @@ -0,0 +1,41 @@ +from pathlib import Path + +import pytest + +from langchain.document_loaders import DirectoryLoader, TextLoader +from langchain.document_loaders.helpers import detect_file_encodings + + +@pytest.mark.requires("chardet") +def test_loader_detect_encoding() -> None: + """Test text loader.""" + path = Path(__file__).parent.parent / "examples" + files = path.glob("**/*.txt") + loader = DirectoryLoader(str(path), glob="**/*.txt", loader_cls=TextLoader) + loader_detect_encoding = DirectoryLoader( + str(path), + glob="**/*.txt", + loader_kwargs={"autodetect_encoding": True}, + loader_cls=TextLoader, + ) + + with pytest.raises((UnicodeDecodeError, RuntimeError)): + loader.load() + + docs = loader_detect_encoding.load() + assert len(docs) == len(list(files)) + + +@pytest.mark.skip(reason="slow test") +@pytest.mark.requires("chardet") +def test_loader_detect_encoding_timeout(tmpdir: str) -> None: + path = Path(tmpdir) + file_path = str(path / "blob.txt") + # 2mb binary blob + with open(file_path, "wb") as f: + f.write(b"\x00" * 2_000_000) + + with pytest.raises(TimeoutError): + detect_file_encodings(file_path, timeout=1) + + detect_file_encodings(file_path, timeout=10) diff --git a/tests/unit_tests/examples/example-non-utf8.txt b/tests/unit_tests/examples/example-non-utf8.txt new file mode 100644 index 00000000..60cbb207 --- /dev/null +++ b/tests/unit_tests/examples/example-non-utf8.txt @@ -0,0 +1 @@ +- diff --git 
a/tests/unit_tests/examples/example-utf8.txt b/tests/unit_tests/examples/example-utf8.txt new file mode 100644 index 00000000..1bb51996 --- /dev/null +++ b/tests/unit_tests/examples/example-utf8.txt @@ -0,0 +1,6 @@ +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum.