From aa9d5707e07b40fd3e1d0e10d57ec4196cd79c34 Mon Sep 17 00:00:00 2001
From: Paul Garner <ego@anentropic.com>
Date: Fri, 21 Apr 2023 18:47:57 +0100
Subject: [PATCH] Add PythonLoader which auto-detects encoding of Python files
 (#3311)

This PR contributes a `PythonLoader`, which inherits from
`TextLoader` but detects and sets the encoding automatically.
---
 .../examples/directory_loader.ipynb           | 65 ++++++++++++++++++-
 langchain/document_loaders/__init__.py        |  2 +
 langchain/document_loaders/python.py          | 14 ++++
 pyproject.toml                                |  3 +
 .../document_loaders/test_python.py           | 19 ++++++
 .../examples/default-encoding.py              |  1 +
 .../examples/non-utf8-encoding.py             |  3 +
 7 files changed, 104 insertions(+), 3 deletions(-)
 create mode 100644 langchain/document_loaders/python.py
 create mode 100644 tests/integration_tests/document_loaders/test_python.py
 create mode 100644 tests/integration_tests/examples/default-encoding.py
 create mode 100644 tests/integration_tests/examples/non-utf8-encoding.py

diff --git a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb
index 40653655..7a9b4e6f 100644
--- a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "id": "019d8520",
    "metadata": {},
    "outputs": [],
@@ -128,10 +128,69 @@
     "len(docs)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "598a2805",
+   "metadata": {},
+   "source": [
+    "If you need to load Python source code files, use the `PythonLoader`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c558bd73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import PythonLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "a3cfaba7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = DirectoryLoader('../../../../../', glob=\"**/*.py\", loader_cls=PythonLoader)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "e2e1e26a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "ffb8ff36",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "691"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(docs)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "984c8429",
+   "id": "7f6e0eae",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -153,7 +212,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.3"
   }
  },
  "nbformat": 4,
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index c4cc7448..3d0c4295 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -55,6 +55,7 @@ from langchain.document_loaders.pdf import (
     UnstructuredPDFLoader,
 )
 from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
+from langchain.document_loaders.python import PythonLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.rtf import UnstructuredRTFLoader
@@ -156,4 +157,5 @@ __all__ = [
     "ImageCaptionLoader",
     "DiscordChatLoader",
     "ConfluenceLoader",
+    "PythonLoader",
 ]
diff --git a/langchain/document_loaders/python.py b/langchain/document_loaders/python.py
new file mode 100644
index 00000000..65487323
--- /dev/null
+++ b/langchain/document_loaders/python.py
@@ -0,0 +1,14 @@
+import tokenize
+
+from langchain.document_loaders.text import TextLoader
+
+
+class PythonLoader(TextLoader):
+    """Load a Python source file, decoding it with the encoding that
+    ``tokenize.detect_encoding`` reports for that file.
+    """
+
+    def __init__(self, file_path: str):
+        with open(file_path, "rb") as source:  # detect_encoding needs bytes
+            detected = tokenize.detect_encoding(source.readline)[0]
+        super().__init__(file_path=file_path, encoding=detected)
diff --git a/pyproject.toml b/pyproject.toml
index 03bd5b8b..33944c9f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -148,6 +148,9 @@ select = [
   "F",  # pyflakes
   "I",  # isort
 ]
+exclude = [
+  "tests/integration_tests/examples/non-utf8-encoding.py",
+]
 
 [tool.mypy]
 ignore_missing_imports = "True"
diff --git a/tests/integration_tests/document_loaders/test_python.py b/tests/integration_tests/document_loaders/test_python.py
new file mode 100644
index 00000000..f4b2b3ae
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_python.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.python import PythonLoader
+
+
+@pytest.mark.parametrize("filename", ["default-encoding.py", "non-utf8-encoding.py"])
+def test_python_loader(filename: str) -> None:
+    """Load each example file and check one document with the right source."""
+    examples_dir = Path(__file__).parent.parent / "examples"
+    expected_source = str(examples_dir / filename)
+
+    documents = PythonLoader(expected_source).load()
+
+    # One file in, exactly one document out.
+    assert len(documents) == 1
+    # The document's metadata must carry the path the loader was given.
+    assert documents[0].metadata["source"] == expected_source
diff --git a/tests/integration_tests/examples/default-encoding.py b/tests/integration_tests/examples/default-encoding.py
new file mode 100644
index 00000000..9a09cc82
--- /dev/null
+++ b/tests/integration_tests/examples/default-encoding.py
@@ -0,0 +1 @@
+u = "ðŸ¦œðŸ”—"
diff --git a/tests/integration_tests/examples/non-utf8-encoding.py b/tests/integration_tests/examples/non-utf8-encoding.py
new file mode 100644
index 00000000..e00f46c5
--- /dev/null
+++ b/tests/integration_tests/examples/non-utf8-encoding.py
@@ -0,0 +1,3 @@
+# coding: iso-8859-5
+# ±¶ÿàáâãäåæçèéêëìíîï <- Cyrillic characters
+u = "®âðÄ"