Add PythonLoader which auto-detects encoding of Python files (#3311)

This PR contributes a `PythonLoader`, which inherits from `TextLoader` but detects and sets the encoding automatically.
1 year ago · aa9d5707e0
parent 1ecbeec24e
commit aa9d5707e0
7 changed files with 104 additions and 3 deletions
--- a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb
@@ -11,7 +11,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
   "id": "019d8520",
   "metadata": {},
   "outputs": [],
@@ -128,10 +128,69 @@
    "len(docs)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "598a2805",
+   "metadata": {},
+   "source": [
+    "If you need to load Python source code files, use the `PythonLoader`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c558bd73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import PythonLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "a3cfaba7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = DirectoryLoader('../../../../../', glob=\"**/*.py\", loader_cls=PythonLoader)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "e2e1e26a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "ffb8ff36",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "691"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(docs)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "984c8429",
+   "id": "7f6e0eae",
   "metadata": {},
   "outputs": [],
   "source": []
@@ -153,7 +212,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.3"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -55,6 +55,7 @@ from langchain.document_loaders.pdf import (
    UnstructuredPDFLoader,
 )
 from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
+from langchain.document_loaders.python import PythonLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.rtf import UnstructuredRTFLoader
@@ -156,4 +157,5 @@ __all__ = [
    "ImageCaptionLoader",
    "DiscordChatLoader",
    "ConfluenceLoader",
+    "PythonLoader",
 ]
--- a/langchain/document_loaders/python.py
+++ b/langchain/document_loaders/python.py
@@ -0,0 +1,14 @@
+import tokenize
+
+from langchain.document_loaders.text import TextLoader
+
+
+class PythonLoader(TextLoader):
+    """
+    Load Python files, respecting any non-default encoding if specified.
+    """
+
+    def __init__(self, file_path: str):
+        with open(file_path, "rb") as f:
+            encoding, _ = tokenize.detect_encoding(f.readline)
+        super().__init__(file_path=file_path, encoding=encoding)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -148,6 +148,9 @@ select = [
  "F",  # pyflakes
  "I",  # isort
 ]
+exclude = [
+  "tests/integration_tests/examples/non-utf8-encoding.py",
+]

 [tool.mypy]
 ignore_missing_imports = "True"
--- a/tests/integration_tests/document_loaders/test_python.py
+++ b/tests/integration_tests/document_loaders/test_python.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.python import PythonLoader
+
+
+@pytest.mark.parametrize("filename", ["default-encoding.py", "non-utf8-encoding.py"])
+def test_python_loader(filename: str) -> None:
+    """Test Python loader."""
+    file_path = Path(__file__).parent.parent / "examples" / filename
+    loader = PythonLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    metadata = docs[0].metadata
+
+    assert metadata["source"] == str(file_path)
--- a/tests/integration_tests/examples/default-encoding.py
+++ b/tests/integration_tests/examples/default-encoding.py
@@ -0,0 +1 @@
+u = "🦜🔗"
--- a/tests/integration_tests/examples/non-utf8-encoding.py
+++ b/tests/integration_tests/examples/non-utf8-encoding.py
@@ -0,0 +1,3 @@
+# coding: iso-8859-5
+# ±¶ÿàáâãäåæçèéêëìíîï <- Cyrillic characters
+u = "®âðÄ"