From aa9d5707e07b40fd3e1d0e10d57ec4196cd79c34 Mon Sep 17 00:00:00 2001 From: Paul Garner Date: Fri, 21 Apr 2023 18:47:57 +0100 Subject: [PATCH] Add PythonLoader which auto-detects encoding of Python files (#3311) This PR contributes a `PythonLoader`, which inherits from `TextLoader` but detects and sets the encoding automatically. --- .../examples/directory_loader.ipynb | 65 ++++++++++++++++++- langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/python.py | 14 ++++ pyproject.toml | 3 + .../document_loaders/test_python.py | 19 ++++++ .../examples/default-encoding.py | 1 + .../examples/non-utf8-encoding.py | 3 + 7 files changed, 104 insertions(+), 3 deletions(-) create mode 100644 langchain/document_loaders/python.py create mode 100644 tests/integration_tests/document_loaders/test_python.py create mode 100644 tests/integration_tests/examples/default-encoding.py create mode 100644 tests/integration_tests/examples/non-utf8-encoding.py diff --git a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb index 40653655..7a9b4e6f 100644 --- a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb +++ b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "019d8520", "metadata": {}, "outputs": [], @@ -128,10 +128,69 @@ "len(docs)" ] }, + { + "cell_type": "markdown", + "id": "598a2805", + "metadata": {}, + "source": [ + "If you need to load Python source code files, use the `PythonLoader`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c558bd73", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PythonLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a3cfaba7", + "metadata": {}, + "outputs": [], + "source": [ + "loader = DirectoryLoader('../../../../../', glob=\"**/*.py\", loader_cls=PythonLoader)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e2e1e26a", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ffb8ff36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "691" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "984c8429", + "id": "7f6e0eae", "metadata": {}, "outputs": [], "source": [] @@ -153,7 +212,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.3" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index c4cc7448..3d0c4295 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -55,6 +55,7 @@ from langchain.document_loaders.pdf import ( UnstructuredPDFLoader, ) from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader +from langchain.document_loaders.python import PythonLoader from langchain.document_loaders.readthedocs import ReadTheDocsLoader from langchain.document_loaders.roam import RoamLoader from langchain.document_loaders.rtf import UnstructuredRTFLoader @@ -156,4 +157,5 @@ __all__ = [ "ImageCaptionLoader", "DiscordChatLoader", "ConfluenceLoader", + "PythonLoader", ] diff --git a/langchain/document_loaders/python.py b/langchain/document_loaders/python.py new file mode 100644 index 00000000..65487323 --- /dev/null +++ b/langchain/document_loaders/python.py @@ -0,0 +1,14 @@ +import tokenize + +from langchain.document_loaders.text import TextLoader + + +class PythonLoader(TextLoader): + """ + Load Python files, respecting any non-default encoding if specified. + """ + + def __init__(self, file_path: str): + with open(file_path, "rb") as f: + encoding, _ = tokenize.detect_encoding(f.readline) + super().__init__(file_path=file_path, encoding=encoding) diff --git a/pyproject.toml b/pyproject.toml index 03bd5b8b..33944c9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -148,6 +148,9 @@ select = [ "F", # pyflakes "I", # isort ] +exclude = [ + "tests/integration_tests/examples/non-utf8-encoding.py", +] [tool.mypy] ignore_missing_imports = "True" diff --git a/tests/integration_tests/document_loaders/test_python.py b/tests/integration_tests/document_loaders/test_python.py new file mode 100644 index 00000000..f4b2b3ae --- /dev/null +++ b/tests/integration_tests/document_loaders/test_python.py @@ -0,0 +1,19 @@ +from pathlib import Path + +import pytest + +from langchain.document_loaders.python import PythonLoader + + +@pytest.mark.parametrize("filename", ["default-encoding.py", "non-utf8-encoding.py"]) +def test_python_loader(filename: str) -> None: + """Test Python loader.""" + file_path = Path(__file__).parent.parent / "examples" / filename + loader = PythonLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + + metadata = docs[0].metadata + + assert metadata["source"] == str(file_path) diff --git a/tests/integration_tests/examples/default-encoding.py b/tests/integration_tests/examples/default-encoding.py new file mode 100644 index 00000000..9a09cc82 --- /dev/null +++ b/tests/integration_tests/examples/default-encoding.py @@ -0,0 +1 @@ +u = "🦜🔗" diff --git a/tests/integration_tests/examples/non-utf8-encoding.py b/tests/integration_tests/examples/non-utf8-encoding.py new file mode 100644 index 00000000..e00f46c5 --- /dev/null +++ b/tests/integration_tests/examples/non-utf8-encoding.py @@ -0,0 +1,3 @@ +# coding: iso-8859-5 +# ±¶ÿàáâãäåæçèéêëìíîï <- Cyrillic characters +u = "®âðÄ"