Add PythonLoader which auto-detects encoding of Python files (#3311)

This PR contributes a `PythonLoader`, which inherits from
`TextLoader` but detects and sets the encoding automatically.
fix_agent_callbacks
Paul Garner 1 year ago committed by GitHub
parent 1ecbeec24e
commit aa9d5707e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "019d8520",
"metadata": {},
"outputs": [],
@ -128,10 +128,69 @@
"len(docs)"
]
},
{
"cell_type": "markdown",
"id": "598a2805",
"metadata": {},
"source": [
"If you need to load Python source code files, use the `PythonLoader`."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c558bd73",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PythonLoader"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a3cfaba7",
"metadata": {},
"outputs": [],
"source": [
"loader = DirectoryLoader('../../../../../', glob=\"**/*.py\", loader_cls=PythonLoader)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e2e1e26a",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ffb8ff36",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"691"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "984c8429",
"id": "7f6e0eae",
"metadata": {},
"outputs": [],
"source": []
@ -153,7 +212,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.10.3"
}
},
"nbformat": 4,

@ -55,6 +55,7 @@ from langchain.document_loaders.pdf import (
UnstructuredPDFLoader,
)
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.python import PythonLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
@ -156,4 +157,5 @@ __all__ = [
"ImageCaptionLoader",
"DiscordChatLoader",
"ConfluenceLoader",
"PythonLoader",
]

@ -0,0 +1,14 @@
import tokenize
from langchain.document_loaders.text import TextLoader
class PythonLoader(TextLoader):
    """Text loader for Python source files with automatic encoding detection.

    The encoding is determined from the file itself with
    ``tokenize.detect_encoding`` and then passed on to ``TextLoader``.
    """

    def __init__(self, file_path: str):
        """Detect the encoding of ``file_path`` and set up the base loader.

        Args:
            file_path: Path of the Python file to load.
        """
        # detect_encoding consumes raw bytes, so the file is opened in binary
        # mode and its ``readline`` is handed over as the line source.
        with open(file_path, "rb") as source:
            detected = tokenize.detect_encoding(source.readline)
        # The first item of the result is the encoding name; the lines that
        # were read during detection are not needed here.
        super().__init__(file_path=file_path, encoding=detected[0])

@ -148,6 +148,9 @@ select = [
"F", # pyflakes
"I", # isort
]
exclude = [
"tests/integration_tests/examples/non-utf8-encoding.py",
]
[tool.mypy]
ignore_missing_imports = "True"

@ -0,0 +1,19 @@
from pathlib import Path
import pytest
from langchain.document_loaders.python import PythonLoader
@pytest.mark.parametrize("filename", ["default-encoding.py", "non-utf8-encoding.py"])
def test_python_loader(filename: str) -> None:
    """Load an example file and check the resulting document's source metadata."""
    # Example files live in the "examples" directory two levels above this file.
    examples_dir = Path(__file__).parent.parent / "examples"
    expected_source = str(examples_dir / filename)

    documents = PythonLoader(expected_source).load()

    # Exactly one document is expected per file, tagged with the path it came from.
    assert len(documents) == 1
    assert documents[0].metadata["source"] == expected_source

@ -0,0 +1,3 @@
# coding: iso-8859-5
# ±¶ÿàáâãäåæçèéêëìíîï <- Cyrillic characters
u = "®âðÄ"
Loading…
Cancel
Save