forked from Archives/langchain
Add PythonLoader which auto-detects encoding of Python files (#3311)
This PR contributes a `PythonLoader`, which inherits from `TextLoader` but detects and sets the encoding automatically.
fix_agent_callbacks
parent
1ecbeec24e
commit
aa9d5707e0
@ -0,0 +1,14 @@
|
||||
import tokenize
|
||||
|
||||
from langchain.document_loaders.text import TextLoader
|
||||
|
||||
|
||||
class PythonLoader(TextLoader):
    """Text loader for Python source files.

    The file's encoding is detected with ``tokenize.detect_encoding`` and
    handed to ``TextLoader``, so a file that declares a non-default
    encoding is read with that encoding.
    """

    def __init__(self, file_path: str):
        """Detect the encoding of ``file_path`` and initialise the loader.

        Args:
            file_path: Path of the Python file to load.
        """
        # detect_encoding consumes raw bytes line by line, so the file is
        # opened in binary mode just for the probe.
        with open(file_path, "rb") as source:
            detected = tokenize.detect_encoding(source.readline)[0]
        super().__init__(file_path=file_path, encoding=detected)
|
@ -0,0 +1,19 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders.python import PythonLoader
|
||||
|
||||
|
||||
@pytest.mark.parametrize("filename", ["default-encoding.py", "non-utf8-encoding.py"])
def test_python_loader(filename: str) -> None:
    """Load each example file and check the document count and source path."""
    examples_dir = Path(__file__).parent.parent / "examples"
    expected_source = str(examples_dir / filename)

    documents = PythonLoader(expected_source).load()

    # Each file should produce exactly one document.
    assert len(documents) == 1
    # The document's metadata records the path it was loaded from.
    assert documents[0].metadata["source"] == expected_source
|
@ -0,0 +1 @@
|
||||
u = "🦜🔗"
|
@ -0,0 +1,3 @@
|
||||
# coding: iso-8859-5
|
||||
# ±¶ÿàáâãäåæçèéêëìíîï <- Cyrillic characters
|
||||
u = "®âðÄ"
|
Loading…
Reference in New Issue