core[minor]: Prevent PydanticOutputParser from encoding schema as ASCII (#25386)

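This allows users to write parameter descriptions in their Pydantic models in languages other than English: non-ASCII characters are now kept as-is in the format instructions instead of being escaped as `\uXXXX` sequences. This continues the work started in PR https://github.com/langchain-ai/langchain/pull/24809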
2024-11-10 01:10:59 +00:00 · 2024-08-14 09:54:31 -04:00 · 2024-08-14 09:54:31 -04:00 · dc51cc5690
commit dc51cc5690
parent 27690506d0
2 changed files with 22 additions and 1 deletions
--- a/libs/core/langchain_core/output_parsers/pydantic.py
+++ b/libs/core/langchain_core/output_parsers/pydantic.py
@ -92,7 +92,7 @@ class PydanticOutputParser(JsonOutputParser, Generic[TBaseModel]):
        if "type" in reduced_schema:
            del reduced_schema["type"]
        # Ensure json in context is well-formed with double quotes.
-        schema_str = json.dumps(reduced_schema)
+        schema_str = json.dumps(reduced_schema, ensure_ascii=False)

        return _PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str)

--- a/libs/langchain/tests/unit_tests/output_parsers/test_pydantic_parser.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_pydantic_parser.py
@ -100,3 +100,24 @@ def test_pydantic_output_parser_type_inference() -> None:
        "title": "SampleModel",
        "type": "object",
    }
+
+
+def test_format_instructions_preserves_language() -> None:
+    """Test format instructions does not attempt to encode into ascii."""
+    from langchain_core.pydantic_v1 import BaseModel, Field
+
+    description = (
+        "你好, こんにちは, नमस्ते, Bonjour, Hola, "
+        "Olá, 안녕하세요, Jambo, Merhaba, Γειά σου"
+    )
+
+    class Foo(BaseModel):
+        hello: str = Field(
+            description=(
+                "你好, こんにちは, नमस्ते, Bonjour, Hola, "
+                "Olá, 안녕하세요, Jambo, Merhaba, Γειά σου"
+            )
+        )
+
+    parser = PydanticOutputParser(pydantic_object=Foo)  # type: ignore
+    assert description in parser.get_format_instructions()