core: JsonOutputParser UTF characters bug (#27306)

**Description:**
This PR fixes an issue where non-ASCII characters in Pydantic field
descriptions were escaped to `\uXXXX` Unicode escape sequences in the
format instructions produced by `JsonOutputParser`. With this change,
non-ASCII characters are preserved as-is in the output, which is
especially important for multilingual support and when working with
non-English languages.

**Issue:** Fixes #27256

**Example Code:**
```python
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

class Article(BaseModel):
    title: str = Field(description="科学文章的标题")

output_data_structure = Article
parser = JsonOutputParser(pydantic_object=output_data_structure)
print(parser.get_format_instructions())
```
**Previous Output**:
```... "title": {"description": "\\u79d1\\u5b66\\u6587\\u7ae0\\u7684\\u6807\\u9898", "title": "Title", "type": "string"}} ...```

**Current Output**:
```... "title": {"description": "科学文章的标题", "title": "Title", "type": "string"}} ...```

**Changes made**:
- Modified `json.dumps()` call in
`langchain_core/output_parsers/json.py` to use `ensure_ascii=False`
- Added a unit test to verify Unicode handling

Co-authored-by: Harsimran-19 <harsimran1869@gmail.com>
This commit is contained in:
Harsimran-19 2024-10-29 20:18:53 +05:30 committed by GitHub
parent 49517cc1e7
commit c1d8c33df6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 15 additions and 2 deletions

View File

@ -115,7 +115,7 @@ class JsonOutputParser(BaseCumulativeTransformOutputParser[Any]):
if "type" in reduced_schema:
del reduced_schema["type"]
# Ensure json in context is well-formed with double quotes.
schema_str = json.dumps(reduced_schema)
schema_str = json.dumps(reduced_schema, ensure_ascii=False)
return JSON_FORMAT_INSTRUCTIONS.format(schema=schema_str)
@property

View File

@ -3,7 +3,7 @@ from collections.abc import AsyncIterator, Iterator
from typing import Any
import pytest
from pydantic import BaseModel
from pydantic import BaseModel, Field
from langchain_core.exceptions import OutputParserException
from langchain_core.output_parsers.json import (
@ -603,3 +603,16 @@ def test_base_model_schema_consistency() -> None:
assert initial_joke_schema == retrieved_joke_schema
assert openai_func.get("name", None) is not None
def test_unicode_handling() -> None:
    """Format instructions must keep non-ASCII field descriptions unescaped."""
    # A non-ASCII description that would be emitted as \uXXXX escapes if the
    # schema were serialized with ASCII-only output.
    description = "科学文章的标题"

    class Sample(BaseModel):
        title: str = Field(description=description)

    json_parser = SimpleJsonOutputParser(pydantic_object=Sample)
    instructions = json_parser.get_format_instructions()

    # The raw characters themselves must be present in the rendered text.
    assert description in instructions, "Unicode characters should not be escaped"