mirror of
https://github.com/hwchase17/langchain
synced 2024-11-13 19:10:52 +00:00
core: JsonOutputParser UTF characters bug (#27306)
**Description:** This PR fixes an issue where non-ASCII characters in Pydantic field descriptions were being escaped to their Unicode representations when using `JsonOutputParser`. The change allows non-ASCII characters to be preserved in the output, which is especially important for multilingual support and when working with non-English languages.

**Issue:** Fixes #27256

**Example Code:**
```python
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

class Article(BaseModel):
    title: str = Field(description="科学文章的标题")

output_data_structure = Article
parser = JsonOutputParser(pydantic_object=output_data_structure)
print(parser.get_format_instructions())
```

**Previous Output**:
```
... "title": {"description": "\\u79d1\\u5b66\\u6587\\u7ae0\\u7684\\u6807\\u9898", "title": "Title", "type": "string"}} ...
```

**Current Output**:
```
... "title": {"description": "科学文章的标题", "title": "Title", "type": "string"}} ...
```

**Changes made**:
- Modified `json.dumps()` call in `langchain_core/output_parsers/json.py` to use `ensure_ascii=False`
- Added a unit test to verify Unicode handling

Co-authored-by: Harsimran-19 <harsimran1869@gmail.com>
This commit is contained in:
parent
49517cc1e7
commit
c1d8c33df6
@ -115,7 +115,7 @@ class JsonOutputParser(BaseCumulativeTransformOutputParser[Any]):
|
||||
if "type" in reduced_schema:
|
||||
del reduced_schema["type"]
|
||||
# Ensure json in context is well-formed with double quotes.
|
||||
schema_str = json.dumps(reduced_schema)
|
||||
schema_str = json.dumps(reduced_schema, ensure_ascii=False)
|
||||
return JSON_FORMAT_INSTRUCTIONS.format(schema=schema_str)
|
||||
|
||||
@property
|
||||
|
@ -3,7 +3,7 @@ from collections.abc import AsyncIterator, Iterator
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from langchain_core.exceptions import OutputParserException
|
||||
from langchain_core.output_parsers.json import (
|
||||
@ -603,3 +603,16 @@ def test_base_model_schema_consistency() -> None:
|
||||
|
||||
assert initial_joke_schema == retrieved_joke_schema
|
||||
assert openai_func.get("name", None) is not None
|
||||
|
||||
|
||||
def test_unicode_handling() -> None:
    """Check that non-ASCII field descriptions survive in format instructions."""
    # Non-ASCII description expected to appear verbatim (not as \uXXXX escapes)
    # in the text returned by get_format_instructions().
    description = "科学文章的标题"

    class Sample(BaseModel):
        title: str = Field(description=description)

    instructions = SimpleJsonOutputParser(
        pydantic_object=Sample
    ).get_format_instructions()

    assert description in instructions, "Unicode characters should not be escaped"
|
||||
|
Loading…
Reference in New Issue
Block a user