From c1d8c33df6fd359b29399033cf1d70305a92c611 Mon Sep 17 00:00:00 2001 From: Harsimran-19 <103135191+Harsimran-19@users.noreply.github.com> Date: Tue, 29 Oct 2024 20:18:53 +0530 Subject: [PATCH] core: JsonOutputParser UTF characters bug (#27306) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Description:** This PR fixes an issue where non-ASCII characters in Pydantic field descriptions were escaped to `\uXXXX` sequences in the format instructions generated by `JsonOutputParser.get_format_instructions()`. The change preserves non-ASCII characters in the generated instructions, which is especially important for multilingual support and when working with non-English languages. **Issue:** Fixes #27256 **Example Code:** ```python from pydantic import BaseModel, Field from langchain_core.output_parsers import JsonOutputParser class Article(BaseModel): title: str = Field(description="科学文章的标题") output_data_structure = Article parser = JsonOutputParser(pydantic_object=output_data_structure) print(parser.get_format_instructions()) ``` **Previous Output**: ```... "title": {"description": "\u79d1\u5b66\u6587\u7ae0\u7684\u6807\u9898", "title": "Title", "type": "string"}} ...``` **Current Output**: ```... 
"title": {"description": "科学文章的标题", "title": "Title", "type": "string"}} ...``` **Changes made**: - Modified `json.dumps()` call in `langchain_core/output_parsers/json.py` to use `ensure_ascii=False` - Added a unit test to verify Unicode handling Co-authored-by: Harsimran-19 --- libs/core/langchain_core/output_parsers/json.py | 2 +- .../tests/unit_tests/output_parsers/test_json.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/libs/core/langchain_core/output_parsers/json.py b/libs/core/langchain_core/output_parsers/json.py index e9d3669e44..18c1257a9a 100644 --- a/libs/core/langchain_core/output_parsers/json.py +++ b/libs/core/langchain_core/output_parsers/json.py @@ -115,7 +115,7 @@ class JsonOutputParser(BaseCumulativeTransformOutputParser[Any]): if "type" in reduced_schema: del reduced_schema["type"] # Ensure json in context is well-formed with double quotes. - schema_str = json.dumps(reduced_schema) + schema_str = json.dumps(reduced_schema, ensure_ascii=False) return JSON_FORMAT_INSTRUCTIONS.format(schema=schema_str) @property diff --git a/libs/core/tests/unit_tests/output_parsers/test_json.py b/libs/core/tests/unit_tests/output_parsers/test_json.py index 96cf6d0cc4..326cfc16cd 100644 --- a/libs/core/tests/unit_tests/output_parsers/test_json.py +++ b/libs/core/tests/unit_tests/output_parsers/test_json.py @@ -3,7 +3,7 @@ from collections.abc import AsyncIterator, Iterator from typing import Any import pytest -from pydantic import BaseModel +from pydantic import BaseModel, Field from langchain_core.exceptions import OutputParserException from langchain_core.output_parsers.json import ( @@ -603,3 +603,16 @@ def test_base_model_schema_consistency() -> None: assert initial_joke_schema == retrieved_joke_schema assert openai_func.get("name", None) is not None + + +def test_unicode_handling() -> None: + """Tests if the JsonOutputParser is able to process unicodes.""" + + class Sample(BaseModel): + title: str = 
Field(description="科学文章的标题") + + parser = SimpleJsonOutputParser(pydantic_object=Sample) + format_instructions = parser.get_format_instructions() + assert ( + "科学文章的标题" in format_instructions + ), "Unicode characters should not be escaped"