From 5a490a79f42bd4040d1f2ce24734e37f4deb31a6 Mon Sep 17 00:00:00 2001 From: Bruno Bornsztein Date: Mon, 31 Jul 2023 18:36:57 -0500 Subject: [PATCH] fix issue #8357 by making json backtick regex greedy (#8528) - Description: Markdown code blocks in json response should not break the parser - Issue: #8357 @baskaryan @hinthornw --- libs/langchain/langchain/output_parsers/json.py | 2 +- .../tests/unit_tests/output_parsers/test_json.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/output_parsers/json.py b/libs/langchain/langchain/output_parsers/json.py index 035d8d4898..9e0c83e56b 100644 --- a/libs/langchain/langchain/output_parsers/json.py +++ b/libs/langchain/langchain/output_parsers/json.py @@ -19,7 +19,7 @@ def parse_json_markdown(json_string: str) -> dict: The parsed JSON object as a Python dictionary. """ # Try to find JSON string within triple backticks - match = re.search(r"```(json)?(.*?)```", json_string, re.DOTALL) + match = re.search(r"```(json)?(.*)```", json_string, re.DOTALL) # If no match found, assume the entire string is a JSON string if match is None: diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_json.py b/libs/langchain/tests/unit_tests/output_parsers/test_json.py index 4f83bc47ab..1762024ae1 100644 --- a/libs/langchain/tests/unit_tests/output_parsers/test_json.py +++ b/libs/langchain/tests/unit_tests/output_parsers/test_json.py @@ -54,6 +54,12 @@ TICKS_WITH_NEW_LINES_EVERYWHERE = """ """ +JSON_WITH_MARKDOWN_CODE_BLOCK = """```json +{ + "foo": "```bar```" +} +```""" + NO_TICKS = """{ "foo": "bar" }""" @@ -106,3 +112,8 @@ TEST_CASES = [ def test_parse_json(json_string: str) -> None: parsed = parse_json_markdown(json_string) assert parsed == {"foo": "bar"} + + +def test_parse_json_with_code_block() -> None: + parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK) + assert parsed == {"foo": "```bar```"}