Make json output parser handle newlines inside markdown code blocks (#8682)

Update to #8528

Newlines and other special characters within markdown code blocks
returned as `action_input` should be handled correctly (in particular,
unescaped `"` => `\"` and `\n` => `\\n`) so they don't break JSON
parsing.

@baskaryan
pull/8900/head
Bruno Bornsztein 1 year ago committed by GitHub
parent ce3666c28b
commit d56eff042a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -8,6 +8,36 @@ from typing import Any, List
from langchain.schema import BaseOutputParser, OutputParserException
def _replace_new_line(match: re.Match[str]) -> str:
    """Escape raw control characters and quotes inside a matched string value.

    Intended as the replacement callback for ``re.sub``. The match must expose
    three groups: the text up to and including the opening quote (group 1),
    the raw string value (group 2) and the closing quote (group 3). Only
    group 2 is rewritten.

    Args:
        match: Regex match carrying the three groups described above.

    Returns:
        Group 1, the escaped value and group 3 concatenated.
    """
    value = match.group(2)
    # Literal newlines, carriage returns and tabs are not allowed inside a
    # JSON string; swap each for its two-character escape sequence.
    value = (
        value.replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    # Escape only quotes that are not already preceded by a backslash.
    # Re-escaping an existing backslash-quote pair would produce an escaped
    # backslash followed by a bare quote, ending the JSON string early.
    # NOTE: a quote that directly follows an escaped backslash is therefore
    # also left untouched.
    value = re.sub(r'(?<!\\)"', r'\\"', value)
    return match.group(1) + value + match.group(3)
def _custom_parser(multiline_string: str) -> str:
    """Repair an ``action_input`` string value that is not valid JSON.

    The LLM response for ``action_input`` may be a multiline string that
    contains unescaped newlines, tabs or quotes. When the payload cannot be
    decoded as JSON, those characters are replaced with their escaped
    counterparts via ``_replace_new_line``.

    Args:
        multiline_string: The JSON text to repair. ``bytes`` and
            ``bytearray`` inputs are accepted and decoded first.

    Returns:
        The input unchanged when it already is valid JSON, otherwise the
        text with the ``action_input`` value escaped.
    """
    if isinstance(multiline_string, (bytes, bytearray)):
        multiline_string = multiline_string.decode()

    try:
        # The repair pattern below is greedy: it runs to the LAST quote in
        # the payload. Applying it to JSON that is already valid would
        # escape the quotes of any key/value following ``action_input`` and
        # corrupt the document, so well-formed input is returned as-is.
        json.loads(multiline_string)
    except ValueError:
        return re.sub(
            r'("action_input"\:\s*")(.*)(")',
            _replace_new_line,
            multiline_string,
            flags=re.DOTALL,
        )
    return multiline_string
def parse_json_markdown(json_string: str) -> dict:
"""
Parse a JSON string from a Markdown string.
@ -31,6 +61,9 @@ def parse_json_markdown(json_string: str) -> dict:
# Strip whitespace and newlines from the start and end
json_str = json_str.strip()
# handle newlines and other special characters inside the returned value
json_str = _custom_parser(json_str)
# Parse the JSON string into a Python dictionary
parsed = json.loads(json_str)

@ -60,6 +60,13 @@ JSON_WITH_MARKDOWN_CODE_BLOCK = """```json
}
```"""
# Fenced JSON whose `action_input` value itself holds a markdown code block.
# This is not a raw string literal, so Python turns `\n`, `\t` and `\"` into a
# real newline, tab and bare double quote: the JSON string value therefore
# contains unescaped control characters and unescaped quotes.
JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES = """```json
{
"action": "Final Answer",
"action_input": "```bar\n<div id="1" class=\"value\">\n\ttext\n</div>```"
}
```"""
# Plain JSON object with no markdown code fence around it.
NO_TICKS = """{
"foo": "bar"
}"""
@ -114,6 +121,13 @@ def test_parse_json(json_string: str) -> None:
assert parsed == {"foo": "bar"}
def test_parse_json_with_code_block() -> None:
def test_parse_json_with_code_blocks() -> None:
    """Fenced JSON whose values contain nested code blocks must parse."""
    simple_result = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
    assert simple_result == {"foo": "```bar```"}

    # Value expected once the raw newlines, tab and quotes have been handled.
    expected_action_input = '```bar\n<div id="1" class="value">\n\ttext\n</div>```'
    multiline_result = parse_json_markdown(
        JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES
    )
    assert multiline_result == {
        "action": "Final Answer",
        "action_input": expected_action_input,
    }

Loading…
Cancel
Save