feat: Yaml output parser (#14496)

## Description
New YAML output parser as a drop-in replacement for the Pydantic output
parser. YAML is a much more token-efficient format than JSON, proving to
be **~35% faster while using roughly 35% fewer completion
tokens**.

☑️ Formatted
☑️ Linted
☑️ Tested (analogous to the existing `test_pydantic_parser.py`)

The YAML parser excels in situations where a list of objects is
required, where the root object needs no key:
```python
class Products(BaseModel):
   __root__: list[Product]
```

I ran the prompt `Generate 10 healthy, organic products` 10 times on each
of two chains: one using the `PydanticOutputParser`, the other using
the `YamlOutputParser`, with `Products` (see below) as the target
model to be created.

LLMs used were Fireworks' `llama-v2-34b-code-instruct` and OpenAI
`gpt-3.5-turbo`. All runs succeeded without validation errors.

```python
class Nutrition(BaseModel):
    sugar: int = Field(description="Sugar in grams")
    fat: float = Field(description="% of daily fat intake")

class Product(BaseModel):
    name: str = Field(description="Product name")
    stats: Nutrition

class Products(BaseModel):
    """A list of products"""

    products: list[Product] # Used `__root__` for the yaml chain
```
Stats after 10 runs each were as follows:
### JSON
ø time: 7.75s
ø tokens: 380.8

### YAML
ø time: 5.12s
ø tokens: 242.2


Looking forward to feedback, tips and contributions!
pull/14158/head^2
Thomas B 6 months ago committed by GitHub
parent d31ff30df6
commit b4e3e47c92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -36,6 +36,7 @@ from langchain.output_parsers.regex_dict import RegexDictParser
from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser
from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser
from langchain.output_parsers.xml import XMLOutputParser
from langchain.output_parsers.yaml import YamlOutputParser
__all__ = [
"BooleanOutputParser",
@ -60,4 +61,5 @@ __all__ = [
"JsonOutputToolsParser",
"PydanticToolsParser",
"JsonOutputKeyToolsParser",
"YamlOutputParser",
]

@ -26,6 +26,26 @@ Here is the output schema:
{schema}
```"""
# Prompt template telling the model to answer in YAML. `{schema}` is the only
# placeholder; literal braces in the example schema are doubled so that
# `str.format` leaves them intact. The example instance must itself be valid
# YAML (nested keys indented by 2 spaces), since the model imitates it.
YAML_FORMAT_INSTRUCTIONS = """The output should be formatted as a YAML instance that conforms to the given JSON schema below.
As an example, for the schema
```
{{'title': 'Players', 'description': 'A list of players', 'type': 'array', 'items': {{'$ref': '#/definitions/Player'}}, 'definitions': {{'Player': {{'title': 'Player', 'type': 'object', 'properties': {{'name': {{'title': 'Name', 'description': 'Player name', 'type': 'string'}}, 'avg': {{'title': 'Avg', 'description': 'Batting average', 'type': 'number'}}}}, 'required': ['name', 'avg']}}}}}}
```
a well formatted instance would be:
```
- name: John Doe
  avg: 0.3
- name: Jane Maxfield
  avg: 1.4
```
Please follow the standard YAML formatting conventions with an indent of 2 spaces and make sure that the data types adhere strictly to the following JSON schema:
```
{schema}
```
Make sure to always enclose the YAML output in triple backticks (```)"""
XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
1. Output should conform to the tags below.

@ -0,0 +1,58 @@
import json
import re
from typing import Type, TypeVar
import yaml
from langchain_core.exceptions import OutputParserException
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.pydantic_v1 import BaseModel, ValidationError
from langchain.output_parsers.format_instructions import YAML_FORMAT_INSTRUCTIONS
T = TypeVar("T", bound=BaseModel)
class YamlOutputParser(BaseOutputParser[T]):
    """Parse YAML output using a pydantic model.

    The completion is searched for a triple-backtick block (optionally
    tagged ``yaml`` or ``yml``); its content is loaded with
    ``yaml.safe_load`` and validated against ``pydantic_object``.
    """

    pydantic_object: Type[T]
    """The pydantic model to parse."""
    pattern: re.Pattern = re.compile(
        r"^```(?:ya?ml)?(?P<yaml>[^`]*)", re.MULTILINE | re.DOTALL
    )
    """Regex pattern to match yaml code blocks
    within triple backticks with optional yaml or yml prefix."""

    def parse(self, text: str) -> T:
        """Parse a completion into an instance of ``pydantic_object``.

        Args:
            text: Raw completion, expected to contain a YAML document.

        Returns:
            The validated model instance.

        Raises:
            OutputParserException: If the YAML cannot be loaded or the loaded
                data fails model validation.
        """
        # The first fenced block wins. Without a fence, try the whole
        # completion instead of feeding an empty string to the YAML loader.
        match = re.search(self.pattern, text.strip())
        yaml_str = match.group("yaml") if match else text
        try:
            json_object = yaml.safe_load(yaml_str)
            return self.pydantic_object.parse_obj(json_object)
        except (yaml.YAMLError, ValidationError) as e:
            name = self.pydantic_object.__name__
            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
            raise OutputParserException(msg, llm_output=text) from e

    def get_format_instructions(self) -> str:
        """Return the prompt instructions with the model's JSON schema embedded."""
        # Work on a shallow copy: pydantic v1 caches the dict returned by
        # `schema()` on the model class, so deleting keys in place would
        # corrupt the schema for every later caller.
        reduced_schema = dict(self.pydantic_object.schema())
        # Remove extraneous top-level fields.
        for key in ("title", "type"):
            reduced_schema.pop(key, None)
        # Serialize as JSON so the schema in the prompt uses double quotes.
        schema_str = json.dumps(reduced_schema)
        return YAML_FORMAT_INSTRUCTIONS.format(schema=schema_str)

    @property
    def _type(self) -> str:
        """Identifier string for this parser type."""
        return "yaml"

@ -23,6 +23,7 @@ EXPECTED_ALL = [
"JsonOutputToolsParser",
"PydanticToolsParser",
"JsonOutputKeyToolsParser",
"YamlOutputParser",
]

@ -0,0 +1,83 @@
"""Test yamlOutputParser"""
from enum import Enum
from typing import Optional
from langchain_core.exceptions import OutputParserException
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers.yaml import YamlOutputParser
# Closed set of action names used as the enum-typed field of TestModel.
# Values are case-sensitive: "Update" is a member, "update" is not.
# (Documented with a comment, not a docstring, so the enum's generated
# schema description stays unchanged.)
class Actions(Enum):
    SEARCH = "Search"
    CREATE = "Create"
    UPDATE = "Update"
    DELETE = "Delete"
# Pydantic model the parser tests validate against. Comments are used
# instead of a class docstring so the generated schema is not altered.
class TestModel(BaseModel):
    action: Actions = Field(description="Action to be performed")
    action_input: str = Field(description="Input to be used in the action")
    additional_fields: Optional[str] = Field(
        description="Additional fields", default=None
    )
    for_new_lines: str = Field(description="To be used to test newlines")


# Prevent pytest from trying to run tests on TestModel
TestModel.__test__ = False  # type: ignore[attr-defined]
# Well-formed completion. Written line by line so the significant whitespace
# of the `|` block scalar (2-space base indent, one extra leading space and a
# trailing space on the second line) survives editors that strip whitespace.
DEF_RESULT = (
    "```yaml\n"
    "---\n"
    "action: Update\n"
    "action_input: The yamlOutputParser class is powerful\n"
    "additional_fields: null\n"
    "for_new_lines: |\n"
    "  not_escape_newline:\n"
    "   escape_newline: \n"
    "```"
)
# Completion that must fail validation: 'update' starts with a lowercase
# 'u', which is not a value of the Actions enum.
DEF_RESULT_FAIL = (
    "```yaml\n"
    "action: update\n"
    "action_input: The yamlOutputParser class is powerful\n"
    "additional_fields: null\n"
    "```"
)
# Model instance that parsing DEF_RESULT is expected to produce; the
# for_new_lines value keeps the inner leading space and trailing space.
DEF_EXPECTED_RESULT = TestModel(
    for_new_lines="not_escape_newline:\n escape_newline: \n",
    additional_fields=None,
    action_input="The yamlOutputParser class is powerful",
    action=Actions.UPDATE,
)
def test_yaml_output_parser() -> None:
    """Test yamlOutputParser."""
    yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser(
        pydantic_object=TestModel
    )

    # Parse the fenced YAML completion and compare with the expected model.
    result = yaml_parser.parse(DEF_RESULT)
    print("parse_result:", result)
    assert DEF_EXPECTED_RESULT == result
def test_yaml_output_parser_fail() -> None:
    """Test YamlOutputParser where completion result fails schema validation."""
    yaml_parser: YamlOutputParser[TestModel] = YamlOutputParser(
        pydantic_object=TestModel
    )

    try:
        yaml_parser.parse(DEF_RESULT_FAIL)
    except OutputParserException as e:
        print("parse_result:", e)
        assert "Failed to parse TestModel from completion" in str(e)
    else:
        # Raise explicitly: `assert False` is removed under `python -O`,
        # which would let a missing exception pass silently.
        raise AssertionError("Expected OutputParserException")
Loading…
Cancel
Save