pydantic/json parsing (#1722)

```
class Joke(BaseModel):
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

joke_query = "Tell me a joke."

# Or, an example with compound type fields.
#class FloatArray(BaseModel):
#    values: List[float] = Field(description="list of floats")
#
#float_array_query = "Write out a few terms of fibonacci."

model = OpenAI(model_name='text-davinci-003', temperature=0.0)
parser = PydanticOutputParser(pydantic_object=Joke)
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

_input = prompt.format_prompt(query=joke_query)
print("Prompt:\n", _input.to_string())
output = model(_input.to_string())
print("Completion:\n", output)
parsed_output = parser.parse(output)
print("Parsed completion:\n", parsed_output)
```

```
Prompt:
 Answer the user query.
The output should be formatted as a JSON instance that conforms to the JSON schema below.  For example, the object {"foo":  ["bar", "baz"]} conforms to the schema {"foo": {"description": "a list of strings field", "type": "string"}}.

Here is the output schema:
---
{"setup": {"description": "question to set up a joke", "type": "string"}, "punchline": {"description": "answer to resolve the joke", "type": "string"}}
---

Tell me a joke.

Completion:
 {"setup": "Why don't scientists trust atoms?", "punchline": "Because they make up everything!"}

Parsed completion:
 setup="Why don't scientists trust atoms?" punchline='Because they make up everything!'
```

Of course, this works only with LMs of sufficient capacity. DaVinci is
reliable, but not always.

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
jerwelborn 2023-03-16 21:43:11 -07:00 committed by GitHub
parent d6bbf395af
commit 55efbb8a7e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 229 additions and 49 deletions

View File

@ -17,30 +17,10 @@
"Below we go over some examples of output parsers." "Below we go over some examples of output parsers."
] ]
}, },
{
"cell_type": "markdown",
"id": "91871002",
"metadata": {},
"source": [
"## Structured Output Parser\n",
"\n",
"This output parser can be used when you want to return multiple fields."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 1,
"id": "b492997a", "id": "5f0c8a33",
"metadata": {},
"outputs": [],
"source": [
"from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ffb7fc57",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -49,6 +29,165 @@
"from langchain.chat_models import ChatOpenAI" "from langchain.chat_models import ChatOpenAI"
] ]
}, },
{
"cell_type": "markdown",
"id": "a1ae632a",
"metadata": {},
"source": [
"## PydanticOutputParser\n",
"This output parser allows users to specify an arbitrary JSON schema and query LLMs for JSON outputs that conform to that schema.\n",
"\n",
"Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed JSON. In the OpenAI family, DaVinci can do this reliably, but Curie's ability already drops off dramatically. \n",
"\n",
"Use Pydantic to declare your data model. Pydantic's BaseModel is like a Python dataclass, but with actual type checking + coercion."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cba6d8e3",
"metadata": {},
"outputs": [],
"source": [
"from langchain.output_parsers import PydanticOutputParser\n",
"from pydantic import BaseModel, Field, validator\n",
"from typing import List"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0a203100",
"metadata": {},
"outputs": [],
"source": [
"model_name = 'text-davinci-003'\n",
"temperature = 0.0\n",
"model = OpenAI(model_name=model_name, temperature=temperature)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b3f16168",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Joke(setup='Why did the chicken cross the playground?', punchline='To get to the other slide!')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Define your desired data structure.\n",
"class Joke(BaseModel):\n",
" setup: str = Field(description=\"question to set up a joke\")\n",
" punchline: str = Field(description=\"answer to resolve the joke\")\n",
" \n",
" # You can add custom validation logic easily with Pydantic.\n",
" @validator('setup')\n",
" def question_ends_with_question_mark(cls, field):\n",
" if field[-1] != '?':\n",
" raise ValueError(\"Badly formed question!\")\n",
" return field\n",
"\n",
"# And a query intended to prompt a language model to populate the data structure.\n",
"joke_query = \"Tell me a joke.\"\n",
"\n",
"# Set up a parser + inject instructions into the prompt template.\n",
"parser = PydanticOutputParser(pydantic_object=Joke)\n",
"\n",
"prompt = PromptTemplate(\n",
" template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
" input_variables=[\"query\"],\n",
" partial_variables={\"format_instructions\": parser.get_format_instructions()}\n",
")\n",
"\n",
"_input = prompt.format_prompt(query=joke_query)\n",
"\n",
"output = model(_input.to_string())\n",
"\n",
"parser.parse(output)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "03049f88",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Actor(name='Tom Hanks', film_names=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Cast Away', 'Toy Story', 'A League of Their Own'])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Here's another example, but with a compound typed field.\n",
"class Actor(BaseModel):\n",
" name: str = Field(description=\"name of an actor\")\n",
" film_names: List[str] = Field(description=\"list of names of films they starred in\")\n",
" \n",
"actor_query = \"Generate the filmography for a random actor.\"\n",
"\n",
"parser = PydanticOutputParser(pydantic_object=Actor)\n",
"\n",
"prompt = PromptTemplate(\n",
" template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
" input_variables=[\"query\"],\n",
" partial_variables={\"format_instructions\": parser.get_format_instructions()}\n",
")\n",
"\n",
"_input = prompt.format_prompt(query=actor_query)\n",
"\n",
"output = model(_input.to_string())\n",
"\n",
"parser.parse(output)"
]
},
{
"cell_type": "markdown",
"id": "61f67890",
"metadata": {},
"source": [
"<br>\n",
"<br>\n",
"<br>\n",
"<br>\n",
"\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "91871002",
"metadata": {},
"source": [
"## Structured Output Parser\n",
"\n",
"While the Pydantic/JSON parser is more powerful, we initially experimented data structures having text fields only."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b492997a",
"metadata": {},
"outputs": [],
"source": [
"from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "09473dce", "id": "09473dce",
@ -59,7 +198,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 7,
"id": "432ac44a", "id": "432ac44a",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -81,7 +220,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 8,
"id": "593cfc25", "id": "593cfc25",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -104,7 +243,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 9,
"id": "106f1ba6", "id": "106f1ba6",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -114,7 +253,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 10,
"id": "86d9d24f", "id": "86d9d24f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -125,7 +264,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 11,
"id": "956bdc99", "id": "956bdc99",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -135,7 +274,7 @@
"{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}" "{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}"
] ]
}, },
"execution_count": 7, "execution_count": 11,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -154,7 +293,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 12,
"id": "8f483d7d", "id": "8f483d7d",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -164,7 +303,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 13,
"id": "f761cbf1", "id": "f761cbf1",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -180,7 +319,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 14,
"id": "edd73ae3", "id": "edd73ae3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -191,7 +330,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 15,
"id": "a3c8b91e", "id": "a3c8b91e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -201,7 +340,7 @@
"{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}" "{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}"
] ]
}, },
"execution_count": 11, "execution_count": 15,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -217,12 +356,12 @@
"source": [ "source": [
"## CommaSeparatedListOutputParser\n", "## CommaSeparatedListOutputParser\n",
"\n", "\n",
"This output parser can be used to get a list of items as output." "Here's another parser strictly less powerful than Pydantic/JSON parsing."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 16,
"id": "872246d7", "id": "872246d7",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -232,7 +371,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 17,
"id": "c3f9aee6", "id": "c3f9aee6",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -242,7 +381,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 18,
"id": "e77871b7", "id": "e77871b7",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -257,7 +396,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 19,
"id": "a71cb5d3", "id": "a71cb5d3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -267,7 +406,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 20,
"id": "783d7d98", "id": "783d7d98",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -278,7 +417,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 21,
"id": "fcb81344", "id": "fcb81344",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -292,7 +431,7 @@
" 'Cookies and Cream']" " 'Cookies and Cream']"
] ]
}, },
"execution_count": 17, "execution_count": 21,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -300,14 +439,6 @@
"source": [ "source": [
"output_parser.parse(output)" "output_parser.parse(output)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cba6d8e3",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -326,7 +457,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.1" "version": "3.9.0"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -3,6 +3,7 @@ from langchain.output_parsers.list import (
CommaSeparatedListOutputParser, CommaSeparatedListOutputParser,
ListOutputParser, ListOutputParser,
) )
from langchain.output_parsers.pydantic import PydanticOutputParser
from langchain.output_parsers.rail_parser import GuardrailsOutputParser from langchain.output_parsers.rail_parser import GuardrailsOutputParser
from langchain.output_parsers.regex import RegexParser from langchain.output_parsers.regex import RegexParser
from langchain.output_parsers.regex_dict import RegexDictParser from langchain.output_parsers.regex_dict import RegexDictParser
@ -17,4 +18,5 @@ __all__ = [
"StructuredOutputParser", "StructuredOutputParser",
"ResponseSchema", "ResponseSchema",
"GuardrailsOutputParser", "GuardrailsOutputParser",
"PydanticOutputParser",
] ]

View File

@ -7,3 +7,10 @@ STRUCTURED_FORMAT_INSTRUCTIONS = """The output should be a markdown code snippet
{format} {format}
}} }}
```""" ```"""
# Prompt template injected via get_format_instructions(); {schema} is filled by
# str.format, so literal braces in the example JSON are doubled ({{ }}).
PYDANTIC_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below. For example, the object {{"foo": ["bar", "baz"]}} conforms to the schema {{"foo": {{"description": "a list of strings field", "type": "string"}}}}.
Here is the output schema:
```
{schema}
```"""

View File

@ -0,0 +1,40 @@
import json
import re
from typing import Any
from pydantic import BaseModel, ValidationError
from langchain.output_parsers.base import BaseOutputParser
from langchain.output_parsers.format_instructions import PYDANTIC_FORMAT_INSTRUCTIONS
class PydanticOutputParser(BaseOutputParser):
    """Parse LLM text output into an instance of a user-supplied pydantic model.

    The target model class is supplied via ``pydantic_object``; ``parse``
    extracts the first JSON object embedded in the completion and validates it
    against that model. ``get_format_instructions`` produces the prompt text
    telling the LLM what JSON shape to emit.
    """

    # The pydantic model *class* (not an instance) the output must conform to.
    pydantic_object: Any

    def parse(self, text: str) -> BaseModel:
        """Extract and validate the first JSON object found in ``text``.

        Args:
            text: Raw LLM completion, possibly with surrounding prose.

        Returns:
            An instance of ``self.pydantic_object`` populated from the JSON.

        Raises:
            ValueError: if no parseable JSON conforming to the schema is found.
        """
        try:
            # Greedy search from the first '{' to the last '}'. re.DOTALL lets
            # the match span newlines — completions are frequently multi-line,
            # and the original single-line pattern silently failed on them.
            match = re.search(r"\{.*\}", text.strip(), re.DOTALL)
            json_str = match.group() if match else ""
            json_object = json.loads(json_str)
            return self.pydantic_object.parse_obj(json_object)
        except (json.JSONDecodeError, ValidationError) as e:
            name = self.pydantic_object.__name__
            msg = f"Failed to parse {name} from completion {text}. Got: {e}"
            # Chain the original exception so callers can inspect the cause.
            raise ValueError(msg) from e

    def get_format_instructions(self) -> str:
        """Return prompt instructions describing the expected JSON schema."""
        schema = self.pydantic_object.schema()
        # Keep only description/type per property so the prompt stays short.
        # NOTE(review): assumes every field declares a description (raises
        # KeyError otherwise) — confirm that is the intended contract.
        reduced_schema = {
            prop: {"description": data["description"], "type": data["type"]}
            for prop, data in schema["properties"].items()
        }
        # json.dumps guarantees well-formed, double-quoted JSON in the prompt.
        return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=json.dumps(reduced_schema))