forked from Archives/langchain
pydantic/json parsing (#1722)
``` class Joke(BaseModel): setup: str = Field(description="question to set up a joke") punchline: str = Field(description="answer to resolve the joke") joke_query = "Tell me a joke." # Or, an example with compound type fields. #class FloatArray(BaseModel): # values: List[float] = Field(description="list of floats") # #float_array_query = "Write out a few terms of fiboacci." model = OpenAI(model_name='text-davinci-003', temperature=0.0) parser = PydanticOutputParser(pydantic_object=Joke) prompt = PromptTemplate( template="Answer the user query.\n{format_instructions}\n{query}\n", input_variables=["query"], partial_variables={"format_instructions": parser.get_format_instructions()} ) _input = prompt.format_prompt(query=joke_query) print("Prompt:\n", _input.to_string()) output = model(_input.to_string()) print("Completion:\n", output) parsed_output = parser.parse(output) print("Parsed completion:\n", parsed_output) ``` ``` Prompt: Answer the user query. The output should be formatted as a JSON instance that conforms to the JSON schema below. For example, the object {"foo": ["bar", "baz"]} conforms to the schema {"foo": {"description": "a list of strings field", "type": "string"}}. Here is the output schema: --- {"setup": {"description": "question to set up a joke", "type": "string"}, "punchline": {"description": "answer to resolve the joke", "type": "string"}} --- Tell me a joke. Completion: {"setup": "Why don't scientists trust atoms?", "punchline": "Because they make up everything!"} Parsed completion: setup="Why don't scientists trust atoms?" punchline='Because they make up everything!' ``` Ofc, works only with LMs of sufficient capacity. DaVinci is reliable but not always. --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
d6bbf395af
commit
55efbb8a7e
@ -17,30 +17,10 @@
|
||||
"Below we go over some examples of output parsers."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "91871002",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Structured Output Parser\n",
|
||||
"\n",
|
||||
"This output parser can be used when you want to return multiple fields."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b492997a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "ffb7fc57",
|
||||
"id": "5f0c8a33",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -49,6 +29,165 @@
|
||||
"from langchain.chat_models import ChatOpenAI"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a1ae632a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## PydanticOutputParser\n",
|
||||
"This output parser allows users to specify an arbitrary JSON schema and query LLMs for JSON outputs that conform to that schema.\n",
|
||||
"\n",
|
||||
"Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed JSON. In the OpenAI family, DaVinci can do reliably but Curie's ability already drops off dramatically. \n",
|
||||
"\n",
|
||||
"Use Pydantic to declare your data model. Pydantic's BaseModel like a Python dataclass, but with actual type checking + coercion."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "cba6d8e3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.output_parsers import PydanticOutputParser\n",
|
||||
"from pydantic import BaseModel, Field, validator\n",
|
||||
"from typing import List"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "0a203100",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = 'text-davinci-003'\n",
|
||||
"temperature = 0.0\n",
|
||||
"model = OpenAI(model_name=model_name, temperature=temperature)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "b3f16168",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Joke(setup='Why did the chicken cross the playground?', punchline='To get to the other slide!')"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Define your desired data structure.\n",
|
||||
"class Joke(BaseModel):\n",
|
||||
" setup: str = Field(description=\"question to set up a joke\")\n",
|
||||
" punchline: str = Field(description=\"answer to resolve the joke\")\n",
|
||||
" \n",
|
||||
" # You can add custom validation logic easily with Pydantic.\n",
|
||||
" @validator('setup')\n",
|
||||
" def question_ends_with_question_mark(cls, field):\n",
|
||||
" if field[-1] != '?':\n",
|
||||
" raise ValueError(\"Badly formed question!\")\n",
|
||||
" return field\n",
|
||||
"\n",
|
||||
"# And a query intented to prompt a language model to populate the data structure.\n",
|
||||
"joke_query = \"Tell me a joke.\"\n",
|
||||
"\n",
|
||||
"# Set up a parser + inject instructions into the prompt template.\n",
|
||||
"parser = PydanticOutputParser(pydantic_object=Joke)\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(\n",
|
||||
" template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
|
||||
" input_variables=[\"query\"],\n",
|
||||
" partial_variables={\"format_instructions\": parser.get_format_instructions()}\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"_input = prompt.format_prompt(query=joke_query)\n",
|
||||
"\n",
|
||||
"output = model(_input.to_string())\n",
|
||||
"\n",
|
||||
"parser.parse(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "03049f88",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Actor(name='Tom Hanks', film_names=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Cast Away', 'Toy Story', 'A League of Their Own'])"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Here's another example, but with a compound typed field.\n",
|
||||
"class Actor(BaseModel):\n",
|
||||
" name: str = Field(description=\"name of an actor\")\n",
|
||||
" film_names: List[str] = Field(description=\"list of names of films they starred in\")\n",
|
||||
" \n",
|
||||
"actor_query = \"Generate the filmography for a random actor.\"\n",
|
||||
"\n",
|
||||
"parser = PydanticOutputParser(pydantic_object=Actor)\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(\n",
|
||||
" template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
|
||||
" input_variables=[\"query\"],\n",
|
||||
" partial_variables={\"format_instructions\": parser.get_format_instructions()}\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"_input = prompt.format_prompt(query=actor_query)\n",
|
||||
"\n",
|
||||
"output = model(_input.to_string())\n",
|
||||
"\n",
|
||||
"parser.parse(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "61f67890",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<br>\n",
|
||||
"<br>\n",
|
||||
"<br>\n",
|
||||
"<br>\n",
|
||||
"\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "91871002",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Structured Output Parser\n",
|
||||
"\n",
|
||||
"While the Pydantic/JSON parser is more powerful, we initially experimented data structures having text fields only."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "b492997a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "09473dce",
|
||||
@ -59,7 +198,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 7,
|
||||
"id": "432ac44a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -81,7 +220,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 8,
|
||||
"id": "593cfc25",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -104,7 +243,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 9,
|
||||
"id": "106f1ba6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -114,7 +253,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 10,
|
||||
"id": "86d9d24f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -125,7 +264,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 11,
|
||||
"id": "956bdc99",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -135,7 +274,7 @@
|
||||
"{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -154,7 +293,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 12,
|
||||
"id": "8f483d7d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -164,7 +303,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 13,
|
||||
"id": "f761cbf1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -180,7 +319,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 14,
|
||||
"id": "edd73ae3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -191,7 +330,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 15,
|
||||
"id": "a3c8b91e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -201,7 +340,7 @@
|
||||
"{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -217,12 +356,12 @@
|
||||
"source": [
|
||||
"## CommaSeparatedListOutputParser\n",
|
||||
"\n",
|
||||
"This output parser can be used to get a list of items as output."
|
||||
"Here's another parser strictly less powerful than Pydantic/JSON parsing."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 16,
|
||||
"id": "872246d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -232,7 +371,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 17,
|
||||
"id": "c3f9aee6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -242,7 +381,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 18,
|
||||
"id": "e77871b7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -257,7 +396,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 19,
|
||||
"id": "a71cb5d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -267,7 +406,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 20,
|
||||
"id": "783d7d98",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -278,7 +417,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 21,
|
||||
"id": "fcb81344",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -292,7 +431,7 @@
|
||||
" 'Cookies and Cream']"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -300,14 +439,6 @@
|
||||
"source": [
|
||||
"output_parser.parse(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cba6d8e3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -326,7 +457,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.9.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -3,6 +3,7 @@ from langchain.output_parsers.list import (
|
||||
CommaSeparatedListOutputParser,
|
||||
ListOutputParser,
|
||||
)
|
||||
from langchain.output_parsers.pydantic import PydanticOutputParser
|
||||
from langchain.output_parsers.rail_parser import GuardrailsOutputParser
|
||||
from langchain.output_parsers.regex import RegexParser
|
||||
from langchain.output_parsers.regex_dict import RegexDictParser
|
||||
@ -17,4 +18,5 @@ __all__ = [
|
||||
"StructuredOutputParser",
|
||||
"ResponseSchema",
|
||||
"GuardrailsOutputParser",
|
||||
"PydanticOutputParser",
|
||||
]
|
||||
|
@ -7,3 +7,10 @@ STRUCTURED_FORMAT_INSTRUCTIONS = """The output should be a markdown code snippet
|
||||
{format}
|
||||
}}
|
||||
```"""
|
||||
|
||||
PYDANTIC_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below. For example, the object {{"foo": ["bar", "baz"]}} conforms to the schema {{"foo": {{"description": "a list of strings field", "type": "string"}}}}.
|
||||
|
||||
Here is the output schema:
|
||||
```
|
||||
{schema}
|
||||
```"""
|
||||
|
40
langchain/output_parsers/pydantic.py
Normal file
40
langchain/output_parsers/pydantic.py
Normal file
@ -0,0 +1,40 @@
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, ValidationError
|
||||
|
||||
from langchain.output_parsers.base import BaseOutputParser
|
||||
from langchain.output_parsers.format_instructions import PYDANTIC_FORMAT_INSTRUCTIONS
|
||||
|
||||
|
||||
class PydanticOutputParser(BaseOutputParser):
|
||||
pydantic_object: Any
|
||||
|
||||
def parse(self, text: str) -> BaseModel:
|
||||
try:
|
||||
# Greedy search for 1st json candidate.
|
||||
match = re.search("\{.*\}", text.strip())
|
||||
json_str = ""
|
||||
if match:
|
||||
json_str = match.group()
|
||||
json_object = json.loads(json_str)
|
||||
return self.pydantic_object.parse_obj(json_object)
|
||||
|
||||
except (json.JSONDecodeError, ValidationError) as e:
|
||||
name = self.pydantic_object.__name__
|
||||
msg = f"Failed to parse {name} from completion {text}. Got: {e}"
|
||||
raise ValueError(msg)
|
||||
|
||||
def get_format_instructions(self) -> str:
|
||||
schema = self.pydantic_object.schema()
|
||||
|
||||
# Remove extraneous fields.
|
||||
reduced_schema = {
|
||||
prop: {"description": data["description"], "type": data["type"]}
|
||||
for prop, data in schema["properties"].items()
|
||||
}
|
||||
# Ensure json in context is well-formed with double quotes.
|
||||
schema = json.dumps(reduced_schema)
|
||||
|
||||
return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema)
|
Loading…
Reference in New Issue
Block a user