diff --git a/docs/modules/prompts/examples/output_parsers.ipynb b/docs/modules/prompts/examples/output_parsers.ipynb
index 76b71df7..d8fc9206 100644
--- a/docs/modules/prompts/examples/output_parsers.ipynb
+++ b/docs/modules/prompts/examples/output_parsers.ipynb
@@ -17,30 +17,10 @@
"Below we go over some examples of output parsers."
]
},
- {
- "cell_type": "markdown",
- "id": "91871002",
- "metadata": {},
- "source": [
- "## Structured Output Parser\n",
- "\n",
- "This output parser can be used when you want to return multiple fields."
- ]
- },
{
"cell_type": "code",
"execution_count": 1,
- "id": "b492997a",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "ffb7fc57",
+ "id": "5f0c8a33",
"metadata": {},
"outputs": [],
"source": [
@@ -49,6 +29,165 @@
"from langchain.chat_models import ChatOpenAI"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "a1ae632a",
+ "metadata": {},
+ "source": [
+ "## PydanticOutputParser\n",
+ "This output parser allows users to specify an arbitrary JSON schema and query LLMs for JSON outputs that conform to that schema.\n",
+ "\n",
+    "Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed JSON. In the OpenAI family, DaVinci can do this reliably, but Curie's ability already drops off dramatically. \n",
+ "\n",
+    "Use Pydantic to declare your data model. Pydantic's BaseModel is like a Python dataclass, but with actual type checking + coercion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "cba6d8e3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.output_parsers import PydanticOutputParser\n",
+ "from pydantic import BaseModel, Field, validator\n",
+ "from typing import List"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "0a203100",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_name = 'text-davinci-003'\n",
+ "temperature = 0.0\n",
+ "model = OpenAI(model_name=model_name, temperature=temperature)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "b3f16168",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Joke(setup='Why did the chicken cross the playground?', punchline='To get to the other slide!')"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Define your desired data structure.\n",
+ "class Joke(BaseModel):\n",
+ " setup: str = Field(description=\"question to set up a joke\")\n",
+ " punchline: str = Field(description=\"answer to resolve the joke\")\n",
+ " \n",
+ " # You can add custom validation logic easily with Pydantic.\n",
+ " @validator('setup')\n",
+ " def question_ends_with_question_mark(cls, field):\n",
+ " if field[-1] != '?':\n",
+ " raise ValueError(\"Badly formed question!\")\n",
+ " return field\n",
+ "\n",
+    "# And a query intended to prompt a language model to populate the data structure.\n",
+ "joke_query = \"Tell me a joke.\"\n",
+ "\n",
+ "# Set up a parser + inject instructions into the prompt template.\n",
+ "parser = PydanticOutputParser(pydantic_object=Joke)\n",
+ "\n",
+ "prompt = PromptTemplate(\n",
+ " template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
+ " input_variables=[\"query\"],\n",
+ " partial_variables={\"format_instructions\": parser.get_format_instructions()}\n",
+ ")\n",
+ "\n",
+ "_input = prompt.format_prompt(query=joke_query)\n",
+ "\n",
+ "output = model(_input.to_string())\n",
+ "\n",
+ "parser.parse(output)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "03049f88",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Actor(name='Tom Hanks', film_names=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Cast Away', 'Toy Story', 'A League of Their Own'])"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Here's another example, but with a compound typed field.\n",
+ "class Actor(BaseModel):\n",
+ " name: str = Field(description=\"name of an actor\")\n",
+ " film_names: List[str] = Field(description=\"list of names of films they starred in\")\n",
+ " \n",
+ "actor_query = \"Generate the filmography for a random actor.\"\n",
+ "\n",
+ "parser = PydanticOutputParser(pydantic_object=Actor)\n",
+ "\n",
+ "prompt = PromptTemplate(\n",
+ " template=\"Answer the user query.\\n{format_instructions}\\n{query}\\n\",\n",
+ " input_variables=[\"query\"],\n",
+ " partial_variables={\"format_instructions\": parser.get_format_instructions()}\n",
+ ")\n",
+ "\n",
+ "_input = prompt.format_prompt(query=actor_query)\n",
+ "\n",
+ "output = model(_input.to_string())\n",
+ "\n",
+ "parser.parse(output)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "61f67890",
+ "metadata": {},
+ "source": [
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91871002",
+ "metadata": {},
+ "source": [
+ "## Structured Output Parser\n",
+ "\n",
+    "While the Pydantic/JSON parser is more powerful, we initially experimented with data structures having text fields only."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "b492997a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
+ ]
+ },
{
"cell_type": "markdown",
"id": "09473dce",
@@ -59,7 +198,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 7,
"id": "432ac44a",
"metadata": {},
"outputs": [],
@@ -81,7 +220,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 8,
"id": "593cfc25",
"metadata": {},
"outputs": [],
@@ -104,7 +243,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 9,
"id": "106f1ba6",
"metadata": {},
"outputs": [],
@@ -114,7 +253,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 10,
"id": "86d9d24f",
"metadata": {},
"outputs": [],
@@ -125,7 +264,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 11,
"id": "956bdc99",
"metadata": {},
"outputs": [
@@ -135,7 +274,7 @@
"{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}"
]
},
- "execution_count": 7,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -154,7 +293,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 12,
"id": "8f483d7d",
"metadata": {},
"outputs": [],
@@ -164,7 +303,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 13,
"id": "f761cbf1",
"metadata": {},
"outputs": [],
@@ -180,7 +319,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 14,
"id": "edd73ae3",
"metadata": {},
"outputs": [],
@@ -191,7 +330,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 15,
"id": "a3c8b91e",
"metadata": {},
"outputs": [
@@ -201,7 +340,7 @@
"{'answer': 'Paris', 'source': 'https://en.wikipedia.org/wiki/Paris'}"
]
},
- "execution_count": 11,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -217,12 +356,12 @@
"source": [
"## CommaSeparatedListOutputParser\n",
"\n",
- "This output parser can be used to get a list of items as output."
+ "Here's another parser strictly less powerful than Pydantic/JSON parsing."
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 16,
"id": "872246d7",
"metadata": {},
"outputs": [],
@@ -232,7 +371,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 17,
"id": "c3f9aee6",
"metadata": {},
"outputs": [],
@@ -242,7 +381,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 18,
"id": "e77871b7",
"metadata": {},
"outputs": [],
@@ -257,7 +396,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 19,
"id": "a71cb5d3",
"metadata": {},
"outputs": [],
@@ -267,7 +406,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 20,
"id": "783d7d98",
"metadata": {},
"outputs": [],
@@ -278,7 +417,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 21,
"id": "fcb81344",
"metadata": {},
"outputs": [
@@ -292,7 +431,7 @@
" 'Cookies and Cream']"
]
},
- "execution_count": 17,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -300,14 +439,6 @@
"source": [
"output_parser.parse(output)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cba6d8e3",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -326,7 +457,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.9.0"
}
},
"nbformat": 4,
diff --git a/langchain/output_parsers/__init__.py b/langchain/output_parsers/__init__.py
index 268f62a5..fd2f0f24 100644
--- a/langchain/output_parsers/__init__.py
+++ b/langchain/output_parsers/__init__.py
@@ -3,6 +3,7 @@ from langchain.output_parsers.list import (
CommaSeparatedListOutputParser,
ListOutputParser,
)
+from langchain.output_parsers.pydantic import PydanticOutputParser
from langchain.output_parsers.rail_parser import GuardrailsOutputParser
from langchain.output_parsers.regex import RegexParser
from langchain.output_parsers.regex_dict import RegexDictParser
@@ -17,4 +18,5 @@ __all__ = [
"StructuredOutputParser",
"ResponseSchema",
"GuardrailsOutputParser",
+ "PydanticOutputParser",
]
diff --git a/langchain/output_parsers/format_instructions.py b/langchain/output_parsers/format_instructions.py
index 3653d477..1c6639a9 100644
--- a/langchain/output_parsers/format_instructions.py
+++ b/langchain/output_parsers/format_instructions.py
@@ -7,3 +7,10 @@ STRUCTURED_FORMAT_INSTRUCTIONS = """The output should be a markdown code snippet
{format}
}}
```"""
+
+PYDANTIC_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below. For example, the object {{"foo": ["bar", "baz"]}} conforms to the schema {{"foo": {{"description": "a list of strings field", "type": "string"}}}}.
+
+Here is the output schema:
+```
+{schema}
+```"""
diff --git a/langchain/output_parsers/pydantic.py b/langchain/output_parsers/pydantic.py
new file mode 100644
index 00000000..e441509b
--- /dev/null
+++ b/langchain/output_parsers/pydantic.py
@@ -0,0 +1,40 @@
+import json
+import re
+from typing import Any
+
+from pydantic import BaseModel, ValidationError
+
+from langchain.output_parsers.base import BaseOutputParser
+from langchain.output_parsers.format_instructions import PYDANTIC_FORMAT_INSTRUCTIONS
+
+
+class PydanticOutputParser(BaseOutputParser):
+ pydantic_object: Any
+
+ def parse(self, text: str) -> BaseModel:
+ try:
+ # Greedy search for 1st json candidate.
+        match = re.search(r"\{.*\}", text.strip())
+ json_str = ""
+ if match:
+ json_str = match.group()
+ json_object = json.loads(json_str)
+ return self.pydantic_object.parse_obj(json_object)
+
+ except (json.JSONDecodeError, ValidationError) as e:
+ name = self.pydantic_object.__name__
+ msg = f"Failed to parse {name} from completion {text}. Got: {e}"
+ raise ValueError(msg)
+
+ def get_format_instructions(self) -> str:
+ schema = self.pydantic_object.schema()
+
+ # Remove extraneous fields.
+ reduced_schema = {
+ prop: {"description": data["description"], "type": data["type"]}
+ for prop, data in schema["properties"].items()
+ }
+ # Ensure json in context is well-formed with double quotes.
+ schema = json.dumps(reduced_schema)
+
+ return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema)