Add XMLOutputParser (#10051)

**Description** Adds new output parser, this time enabling the output of LLM to be of an XML format. Seems to be particularly useful together with Claude model. Addresses [issue 9820](https://github.com/langchain-ai/langchain/issues/9820). **Twitter handle** @deepsense_ai @matt_wosinski
1 year ago · 720f6dbaac
parent d6df288380
commit 720f6dbaac
5 changed files with 465 additions and 0 deletions
--- a/docs/extras/modules/model_io/output_parsers/xml.ipynb
+++ b/docs/extras/modules/model_io/output_parsers/xml.ipynb
@ -0,0 +1,358 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "181b5b6d",
+   "metadata": {},
+   "source": [
+    "# XML parser\n",
+    "This output parser allows users to obtain results from an LLM in the popular XML format. \n",
+    "\n",
+    "Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate well-formed XML. \n",
+    "\n",
+    "In the following example we use the Claude model (https://docs.anthropic.com/claude/docs), which works really well with XML tags."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3b10fc55",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain.llms import Anthropic\n",
+    "from langchain.output_parsers import XMLOutputParser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "909161d1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/mateusz/Documents/Projects/langchain/libs/langchain/langchain/llms/anthropic.py:170: UserWarning: This Anthropic LLM is deprecated. Please use `from langchain.chat_models import ChatAnthropic` instead\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = Anthropic(model=\"claude-2\", max_tokens_to_sample=512, temperature=0.1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da312f86-0d2a-4aef-a09d-1e72bd0ea9b1",
+   "metadata": {},
+   "source": [
+    "Let's start with a simple request to the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b03785af-69fc-40a1-a1be-c04ed6fade70",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " Here is the shortened filmography for Tom Hanks enclosed in <movie> tags:\n",
+      "\n",
+      "<movie>Splash (1984)</movie>\n",
+      "<movie>Big (1988)</movie> \n",
+      "<movie>A League of Their Own (1992)</movie>\n",
+      "<movie>Sleepless in Seattle (1993)</movie>  \n",
+      "<movie>Forrest Gump (1994)</movie>\n",
+      "<movie>Apollo 13 (1995)</movie>\n",
+      "<movie>Toy Story (1995)</movie>\n",
+      "<movie>Saving Private Ryan (1998)</movie>\n",
+      "<movie>Cast Away (2000)</movie>\n",
+      "<movie>The Da Vinci Code (2006)</movie>\n",
+      "<movie>Toy Story 3 (2010)</movie>\n",
+      "<movie>Captain Phillips (2013)</movie>\n",
+      "<movie>Bridge of Spies (2015)</movie>\n",
+      "<movie>Toy Story 4 (2019)</movie>\n"
+     ]
+    }
+   ],
+   "source": [
+    "actor_query = \"Generate the shortened filmography for Tom Hanks.\"\n",
+    "output = model(\n",
+    "    f\"\"\"\n",
+    "\n",
+    "Human:\n",
+    "{actor_query}\n",
+    "Please enclose the movies in <movie></movie> tags\n",
+    "Assistant:\n",
+    "\"\"\"\n",
+    ")\n",
+    "print(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4db65781-3d54-4ba6-ae26-5b4ead47a4c8",
+   "metadata": {},
+   "source": [
+    "Now we will use the XMLOutputParser in order to get the structured output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "87ba8d11",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "<filmography>\n",
+      "  <movie>\n",
+      "    <title>Splash</title>\n",
+      "    <year>1984</year>\n",
+      "  </movie>\n",
+      "  \n",
+      "  <movie>\n",
+      "    <title>Big</title>  \n",
+      "    <year>1988</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>A League of Their Own</title>\n",
+      "    <year>1992</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Sleepless in Seattle</title>\n",
+      "    <year>1993</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Forrest Gump</title>\n",
+      "    <year>1994</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Toy Story</title>\n",
+      "    <year>1995</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Apollo 13</title>\n",
+      "    <year>1995</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Saving Private Ryan</title>\n",
+      "    <year>1998</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Cast Away</title>\n",
+      "    <year>2000</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Catch Me If You Can</title>\n",
+      "    <year>2002</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>The Polar Express</title>\n",
+      "    <year>2004</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Charlie Wilson's War</title>\n",
+      "    <year>2007</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Toy Story 3</title>\n",
+      "    <year>2010</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Captain Phillips</title>\n",
+      "    <year>2013</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>Bridge of Spies</title>\n",
+      "    <year>2015</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>The Post</title>\n",
+      "    <year>2017</year>\n",
+      "  </movie>\n",
+      "\n",
+      "  <movie>\n",
+      "    <title>A Beautiful Day in the Neighborhood</title> \n",
+      "    <year>2019</year>\n",
+      "  </movie>\n",
+      "</filmography>\n"
+     ]
+    }
+   ],
+   "source": [
+    "parser = XMLOutputParser()\n",
+    "\n",
+    "prompt = PromptTemplate(\n",
+    "    template=\"\"\"\n",
+    "    \n",
+    "    Human:\n",
+    "    {query}\n",
+    "    {format_instructions}\n",
+    "    Assistant:\"\"\",\n",
+    "    input_variables=[\"query\"],\n",
+    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "_input = prompt.format_prompt(query=actor_query)\n",
+    "\n",
+    "output = model(_input.to_string())\n",
+    "print(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1c4c47ee",
+   "metadata": {},
+   "source": [
+    "And here is the parsed output:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "4c864dc9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'filmography': [{'movie': [{'title': 'Splash'}, {'year': '1984'}]},\n",
+       "  {'movie': [{'title': 'Big'}, {'year': '1988'}]},\n",
+       "  {'movie': [{'title': 'A League of Their Own'}, {'year': '1992'}]},\n",
+       "  {'movie': [{'title': 'Sleepless in Seattle'}, {'year': '1993'}]},\n",
+       "  {'movie': [{'title': 'Forrest Gump'}, {'year': '1994'}]},\n",
+       "  {'movie': [{'title': 'Toy Story'}, {'year': '1995'}]},\n",
+       "  {'movie': [{'title': 'Apollo 13'}, {'year': '1995'}]},\n",
+       "  {'movie': [{'title': 'Saving Private Ryan'}, {'year': '1998'}]},\n",
+       "  {'movie': [{'title': 'Cast Away'}, {'year': '2000'}]},\n",
+       "  {'movie': [{'title': 'Catch Me If You Can'}, {'year': '2002'}]},\n",
+       "  {'movie': [{'title': 'The Polar Express'}, {'year': '2004'}]},\n",
+       "  {'movie': [{'title': \"Charlie Wilson's War\"}, {'year': '2007'}]},\n",
+       "  {'movie': [{'title': 'Toy Story 3'}, {'year': '2010'}]},\n",
+       "  {'movie': [{'title': 'Captain Phillips'}, {'year': '2013'}]},\n",
+       "  {'movie': [{'title': 'Bridge of Spies'}, {'year': '2015'}]},\n",
+       "  {'movie': [{'title': 'The Post'}, {'year': '2017'}]},\n",
+       "  {'movie': [{'title': 'A Beautiful Day in the Neighborhood'},\n",
+       "    {'year': '2019'}]}]}"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "parser.parse(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "327f5479-77e0-4549-8393-2cd7a286d491",
+   "metadata": {},
+   "source": [
+    "Finally, let's add some tags to tailor the output to our needs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b722a235",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'movies': [{'actor': [{'name': 'Tom Hanks'},\n",
+       "    {'film': [{'name': 'Splash'}, {'genre': 'Comedy'}]},\n",
+       "    {'film': [{'name': 'Big'}, {'genre': 'Comedy'}]},\n",
+       "    {'film': [{'name': 'A League of Their Own'}, {'genre': 'Drama'}]},\n",
+       "    {'film': [{'name': 'Sleepless in Seattle'}, {'genre': 'Romance'}]},\n",
+       "    {'film': [{'name': 'Forrest Gump'}, {'genre': 'Drama'}]},\n",
+       "    {'film': [{'name': 'Toy Story'}, {'genre': 'Animation'}]},\n",
+       "    {'film': [{'name': 'Apollo 13'}, {'genre': 'Drama'}]},\n",
+       "    {'film': [{'name': 'Saving Private Ryan'}, {'genre': 'War'}]},\n",
+       "    {'film': [{'name': 'Cast Away'}, {'genre': 'Adventure'}]},\n",
+       "    {'film': [{'name': 'The Green Mile'}, {'genre': 'Drama'}]}]}]}"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "parser = XMLOutputParser(tags=[\"movies\", \"actor\", \"film\", \"name\", \"genre\"])\n",
+    "prompt = PromptTemplate(\n",
+    "    template=\"\"\"\n",
+    "    \n",
+    "    Human:\n",
+    "    {query}\n",
+    "    {format_instructions}\n",
+    "    Assistant:\"\"\",\n",
+    "    input_variables=[\"query\"],\n",
+    "    partial_variables={\"format_instructions\": parser.get_format_instructions()},\n",
+    ")\n",
+    "\n",
+    "\n",
+    "_input = prompt.format_prompt(query=actor_query)\n",
+    "\n",
+    "output = model(_input.to_string())\n",
+    "\n",
+    "parser.parse(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "808a5df5-b11e-42a0-bd7a-6b95ca0c3eba",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/libs/langchain/langchain/output_parsers/init.py
+++ b/libs/langchain/langchain/output_parsers/init.py
@ -28,6 +28,7 @@ from langchain.output_parsers.regex import RegexParser
 from langchain.output_parsers.regex_dict import RegexDictParser
 from langchain.output_parsers.retry import RetryOutputParser, RetryWithErrorOutputParser
 from langchain.output_parsers.structured import ResponseSchema, StructuredOutputParser
+from langchain.output_parsers.xml import XMLOutputParser

 __all__ = [
    "BooleanOutputParser",
@ -46,4 +47,5 @@ __all__ = [
    "RetryOutputParser",
    "RetryWithErrorOutputParser",
    "StructuredOutputParser",
+    "XMLOutputParser",
 ]
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@ -25,3 +25,19 @@ Here is the output schema:
 ```
 {schema}
 ```"""
+
+
+# Prompt template used by XMLOutputParser.get_format_instructions(); the
+# ``{tags}`` placeholder is filled with the parser's ``tags`` value.
+XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
+1. Output should conform to the tags below. 
+2. If tags are not given, make them on your own.
+3. Remember to always open and close all the tags.
+
+As an example, for the tags ["foo", "bar", "baz"]:
+1. String "<foo>\n   <bar>\n      <baz></baz>\n   </bar>\n</foo>" is a well-formatted instance of the schema. 
+2. String "<foo>\n   <bar>\n   </foo>" is a badly-formatted instance.
+3. String "<foo>\n   <tag>\n   </tag>\n</foo>" is a badly-formatted instance.
+
+Here are the output tags:
+```
+{tags}
+```"""
--- a/libs/langchain/langchain/output_parsers/xml.py
+++ b/libs/langchain/langchain/output_parsers/xml.py
@ -0,0 +1,45 @@
+import re
+import xml.etree.ElementTree as ET
+from typing import Any, Dict, List, Optional
+
+from langchain.output_parsers.format_instructions import XML_FORMAT_INSTRUCTIONS
+from langchain.schema import BaseOutputParser
+
+
+class XMLOutputParser(BaseOutputParser):
+    """Parse an output using xml format."""
+
+    tags: Optional[List[str]] = None
+    encoding_matcher: re.Pattern = re.compile(
+        r"<([^>]*encoding[^>]*)>\n(.*)", re.MULTILINE | re.DOTALL
+    )
+
+    def get_format_instructions(self) -> str:
+        return XML_FORMAT_INSTRUCTIONS.format(tags=self.tags)
+
+    def parse(self, text: str) -> Dict[str, List[Any]]:
+        text = text.strip("`").strip("xml")
+        encoding_match = self.encoding_matcher.search(text)
+        if encoding_match:
+            text = encoding_match.group(2)
+        if (text.startswith("<") or text.startswith("\n<")) and (
+            text.endswith(">") or text.endswith(">\n")
+        ):
+            root = ET.fromstring(text)
+            return self._root_to_dict(root)
+        else:
+            raise ValueError(f"Could not parse output: {text}")
+
+    def _root_to_dict(self, root: ET.Element) -> Dict[str, List[Any]]:
+        """Converts xml tree to python dictionary."""
+        result: Dict[str, List[Any]] = {root.tag: []}
+        for child in root:
+            if len(child) == 0:
+                result[root.tag].append({child.tag: child.text})
+            else:
+                result[root.tag].append(self._root_to_dict(child))
+        return result
+
+    @property
+    def _type(self) -> str:
+        return "xml"
--- a/libs/langchain/tests/unit_tests/output_parsers/test_xml_parser.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_xml_parser.py
@ -0,0 +1,44 @@
+"""Test XMLOutputParser"""
+import pytest
+
+from langchain.output_parsers.xml import XMLOutputParser
+
+# Sample document whose first line is an XML declaration naming an encoding.
+DEF_RESULT_ENCODING = """<?xml version="1.0" encoding="UTF-8"?>
+<foo>
+    <bar>
+        <baz></baz>
+        <baz>slim.shady</baz>
+    </bar>
+    <baz>tag</baz>
+</foo>"""
+
+# Structure the parser is expected to return for the sample document; the
+# empty <baz></baz> element is expected to map to None.
+DEF_RESULT_EXPECTED = {
+    "foo": [
+        {"bar": [{"baz": None}, {"baz": "slim.shady"}]},
+        {"baz": "tag"},
+    ],
+}
+
+
+@pytest.mark.parametrize(
+    "result",
+    [DEF_RESULT_ENCODING, DEF_RESULT_ENCODING[DEF_RESULT_ENCODING.find("\n") :]],
+)
+def test_xml_output_parser(result: str) -> None:
+    """Test XMLOutputParser."""
+
+    xml_parser = XMLOutputParser()
+
+    xml_result = xml_parser.parse(result)
+    assert DEF_RESULT_EXPECTED == xml_result
+
+
+@pytest.mark.parametrize("result", ["foo></foo>", "<foo></foo", "foo></foo", "foofoo"])
+def test_xml_output_parser_fail(result: str) -> None:
+    """Test XMLOutputParser where complete output is not in XML format."""
+
+    xml_parser = XMLOutputParser()
+
+    with pytest.raises(ValueError) as e:
+        xml_parser.parse(result)
+    assert "Could not parse output" in str(e)