feat: add MultiStrategy output parser

- A type of parser where many strategies can be tried before exception - A strategy is a tuple like class of (parser, predicate, name=None) - Strategies are tried if predicate is True - Strategies are tried in order, allows for fallbacks - Base interface allows existing parsers to use multiple strategies - New strategies can be added for new output errors and covered by tests
2023-06-21 18:22:35 +02:00 · 2023-06-21 18:22:35 +02:00 · 55e0e2d6ac
commit 55e0e2d6ac
parent a9108c1809
10 changed files with 376 additions and 3 deletions
--- a/langchain/output_parsers/json.py
+++ b/langchain/output_parsers/json.py
@ -2,14 +2,22 @@ from __future__ import annotations
 import json
 import re
-from typing import List
+from typing import List, Optional
 from langchain.schema import OutputParserException
 REGEXES = {
    "json_markdown": r"```(json)?(.*?)```",
    # must use greedy matching to match the outermost code block
    "nested_json_md_code_block": r"```(json)?(.*)```",
 }
-def parse_json_markdown(json_string: str) -> dict:
+
 def parse_json_markdown(json_string: str, regex: Optional[str] = None) -> dict:
    # Try to find JSON string within triple backticks
-    match = re.search(r"```(json)?(.*?)```", json_string, re.DOTALL)
+    if regex is None:
        regex = REGEXES["json_markdown"]
    match = re.search(regex, json_string, re.DOTALL)
    # If no match found, assume the entire string is a JSON string
    if match is None:
@ -27,6 +35,56 @@ def parse_json_markdown(json_string: str) -> dict:
    return parsed
 def fix_code_in_json(text: str) -> str:
    """Fixes nested code block in json markdown"""
    # Extract the code block and replace it with a placeholder
    pattern = r"```([^`]*?)```"
    match = re.search(pattern, text)
    if match:
        code_block = match.group(1)
        text = re.sub(pattern, "CODE_BLOCK_PLACEHOLDER", text, count=1)
        # Escape the special characters in the code block
        escaped_code_block = (
            code_block.replace("\n", "\\n").replace("\t", "\\t").replace('"', '\\"')
        )
        # Add backtick pairs to escaped code block
        escaped_code_block = "[BEGIN_CODE]" + escaped_code_block + "[END_CODE]"
        # Replace the placeholder in the original text with the escaped code block
        text = text.replace("CODE_BLOCK_PLACEHOLDER", escaped_code_block)
    return text
 def fix_json_with_embedded_code_block(text: str, max_loop: int = 20) -> dict:
    """Try to fix json with embedded code block.
    Args:
        text: JSON string with embedded code block
        max_loop: Maximum number of loops to try fixing the JSON string
    """
    loop = 0
    while True:
        if loop > max_loop:
            raise ValueError("Max loop reached")
        try:
            text = fix_code_in_json(text)
            json.loads(text)
            break
        except json.JSONDecodeError as e:
            if text[e.pos] == "\n":
                text = text[: e.pos] + "\\n" + text[e.pos + 1 :]
                text = text.replace("[BEGIN_CODE]", "```")
            else:
                raise
        finally:
            loop += 1
    final_text = text.replace("[END_CODE]", "```")
    return json.loads(final_text)
 def parse_and_check_json_markdown(text: str, expected_keys: List[str]) -> dict:
    try:
        json_obj = parse_json_markdown(text)
--- a/langchain/output_parsers/multi_strategy/init.py
+++ b/langchain/output_parsers/multi_strategy/init.py
--- a/langchain/output_parsers/multi_strategy/agent.py
+++ b/langchain/output_parsers/multi_strategy/agent.py
@ -0,0 +1,38 @@
 """Multi strategy parser that implements AgentOutputParser."""
 from typing import Any, Sequence, Union
 from langchain.agents.agent import AgentOutputParser
 from langchain.agents.conversational_chat.prompt import FORMAT_INSTRUCTIONS
 from langchain.output_parsers.multi_strategy import strategies
 from langchain.output_parsers.multi_strategy.base import (
    MultiStrategyParser,
    ParseStrategy,
 )
 from langchain.schema import (
    AgentAction,
    AgentFinish,
 )
 U = Union[AgentAction, AgentFinish]
 TReactAgentOutput = U
 class ConvMultiStrategyParser(MultiStrategyParser[U, dict], AgentOutputParser):
    """Multi strategy parser that implements AgentOutputParser."""
    def get_format_instructions(self) -> str:
        return FORMAT_INSTRUCTIONS
    def __init__(self, strategies: Sequence[ParseStrategy[dict]],
                 **kwargs: dict) -> None:
        super().__init__(strategies=strategies, **kwargs)
    def final_parse(self, text: str, parsed: dict) -> U:
        action, action_input = parsed["action"], parsed["action_input"]
        if action == "Final Answer":
            return AgentFinish({"output": action_input}, text)
        else:
            return AgentAction(action, action_input, text)
 default_parser = ConvMultiStrategyParser(strategies.json_react_strategies)
--- a/langchain/output_parsers/multi_strategy/base.py
+++ b/langchain/output_parsers/multi_strategy/base.py
@ -0,0 +1,117 @@
 """Multi strategy output parser."""
 import logging
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Generic, Iterator, Sequence, TypeVar, Union, Optional
 from langchain.schema import (
    BaseOutputParser,
    OutputParserException,
 )
 log = logging.getLogger(__name__)
 T = TypeVar("T")
 S = TypeVar("S")
 TPredicate = Callable[[str], bool]
 TParser = Callable[[str], S]
 class ParseStrategy(Generic[S]):
    """A strategy is a pair of (parser, predicate).
    This class behave like a tuple for easy definition of multiple strategies.
    """
    def __init__(
        self, parser: TParser[S], predicate: TPredicate, name: Optional[str] = None
    ):
        assert callable(parser), "first argument <parser> must be callable"
        self.parser = parser
        assert callable(predicate), "second argument <predicate> must be callable"
        self.predicate = predicate
        self.name = name
    def __repr__(self) -> str:
        if self.name is None:
            return f"ParseStrategy(parser={self.parser}," "predicate={self.predicate})"
        return (
            f"ParseStrategy[{self.name}](parser={self.parser},"
            "predicate={self.predicate})"
        )
    def __getitem__(self, index: int) -> Union[TParser[S], TPredicate]:
        """Behaves like a tuple."""
        if index == 0:
            return self.parser
        elif index == 1:
            return self.predicate
        else:
            raise IndexError("tuple index out of range")
    def __iter__(self) -> Iterator[Any]:
        """Implement tuple unpacking."""
        yield self.parser
        yield self.predicate
 class MultiStrategyParser(BaseOutputParser[T], ABC, Generic[T, S]):
    """Try multiple strategies to parse the output.
    A strategy is a tuple of (parser, predicate). The parser takes the some
    text as input and returns some type S. The parser is only called if the
    predicate returns True.
    When the `parse` method is called, all registered strategies are tried
    in order and the first one that succeeds returns its result.
    The returned value of type `S` is then passed to the final_parse method to
    produce the final result compatible with the inhertited output parser
    interface.
    Appending a strategy to the end makes it a fallback strategy.
    """
    class Config:
        arbitrary_types_allowed = True
    strategies: Sequence[ParseStrategy[S]]
    """List of strategies to try. The first one that succeeds is returned."""
    def add_strategy(self, *strategy: ParseStrategy[S]) -> None:
        """Register a new strategy.
        A strategy is a callbale that takes in text as `str` and returns
        some type `S`.
        """
        self.strategies = [*self.strategies, *strategy]
    @abstractmethod
    def final_parse(self, text: str, parsed: S) -> T:
        """Parse the output of a strategy."""
    def parse(self, text: str) -> T:
        """Try the registered strategies in order.
        Returns the output of the first succeeding strategy."""
        if len(self.strategies) == 0:
            raise OutputParserException("No strategy available")
        for strategy, predicate in self.strategies:
            log.debug(f"trying strategy {strategy}")
            if not predicate(text):
                log.debug(f"Skipping strategy {strategy}")
            if predicate(text):
                try:
                    parsed = strategy(text)
                    result = self.final_parse(text, parsed)
                    log.debug(f"Strategy {strategy} succeeded")
                    return result
                except Exception:
                    continue
        raise OutputParserException(f"Could not parse output: {text}")
    @property
    def _type(self) -> str:
        return "multi_strategy"
--- a/langchain/output_parsers/multi_strategy/strategies.py
+++ b/langchain/output_parsers/multi_strategy/strategies.py
@ -0,0 +1,49 @@
 """Strategies used with MultiStrategyParser parsers."""
 import json
 from langchain.output_parsers.json import (
    REGEXES,
    fix_json_with_embedded_code_block,
    parse_json_markdown,
 )
 from langchain.output_parsers.multi_strategy.base import ParseStrategy
 def is_bare_json(text: str) -> dict:
    """Tries to load as bare json"""
    return json.loads(text.strip())
 def json_markdown(text: str) -> dict:
    """Extract a json object from markdown markup"""
    return parse_json_markdown(text)
 def json_nested_md_code_block(text: str) -> dict:
    """Extract the outermost code block. Can accomodate nested code blocks."""
    return parse_json_markdown(text, regex=REGEXES["nested_json_md_code_block"])
 def fallback(text: str) -> dict:
    """Example fallback strategy."""
    return {"action": "Final Answer", "action_input": text}
 # The order of the strategies is important
 # They are tried in order and the first one that matches is used
 json_react_strategies = (
    ParseStrategy(is_bare_json, lambda text: text.startswith("{"), name="bare_json"),
    ParseStrategy(json_markdown, lambda text: text.find("```") != -1),
    ParseStrategy(
        json_nested_md_code_block,
        lambda text: text.find("```") != -1,
        name="nested_code_block",
    ),
    ParseStrategy(
        fix_json_with_embedded_code_block,
        lambda text: text.find("```") != -1,
        name="fix_embedded_code_block",
    ),
    # this is where a fallback would go
    # ParseStrategy(fallback, lambda _: True),
 )
--- a/tests/unit_tests/data/llm_outputs/bare_json
+++ b/tests/unit_tests/data/llm_outputs/bare_json
@ -0,0 +1,4 @@
 {
    "action": "Final Answer",
    "action_input": "To implement a Singleton class in Python, you can define a class with a private constructor, a class variable to store the instance and a static method to get the instance. Here's an example:\n\n```python\nclass Singleton:\n    __instance = None\n\n    def __init__(self):\n        if Singleton.__instance != None:\n            raise Exception('You cannot create more than one instance of Singleton class.')\n        else:\n            Singleton.__instance = self\n\n    @staticmethod \n    def getInstance():\n        if Singleton.__instance == None:\n            Singleton()\n        return Singleton.__instance\n```"
 }
--- a/tests/unit_tests/data/llm_outputs/bare_json_embed_code_block
+++ b/tests/unit_tests/data/llm_outputs/bare_json_embed_code_block
@ -0,0 +1,25 @@
 {
    "action": "Final Answer",
    "action_input": "Sure, here is a simple pseudo code representation of the proof of work algorithm:
 ```
 function proofOfWork(block, difficulty):
    target = "0" * difficulty
    nonce = 0
    while True:
        hash = calculateHash(block, nonce)
        if hash.startswith(target):
            return nonce
        nonce += 1
 block = getBlockData()
 difficulty = getDifficulty()
 nonce = proofOfWork(block, difficulty)
 ```
 In this pseudo code, the `proofOfWork` function takes a `block` and a `difficulty` as input. It initializes a `target` string with the desired number of leading zeros based on the difficulty. The function then starts a loop and calculates the hash of the `block` with an incremented `nonce` value. If the hash starts with the required number of zeros, the function returns the `nonce`. Otherwise, it increments the `nonce` and continues the loop until a valid solution is found.
 To use the proof of work algorithm, you would need to provide the `block` data and the desired `difficulty` level. The algorithm will return the `nonce` value that satisfies the proof of work requirements.
 Please note that this is a simplified representation of the algorithm and actual implementations may have additional complexities and optimizations."
 }
--- a/tests/unit_tests/data/llm_outputs/ignored_format_instructions
+++ b/tests/unit_tests/data/llm_outputs/ignored_format_instructions
@ -0,0 +1,16 @@
 Here is an example implementation of a singleton class in Python:
 ```python
 class Singleton:
    _instance = None
    def __new__(cls):
        if cls._instance is None:
            print("Creating new instance")
            cls._instance = super().__new__(cls)
        else:
            print("Using existing instance")
        return cls._instance
 ```
 In this implementation, the `_instance` variable keeps track of whether an instance of the class has already been created. The `__new__` method is called when an instance of the class is requested. If an instance has already been created, it returns that instance. Otherwise, it creates a new instance and returns that.
--- a/tests/unit_tests/data/llm_outputs/json_nested_code_block
+++ b/tests/unit_tests/data/llm_outputs/json_nested_code_block
@ -0,0 +1,10 @@
 I apologize for the previous incomplete response. Here's the response in the required format:
 ```json
 {
    "action": "Final Answer",
    "action_input": "To implement a singleton class in Python, you can use a decorator or a metaclass. Here's an example of using a decorator:\n\n```python\nfrom functools import wraps\n\ndef singleton(cls):\n    instances = {}\n\n    @wraps(cls)\n    def get_instance(*args, **kwargs):\n        if cls not in instances:\n            instances[cls] = cls(*args, **kwargs)\n        return instances[cls]\n\n    return get_instance\n\n@singleton\nclass MyClass:\n    pass\n```"
 }
 ```
 I hope this helps! Let me know if you have any other questions.
--- a/tests/unit_tests/output_parsers/test_multi_strategy.py
+++ b/tests/unit_tests/output_parsers/test_multi_strategy.py
@ -0,0 +1,56 @@
 from typing import List, Tuple, Any
 from pathlib import Path
 import pytest
 from langchain.output_parsers.multi_strategy.base import MultiStrategyParser
 from langchain.output_parsers.multi_strategy.agent import ConvMultiStrategyParser
 from langchain.output_parsers.multi_strategy import strategies
 # How the test works:
 # it loads all llm output files from the ../data/llm_outputs directory
 # For each file it tries a MultiStrategyParser with the strategies to test.
 def prepare_outputs() -> List[Tuple[str, str]]:
    outputs = []
    for path in (Path(__file__).parent.parent / "data/llm_outputs/").glob("*"):
        with open(str(path), "r") as f:
            outputs.append((f.read(), path.name))
    return outputs
 llm_outputs = prepare_outputs()
@pytest.mark.parametrize("output, name", llm_outputs, ids=[x[1] for x in llm_outputs])
 def test_json_react_strategies(
    output: str, name: str, parser: MultiStrategyParser[Any, Any]
 ) -> None:
    # the ignored test is for the fallback strategy
    if name != "ignored_format_instructions":
        _test_json_react_strategy(output, name, parser)
 def _test_json_react_strategy(
    output: str, name: str, parser: MultiStrategyParser[Any, Any]
 ) -> None:
    try:
        parser.parse(output)
    except Exception as e:
        pytest.fail(f"Error parsing output entry: {name}.")
 def test_fix_json_with_embedded_code_block() -> None:
    path = Path(__file__).parent.parent / "data/llm_outputs/bare_json_embed_code_block"
    with open(str(path), "r") as f:
        output = f.read()
    res = strategies.fix_json_with_embedded_code_block(output)
    assert type(res) == dict
    with pytest.raises(Exception):
        res = strategies.fix_json_with_embedded_code_block(output, max_loop=1)
@pytest.fixture(name="parser")
 def conv_multi_strategy_parser() -> Any:
    return ConvMultiStrategyParser(strategies.json_react_strategies)