Improve markdown list parser (#15295)

- Do not treat a `-` in the middle of a sentence as a list item; only match `-` or `*` markers at the start of a line (optionally indented)

<!-- Thank you for contributing to LangChain!

Please title your PR "<package>: <description>", where <package> is
whichever of langchain, community, core, experimental, etc. is being
modified.

Replace this entire comment with:
  - **Description:** a description of the change,
  - **Issue:** the issue # it fixes if applicable,
  - **Dependencies:** any dependencies required for this change,
  - **Twitter handle:** we announce bigger features on Twitter. If your PR
    gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
pull/15110/head^2
Nuno Campos 6 months ago committed by GitHub
parent 50e99ec601
commit ec090745a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -154,18 +154,18 @@ class NumberedListOutputParser(ListOutputParser):
class MarkdownListOutputParser(ListOutputParser):
"""Parse a markdown list."""
pattern = r"-\s([^\n]+)"
pattern = r"^\s*[-*]\s([^\n]+)$"
def get_format_instructions(self) -> str:
return "Your response should be a markdown list, " "eg: `- foo\n- bar\n- baz`"
def parse(self, text: str) -> List[str]:
"""Parse the output of an LLM call."""
return re.findall(self.pattern, text)
return re.findall(self.pattern, text, re.MULTILINE)
def parse_iter(self, text: str) -> Iterator[re.Match]:
"""Parse the output of an LLM call."""
return re.finditer(self.pattern, text)
return re.finditer(self.pattern, text, re.MULTILINE)
@property
def _type(self) -> str:

@ -147,6 +147,7 @@ TEST_CASES = [
NO_TICKS_WHITE_SPACE,
TEXT_BEFORE,
TEXT_AFTER,
TEXT_BEFORE_AND_AFTER,
]

@ -51,7 +51,7 @@ def test_numbered_list() -> None:
"For example: \n\n1. foo\n\n2. bar\n\n3. baz"
)
text2 = "Items:\n\n1. apple\n\n2. banana\n\n3. cherry"
text2 = "Items:\n\n1. apple\n\n 2. banana\n\n3. cherry"
text3 = "No items in the list."
@ -82,11 +82,11 @@ def test_numbered_list() -> None:
def test_markdown_list() -> None:
parser = MarkdownListOutputParser()
text1 = (
"Your response should be a numbered list with each item on a new line."
"Your response should be a numbered - not a list item - list with each item on a new line." # noqa: E501
"For example: \n- foo\n- bar\n- baz"
)
text2 = "Items:\n- apple\n- banana\n- cherry"
text2 = "Items:\n- apple\n - banana\n- cherry"
text3 = "No items in the list."

Loading…
Cancel
Save