From 65111eb2b39fa2d8efbe3275e52d13777b52d6a0 Mon Sep 17 00:00:00 2001 From: Yoann Poupart <66315201+Xmaster6y@users.noreply.github.com> Date: Tue, 6 Jun 2023 18:27:37 +0200 Subject: [PATCH] Attribute support for html tags (#5782) # What does this PR do? Change the HTML tags so that a tag with attributes can be found. ## Before submitting - [x] Tests added - [x] CI/CD validated ### Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. --- langchain/text_splitter.py | 52 +++++++++++++------------- tests/unit_tests/test_text_splitter.py | 36 ++++++++++++++++++ 2 files changed, 62 insertions(+), 26 deletions(-) diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 14f01547..1e697e66 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -740,33 +740,33 @@ class RecursiveCharacterTextSplitter(TextSplitter): elif language == Language.HTML: return [ # First, try to split along HTML tags - "", - "
", - "

", - "
", - "

  • ", - "

    ", - "

    ", - "

    ", - "

    ", - "

    ", - "
    ", - "", - "", - "", - "
    ", - "", - "
      ", - "
        ", - "
        ", - "