Attribute support for html tags (#5782)

# What does this PR do?

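Change the HTML separators from complete opening tags (e.g. `<div>`) to tag prefixes (e.g. `<div`) so that tags carrying attributes, such as `<div class="amazing">`, are also matched as split points.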

## Before submitting

- [x] Tests added
- [x] CI/CD validated

### Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.
searx_updates
Yoann Poupart 1 year ago committed by GitHub
parent 0cfaa76e45
commit 65111eb2b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -740,33 +740,33 @@ class RecursiveCharacterTextSplitter(TextSplitter):
elif language == Language.HTML: elif language == Language.HTML:
return [ return [
# First, try to split along HTML tags # First, try to split along HTML tags
"<body>", "<body",
"<div>", "<div",
"<p>", "<p",
"<br>", "<br",
"<li>", "<li",
"<h1>", "<h1",
"<h2>", "<h2",
"<h3>", "<h3",
"<h4>", "<h4",
"<h5>", "<h5",
"<h6>", "<h6",
"<span>", "<span",
"<table>", "<table",
"<tr>", "<tr",
"<td>", "<td",
"<th>", "<th",
"<ul>", "<ul",
"<ol>", "<ol",
"<header>", "<header",
"<footer>", "<footer",
"<nav>", "<nav",
# Head # Head
"<head>", "<head",
"<style>", "<style",
"<script>", "<script",
"<meta>", "<meta",
"<title>", "<title",
"", "",
] ]
else: else:

@ -576,3 +576,39 @@ This is a code block
"block", "block",
"```", "```",
] ]
# Checks the chunks produced for a small HTML sample by a
# RecursiveCharacterTextSplitter built with Language.HTML.
# The sample includes tags that carry attributes (`<p id=...>` and
# `<div class=...>`), and the expected chunks begin at those tags.
# NOTE(review): indentation in this view appears collapsed; confirm the
# whitespace inside the sample and the expected strings against the repository.
def test_html_code_splitter() -> None:
# Build the splitter from the HTML language preset with a 60-unit chunk size
# and no overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter.from_language(
Language.HTML, chunk_size=60, chunk_overlap=0
)
# Sample document mixing plain tags with attribute-bearing tags.
code = """
<h1>Sample Document</h1>
<h2>Section</h2>
<p id="1234">Reference content.</p>
<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>
<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
"""
chunks = splitter.split_text(code)
# The comparison is exact: chunk order, content and embedded newlines must match.
assert chunks == [
"<h1>Sample Document</h1>\n <h2>Section</h2>",
# Tag with an `id` attribute is expected as its own chunk.
'<p id="1234">Reference content.</p>',
"<h2>Lists</h2>\n <ul>",
"<li>Item 1</li>\n <li>Item 2</li>",
"<li>Item 3</li>\n </ul>",
"<h3>A block</h3>",
# Tag with a `class` attribute is expected as its own chunk.
'<div class="amazing">',
"<p>Some text</p>",
"<p>Some more text</p>\n </div>",
]

Loading…
Cancel
Save