Attribute support for HTML tags (#5782)

# What does this PR do?

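Change the HTML separators of `RecursiveCharacterTextSplitter` from complete tags (e.g. `<div>`) to tag prefixes (e.g. `<div`), so that a tag with attributes, such as `<div class="amazing">`, can also be found and used as a split point.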

## Before submitting

- [x] Tests added
- [x] CI/CD validated

### Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.
searx_updates
Yoann Poupart 11 months ago committed by GitHub
parent 0cfaa76e45
commit 65111eb2b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -740,33 +740,33 @@ class RecursiveCharacterTextSplitter(TextSplitter):
elif language == Language.HTML:
return [
# First, try to split along HTML tags
"<body>",
"<div>",
"<p>",
"<br>",
"<li>",
"<h1>",
"<h2>",
"<h3>",
"<h4>",
"<h5>",
"<h6>",
"<span>",
"<table>",
"<tr>",
"<td>",
"<th>",
"<ul>",
"<ol>",
"<header>",
"<footer>",
"<nav>",
"<body",
"<div",
"<p",
"<br",
"<li",
"<h1",
"<h2",
"<h3",
"<h4",
"<h5",
"<h6",
"<span",
"<table",
"<tr",
"<td",
"<th",
"<ul",
"<ol",
"<header",
"<footer",
"<nav",
# Head
"<head>",
"<style>",
"<script>",
"<meta>",
"<title>",
"<head",
"<style",
"<script",
"<meta",
"<title",
"",
]
else:

@ -576,3 +576,39 @@ This is a code block
"block",
"```",
]
def test_html_code_splitter() -> None:
    """Check HTML splitting when some tags carry attributes.

    The sample contains ``<p id="1234">`` and ``<div class="amazing">``; both
    are expected to begin their own chunk, just like attribute-free tags.
    """
    # NOTE(review): the expected chunks contain "\n " (newline plus one space)
    # while the sample lines below have no leading whitespace. The sample's
    # indentation was probably collapsed when this snippet was extracted --
    # confirm the exact literals against the upstream test.
    html_sample = """
<h1>Sample Document</h1>
<h2>Section</h2>
<p id="1234">Reference content.</p>
<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>
<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
"""
    expected_chunks = [
        "<h1>Sample Document</h1>\n <h2>Section</h2>",
        '<p id="1234">Reference content.</p>',
        "<h2>Lists</h2>\n <ul>",
        "<li>Item 1</li>\n <li>Item 2</li>",
        "<li>Item 3</li>\n </ul>",
        "<h3>A block</h3>",
        '<div class="amazing">',
        "<p>Some text</p>",
        "<p>Some more text</p>\n </div>",
    ]

    # Small chunk size with no overlap, so chunk boundaries are decided by
    # the HTML separators.
    html_splitter = RecursiveCharacterTextSplitter.from_language(
        Language.HTML, chunk_size=60, chunk_overlap=0
    )
    assert html_splitter.split_text(html_sample) == expected_chunks

Loading…
Cancel
Save