core[patch]: Enhance link extraction with query parameters (#20259)

**Description**: This update enhances the `extract_sub_links` function
within the `langchain_core/utils/html.py` module to include query
parameters in the extracted URLs.

**Issue**: N/A

**Dependencies**: No additional dependencies required for this change.

**Twitter handle**: N/A

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
pull/20985/head
YH 2 weeks ago committed by GitHub
parent 0e917e319b
commit 2aca7fcdcf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -88,6 +88,8 @@ def extract_sub_links(
absolute_path = f"{parsed_url.scheme}:{link}"
else:
absolute_path = urljoin(url, parsed_link.path)
if parsed_link.query:
absolute_path += f"?{parsed_link.query}"
absolute_paths.add(absolute_path)
except Exception as e:
if continue_on_failure:

@@ -183,3 +183,27 @@ def test_prevent_outside() -> None:
)
)
assert actual == expected
def test_extract_sub_links_with_query() -> None:
    """Check that query strings are kept on every extracted sub-link.

    The HTML fixture exercises four href shapes: an absolute URL, a
    root-relative path, a scheme-relative URL and a page-relative path.
    """
    page_url = "https://foobar.com/hello/bill.html"
    html = (
        '<a href="https://foobar.com?query=123">one</a>'
        '<a href="/hello?query=456">two</a>'
        '<a href="//foobar.com/how/are/you?query=789">three</a>'
        '<a href="doing?query=101112"></a>'
    )

    # Both sides are sorted before comparison, so the order in which the
    # links come back does not matter.
    expected = [
        "https://foobar.com?query=123",
        "https://foobar.com/hello?query=456",
        "https://foobar.com/how/are/you?query=789",
        "https://foobar.com/hello/doing?query=101112",
    ]
    expected.sort()

    found = extract_sub_links(html, page_url, base_url="https://foobar.com")
    actual = sorted(found)

    assert actual == expected, f"Expected {expected}, but got {actual}"

Loading…
Cancel
Save