"""Unit tests for the link helpers in ``langchain_core.utils.html``."""

from langchain_core.utils.html import (
    PREFIXES_TO_IGNORE,
    SUFFIXES_TO_IGNORE,
    extract_sub_links,
    find_all_links,
)


def test_find_all_links_none() -> None:
    """Text without any ``href`` attribute yields no links."""
    html = "Hello world"
    actual = find_all_links(html)
    assert actual == []


def test_find_all_links_single() -> None:
    """One ``href`` is found with either quote style, bare or inside markup."""
    htmls = [
        "href='foobar.com'",
        'href="foobar.com"',
        '<div><a class="blah" href="foobar.com">hullo</a></div>',
    ]
    actual = [find_all_links(html) for html in htmls]
    assert actual == [["foobar.com"]] * 3


def test_find_all_links_multiple() -> None:
    """Every ``href`` in the document is reported."""
    html = (
        '<div><a class="blah" href="https://foobar.com">hullo</a></div>'
        '<div><a class="bleh" href="/baz/cool">buhbye</a></div>'
    )
    actual = find_all_links(html)
    # Compare sorted: the test does not depend on the order of the result.
    assert sorted(actual) == [
        "/baz/cool",
        "https://foobar.com",
    ]


def test_find_all_links_ignore_suffix() -> None:
    """Links ending in an ignored suffix are dropped; others are kept."""
    html = 'href="foobar{suffix}"'
    for suffix in SUFFIXES_TO_IGNORE:
        actual = find_all_links(html.format(suffix=suffix))
        assert actual == []

    # Don't ignore if pattern doesn't occur at end of link.
    html = 'href="foobar{suffix}more"'
    for suffix in SUFFIXES_TO_IGNORE:
        actual = find_all_links(html.format(suffix=suffix))
        assert actual == [f"foobar{suffix}more"]


def test_find_all_links_ignore_prefix() -> None:
    """Links starting with an ignored prefix are dropped; others are kept."""
    html = 'href="{prefix}foobar"'
    for prefix in PREFIXES_TO_IGNORE:
        actual = find_all_links(html.format(prefix=prefix))
        assert actual == []

    # Don't ignore if pattern doesn't occur at beginning of link.
    html = 'href="foobar{prefix}more"'
    for prefix in PREFIXES_TO_IGNORE:
        # Pound signs are split on when not prefixes.
        if prefix == "#":
            continue
        actual = find_all_links(html.format(prefix=prefix))
        assert actual == [f"foobar{prefix}more"]


def test_find_all_links_drop_fragment() -> None:
    """The ``#fragment`` part of a link is not included in the result."""
    html = 'href="foobar.com/woah#section_one"'
    actual = find_all_links(html)
    assert actual == ["foobar.com/woah"]


def test_extract_sub_links() -> None:
    """Links are made absolute and, by default, limited to the given URL."""
    html = (
        '<a href="https://foobar.com">one</a>'
        '<a href="http://baz.net">two</a>'
        '<a href="//foobar.com/hello">three</a>'
        '<a href="/how/are/you/doing">four</a>'
    )

    # Default behaviour: the link to the other host (baz.net) is filtered out.
    expected = sorted(
        [
            "https://foobar.com",
            "https://foobar.com/hello",
            "https://foobar.com/how/are/you/doing",
        ]
    )
    actual = sorted(extract_sub_links(html, "https://foobar.com"))
    assert actual == expected

    # A deeper URL narrows the result to links underneath that URL.
    actual = extract_sub_links(html, "https://foobar.com/hello")
    expected = ["https://foobar.com/hello"]
    assert actual == expected

    # With prevent_outside disabled, every link is returned.
    actual = sorted(
        extract_sub_links(html, "https://foobar.com/hello", prevent_outside=False)
    )
    expected = sorted(
        [
            "https://foobar.com",
            "http://baz.net",
            "https://foobar.com/hello",
            "https://foobar.com/how/are/you/doing",
        ]
    )
    assert actual == expected


def test_extract_sub_links_base() -> None:
    """``base_url`` bounds the result while ``url`` resolves relative links."""
    html = (
        '<a href="https://foobar.com">one</a>'
        '<a href="http://baz.net">two</a>'
        '<a href="//foobar.com/hello">three</a>'
        '<a href="/how/are/you/doing">four</a>'
        '<a href="alexis.html">five</a>'
    )

    expected = sorted(
        [
            "https://foobar.com",
            "https://foobar.com/hello",
            "https://foobar.com/how/are/you/doing",
            # Relative link, resolved against the page URL passed below.
            "https://foobar.com/hello/alexis.html",
        ]
    )
    actual = sorted(
        extract_sub_links(
            html,
            "https://foobar.com/hello/bill.html",
            base_url="https://foobar.com",
        )
    )
    assert actual == expected


def test_extract_sub_links_exclude() -> None:
    """Links starting with any of ``exclude_prefixes`` are removed."""
    html = (
        '<a href="https://foobar.com">one</a>'
        '<a href="http://baz.net">two</a>'
        '<a href="//foobar.com/hello">three</a>'
        '<a href="/how/are/you/doing">four</a>'
        '<a href="alexis.html">five</a>'
    )

    # The "/how/..." link matches the first prefix and is dropped. The second
    # prefix (baz.org) matches nothing, so the baz.net link is kept.
    expected = sorted(
        [
            "http://baz.net",
            "https://foobar.com",
            "https://foobar.com/hello",
            "https://foobar.com/hello/alexis.html",
        ]
    )
    actual = sorted(
        extract_sub_links(
            html,
            "https://foobar.com/hello/bill.html",
            base_url="https://foobar.com",
            prevent_outside=False,
            exclude_prefixes=("https://foobar.com/how", "http://baz.org"),
        )
    )
    assert actual == expected


def test_prevent_outside() -> None:
    """Test that prevent outside compares against full base URL."""
    html = (
        '<a href="https://foobar.comic.com">BAD</a>'
        '<a href="https://foobar.comic:9999">BAD</a>'
        '<a href="https://foobar.com:9999">BAD</a>'
        '<a href="http://foobar.com:9999/">BAD</a>'
        '<a href="https://foobar.com/OK">OK</a>'
        '<a href="http://foobar.com/BAD">BAD</a>'  # Change in scheme is not OK here
    )

    expected = sorted(
        [
            "https://foobar.com/OK",
        ]
    )
    actual = sorted(
        extract_sub_links(
            html,
            "https://foobar.com/hello/bill.html",
            base_url="https://foobar.com",
            prevent_outside=True,
        )
    )
    assert actual == expected