community[minor]: Add graph store extractors (#24065)

This adds an extractor interface and an implementation for HTML pages.
Extractors are used to create GraphVectorStore Links on loaded content.
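For example, extracting links from a raw HTML string (an illustrative sketch based on the unit tests in this PR; the snippet and URL are made up):

```python
from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
)

extractor = HtmlLinkExtractor()
links = extractor.extract_one(
    HtmlInput('<a href="/about">About</a>', base_url="https://example.com/")
)
# links holds an incoming "hyperlink" Link tagged with the page URL and an
# outgoing "hyperlink" Link tagged with "https://example.com/about".
```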

**Twitter handle:** cbornet_
Christophe Bornet 2024-07-11 16:35:31 +02:00 committed by GitHub
parent 9bcf8f867d
commit 5fc5ef2b52
7 changed files with 321 additions and 0 deletions

langchain_community/graph_vectorstores/extractors/__init__.py

@@ -0,0 +1,17 @@
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
    HtmlInput,
    HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

__all__ = [
    "LinkExtractor",
    "LinkExtractorAdapter",
    "HtmlInput",
    "HtmlLinkExtractor",
]

langchain_community/graph_vectorstores/extractors/html_link_extractor.py

@@ -0,0 +1,124 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

if TYPE_CHECKING:
    from bs4 import BeautifulSoup
    from bs4.element import Tag


def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
    href = link.get("href")
    if href is None:
        return None
    url = urlparse(href)
    if url.scheme not in ["http", "https", ""]:
        return None

    # Join the HREF with the page_url to convert relative paths to absolute.
    url = str(urljoin(page_url, href))

    # Fragments would be useful if we chunked a page based on section.
    # Then, each chunk would have a different URL based on the fragment.
    # Since we aren't doing that yet, they just "break" links. So, drop
    # the fragment.
    if drop_fragments:
        return urldefrag(url).url
    return url


def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    soup_links: List[Tag] = soup.find_all("a")
    links: Set[str] = set()

    for link in soup_links:
        parsed_url = _parse_url(link, page_url=url, drop_fragments=drop_fragments)
        # Remove self links and entries for any 'a' tag that failed to parse
        # (didn't have an href, or had an invalid domain, etc.)
        if parsed_url and parsed_url != url:
            links.add(parsed_url)

    return links


@dataclass
class HtmlInput:
    """Input for the HtmlLinkExtractor: HTML content plus the URL of the page."""

    content: Union[str, BeautifulSoup]
    base_url: str


class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use it with other
        similar link extractors it may be more efficient to call the link
        extractors directly on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in the document metadata
                containing the URL of the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        content = input.content
        if isinstance(content, str):
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
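(Not part of the diff.) A minimal sketch of running the extractor over loaded documents via `as_document_extractor`; the page URL is read from the `source` metadata key by default, and the document content here is made up:

```python
from langchain_core.documents import Document

from langchain_community.graph_vectorstores.extractors import HtmlLinkExtractor

doc_extractor = HtmlLinkExtractor().as_document_extractor()
doc = Document(
    page_content='<a href="/about">About</a>',
    metadata={"source": "https://example.com/"},
)
links = doc_extractor.extract_one(doc)  # same links as extracting HtmlInput directly
```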

langchain_community/graph_vectorstores/extractors/link_extractor.py

@@ -0,0 +1,36 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Generic, Iterable, Set, TypeVar

from langchain_core.graph_vectorstores import Link

InputT = TypeVar("InputT")

METADATA_LINKS_KEY = "links"


class LinkExtractor(ABC, Generic[InputT]):
    """Interface for extracting links (incoming, outgoing, bidirectional)."""

    @abstractmethod
    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        """Extract links from the given input.

        Args:
            input: The input content to extract links from.

        Returns:
            Set of links extracted from the input.
        """

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        """Extract links from each of the given inputs.

        Args:
            inputs: The inputs to extract links from.

        Returns:
            Iterable over the sets of links extracted from each input.
        """
        return map(self.extract_one, inputs)
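(Not part of the diff.) Implementing the interface only requires `extract_one`; `extract_many` comes for free. A hypothetical extractor over plain strings:

```python
from typing import Set

from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors import LinkExtractor


class MentionLinkExtractor(LinkExtractor[str]):
    """Hypothetical example: one outgoing link per @mention in the text."""

    def extract_one(self, input: str) -> Set[Link]:  # noqa: A002
        return {
            Link.outgoing(kind="mention", tag=word)
            for word in input.split()
            if word.startswith("@")
        }


links = MentionLinkExtractor().extract_one("ping @alice and @bob")
# {Link.outgoing(kind="mention", tag="@alice"), Link.outgoing(kind="mention", tag="@bob")}
```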

langchain_community/graph_vectorstores/extractors/link_extractor_adapter.py

@@ -0,0 +1,27 @@
from typing import Callable, Iterable, Set, TypeVar

from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)

InputT = TypeVar("InputT")
UnderlyingInputT = TypeVar("UnderlyingInputT")


class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Wraps a LinkExtractor, transforming inputs into its expected input type."""

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        self._underlying = underlying
        self._transform = transform

    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        return self._underlying.extract_one(self._transform(input))

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        underlying_inputs = map(self._transform, inputs)
        return self._underlying.extract_many(underlying_inputs)
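(Not part of the diff.) The adapter is what `HtmlLinkExtractor.as_document_extractor` builds on; it can adapt any input type, e.g. hypothetical `(html, url)` tuples:

```python
from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
    LinkExtractorAdapter,
)

pair_extractor = LinkExtractorAdapter(
    underlying=HtmlLinkExtractor(),
    transform=lambda pair: HtmlInput(pair[0], base_url=pair[1]),
)
links = pair_extractor.extract_one(('<a href="/x">x</a>', "https://example.com/"))
```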

tests/unit_tests/graph_vectorstores/extractors/test_html_link_extractor.py

@@ -0,0 +1,117 @@
import pytest
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
)
PAGE_1 = """
<html>
<body>
Hello.
<a href="relative">Relative</a>
<a href="/relative-base">Relative base.</a>
<a href="http://cnn.com">Aboslute</a>
<a href="//same.foo">Test</a>
</body>
</html>
"""
PAGE_2 = """
<html>
<body>
Hello.
<a href="/bar/#fragment">Relative</a>
</html>
"""


@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    extractor = HtmlLinkExtractor()

    results = extractor.extract_one(HtmlInput(PAGE_1, base_url="https://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }

    results = extractor.extract_one(HtmlInput(PAGE_1, base_url="http://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="http://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="http://same.foo"),
    }


@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    from bs4 import BeautifulSoup

    extractor = HtmlLinkExtractor()
    soup = BeautifulSoup(PAGE_1, "html.parser")
    results = extractor.extract_one(HtmlInput(soup, base_url="https://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }


@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    extractor = HtmlLinkExtractor(drop_fragments=True)
    results = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }


@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    extractor = HtmlLinkExtractor(drop_fragments=False)
    results = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }


@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    extractor = HtmlLinkExtractor()
    results = list(
        extractor.extract_many(
            [
                HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
                HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
            ]
        )
    )

    assert results[0] == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert results[1] == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }