community[minor]: Add graph store extractors (#24065)

This adds an extractor interface and an implementation for HTML pages.
Extractors are used to create GraphVectorStore Links on loaded content.
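For example, extracting links from a raw HTML string (an illustrative sketch based on the unit tests in this PR; the snippet and URL are made up):

```python
from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
)

extractor = HtmlLinkExtractor()
links = extractor.extract_one(
    HtmlInput('<a href="/about">About</a>', base_url="https://example.com/")
)
# links holds an incoming "hyperlink" Link tagged with the page URL and an
# outgoing "hyperlink" Link tagged with "https://example.com/about".
```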

**Twitter handle:** cbornet_
Christophe Bornet 2024-07-11 16:35:31 +02:00 committed by GitHub
parent 9bcf8f867d
commit 5fc5ef2b52
7 changed files with 321 additions and 0 deletions

langchain_community/graph_vectorstores/extractors/__init__.py

@@ -0,0 +1,17 @@
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
    HtmlInput,
    HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

__all__ = [
    "LinkExtractor",
    "LinkExtractorAdapter",
    "HtmlInput",
    "HtmlLinkExtractor",
]

langchain_community/graph_vectorstores/extractors/html_link_extractor.py

@@ -0,0 +1,124 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

if TYPE_CHECKING:
    from bs4 import BeautifulSoup
    from bs4.element import Tag


def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
    href = link.get("href")
    if href is None:
        return None
    url = urlparse(href)
    if url.scheme not in ["http", "https", ""]:
        return None

    # Join the HREF with the page_url to convert relative paths to absolute.
    url = str(urljoin(page_url, href))

    # Fragments would be useful if we chunked a page based on section.
    # Then, each chunk would have a different URL based on the fragment.
    # Since we aren't doing that yet, they just "break" links. So, drop
    # the fragment.
    if drop_fragments:
        return urldefrag(url).url
    return url


def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    soup_links: List[Tag] = soup.find_all("a")
    links: Set[str] = set()

    for link in soup_links:
        parsed_url = _parse_url(link, page_url=url, drop_fragments=drop_fragments)
        # Remove self links and entries for any 'a' tag that failed to parse
        # (didn't have an href, or had an invalid domain, etc.)
        if parsed_url and parsed_url != url:
            links.add(parsed_url)

    return links


@dataclass
class HtmlInput:
    """Input for the HtmlLinkExtractor: HTML content plus the URL of the page."""

    content: Union[str, BeautifulSoup]
    base_url: str


class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use it with other
        similar link extractors it may be more efficient to call the link
        extractors directly on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in the document metadata
                containing the URL of the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        content = input.content
        if isinstance(content, str):
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
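(Not part of the diff.) A minimal sketch of running the extractor over loaded documents via `as_document_extractor`; the page URL is read from the `source` metadata key by default, and the document content here is made up:

```python
from langchain_core.documents import Document

from langchain_community.graph_vectorstores.extractors import HtmlLinkExtractor

doc_extractor = HtmlLinkExtractor().as_document_extractor()
doc = Document(
    page_content='<a href="/about">About</a>',
    metadata={"source": "https://example.com/"},
)
links = doc_extractor.extract_one(doc)  # same links as extracting HtmlInput directly
```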

langchain_community/graph_vectorstores/extractors/link_extractor.py

@@ -0,0 +1,36 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Generic, Iterable, Set, TypeVar

from langchain_core.graph_vectorstores import Link

InputT = TypeVar("InputT")

METADATA_LINKS_KEY = "links"


class LinkExtractor(ABC, Generic[InputT]):
    """Interface for extracting links (incoming, outgoing, bidirectional)."""

    @abstractmethod
    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        """Extract links from the given input.

        Args:
            input: The input content to extract links from.

        Returns:
            Set of links extracted from the input.
        """

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        """Extract links from each of the given inputs.

        Args:
            inputs: The inputs to extract links from.

        Returns:
            Iterable over the sets of links extracted from each input.
        """
        return map(self.extract_one, inputs)
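(Not part of the diff.) Implementing the interface only requires `extract_one`; `extract_many` comes for free. A hypothetical extractor over plain strings:

```python
from typing import Set

from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors import LinkExtractor


class MentionLinkExtractor(LinkExtractor[str]):
    """Hypothetical example: one outgoing link per @mention in the text."""

    def extract_one(self, input: str) -> Set[Link]:  # noqa: A002
        return {
            Link.outgoing(kind="mention", tag=word)
            for word in input.split()
            if word.startswith("@")
        }


links = MentionLinkExtractor().extract_one("ping @alice and @bob")
# {Link.outgoing(kind="mention", tag="@alice"), Link.outgoing(kind="mention", tag="@bob")}
```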

langchain_community/graph_vectorstores/extractors/link_extractor_adapter.py

@@ -0,0 +1,27 @@
from typing import Callable, Iterable, Set, TypeVar

from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)

InputT = TypeVar("InputT")
UnderlyingInputT = TypeVar("UnderlyingInputT")


class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Wraps a LinkExtractor, transforming inputs into its expected input type."""

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        self._underlying = underlying
        self._transform = transform

    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        return self._underlying.extract_one(self._transform(input))

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        underlying_inputs = map(self._transform, inputs)
        return self._underlying.extract_many(underlying_inputs)
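(Not part of the diff.) The adapter is what `HtmlLinkExtractor.as_document_extractor` builds on; it can adapt any input type, e.g. hypothetical `(html, url)` tuples:

```python
from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
    LinkExtractorAdapter,
)

pair_extractor = LinkExtractorAdapter(
    underlying=HtmlLinkExtractor(),
    transform=lambda pair: HtmlInput(pair[0], base_url=pair[1]),
)
links = pair_extractor.extract_one(('<a href="/x">x</a>', "https://example.com/"))
```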

tests/unit_tests/graph_vectorstores/extractors/test_html_link_extractor.py

@@ -0,0 +1,117 @@
import pytest
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
)
PAGE_1 = """
<html>
<body>
Hello.
<a href="relative">Relative</a>
<a href="/relative-base">Relative base.</a>
<a href="http://cnn.com">Aboslute</a>
<a href="//same.foo">Test</a>
</body>
</html>
"""
PAGE_2 = """
<html>
<body>
Hello.
<a href="/bar/#fragment">Relative</a>
</html>
"""


@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    extractor = HtmlLinkExtractor()

    results = extractor.extract_one(HtmlInput(PAGE_1, base_url="https://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }

    results = extractor.extract_one(HtmlInput(PAGE_1, base_url="http://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="http://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="http://same.foo"),
    }


@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    from bs4 import BeautifulSoup

    extractor = HtmlLinkExtractor()
    soup = BeautifulSoup(PAGE_1, "html.parser")
    results = extractor.extract_one(HtmlInput(soup, base_url="https://foo.com/bar/"))
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }


@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    extractor = HtmlLinkExtractor(drop_fragments=True)
    results = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }


@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    extractor = HtmlLinkExtractor(drop_fragments=False)
    results = extractor.extract_one(
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    )
    assert results == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }


@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    extractor = HtmlLinkExtractor()
    results = list(
        extractor.extract_many(
            [
                HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
                HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
            ]
        )
    )

    assert results[0] == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert results[1] == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }