mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
community[minor]: Add graph store extractors (#24065)
This adds an extractor interface and an implementation for HTML pages. Extractors are used to create GraphVectorStore Links on loaded content. **Twitter handle:** cbornet_
This commit is contained in:
parent
9bcf8f867d
commit
5fc5ef2b52
@ -0,0 +1,17 @@
|
||||
"""Extractors that create ``GraphVectorStore`` links from loaded content."""

from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
    HtmlInput,
    HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

# Sorted alphabetically (ruff RUF022) so future additions merge cleanly.
__all__ = [
    "HtmlInput",
    "HtmlLinkExtractor",
    "LinkExtractor",
    "LinkExtractorAdapter",
]
|
@ -0,0 +1,124 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, List, Optional, Set, Union
|
||||
from urllib.parse import urldefrag, urljoin, urlparse
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
LinkExtractor,
|
||||
)
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
||||
LinkExtractorAdapter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
|
||||
|
||||
def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
|
||||
href = link.get("href")
|
||||
if href is None:
|
||||
return None
|
||||
url = urlparse(href)
|
||||
if url.scheme not in ["http", "https", ""]:
|
||||
return None
|
||||
|
||||
# Join the HREF with the page_url to convert relative paths to absolute.
|
||||
url = str(urljoin(page_url, href))
|
||||
|
||||
# Fragments would be useful if we chunked a page based on section.
|
||||
# Then, each chunk would have a different URL based on the fragment.
|
||||
# Since we aren't doing that yet, they just "break" links. So, drop
|
||||
# the fragment.
|
||||
if drop_fragments:
|
||||
return urldefrag(url).url
|
||||
return url
|
||||
|
||||
|
||||
def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the resolved URL of every ``<a>`` tag in ``soup``.

    Self-links (a link back to ``url`` itself) and tags that fail to parse
    (no ``href``, unsupported scheme, ...) are excluded.
    """
    anchors: List[Tag] = soup.find_all("a")
    candidates = (
        _parse_url(anchor, page_url=url, drop_fragments=drop_fragments)
        for anchor in anchors
    )
    # `parsed` is falsy (None) for tags that failed to parse; also drop
    # links pointing back at the page itself.
    return {parsed for parsed in candidates if parsed and parsed != url}
|
||||
|
||||
|
||||
@dataclass
class HtmlInput:
    """Input for the HTML link extractor."""

    # Raw HTML string, or an already-parsed `BeautifulSoup` document.
    content: Union[str, BeautifulSoup]
    # URL the content came from; used to resolve relative hrefs.
    base_url: str
|
||||
|
||||
|
||||
class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    """Extract ``<a href>`` hyperlinks from HTML content as graph links."""

    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a `BeautifulSoup` object.

        Args:
            kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
                dropped. Defaults to `True`.

        Raises:
            ImportError: If `beautifulsoup4` is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        NOTE: Since the HtmlLinkExtractor parses HTML, if you use with other similar
        link extractors it may be more efficient to call the link extractors directly
        on the parsed BeautifulSoup object.

        Args:
            url_metadata_key: The name of the field in document metadata with the URL
                of the document.
        """
        # Adapt Document -> HtmlInput: page_content is the HTML, and the
        # metadata entry under `url_metadata_key` supplies the base URL.
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        """Extract one outgoing link per hyperlink on the page, plus one
        incoming link for the page's own (fragment-normalized) URL."""
        content = input.content
        if isinstance(content, str):
            # Parse lazily so both raw strings and pre-parsed soups work.
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            # Normalize the page's own URL the same way outgoing links are
            # normalized, so incoming and outgoing tags stay consistent.
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        # Outgoing links for every href; an incoming link for this page's
        # URL so pages that reference it connect to this document.
        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
|
@ -0,0 +1,36 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Generic, Iterable, Set, TypeVar
|
||||
|
||||
from langchain_core.graph_vectorstores import Link
|
||||
|
||||
# Type of content a LinkExtractor accepts.
InputT = TypeVar("InputT")

# Metadata key under which links are stored — presumably on Documents;
# the consumer is not visible in this module, confirm against callers.
METADATA_LINKS_KEY = "links"
|
||||
|
||||
|
||||
class LinkExtractor(ABC, Generic[InputT]):
    """Interface for extracting links (incoming, outgoing, bidirectional)."""

    @abstractmethod
    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        """Extract the links from a single input.

        Args:
            input: The input content to extract links from.

        Returns:
            Set of links extracted from the input.
        """

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        """Extract the links from each of the given inputs.

        The default implementation is a lazy `map` over `extract_one`;
        subclasses may override it to batch more efficiently.

        Args:
            inputs: The input contents to extract links from.

        Returns:
            Iterable over the set of links extracted from each input.
        """
        return map(self.extract_one, inputs)
|
@ -0,0 +1,27 @@
|
||||
from typing import Callable, Iterable, Set, TypeVar
|
||||
|
||||
from langchain_core.graph_vectorstores import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
LinkExtractor,
|
||||
)
|
||||
|
||||
# Input type the adapter exposes to callers.
InputT = TypeVar("InputT")
# Input type expected by the wrapped (underlying) extractor.
UnderlyingInputT = TypeVar("UnderlyingInputT")
|
||||
|
||||
|
||||
class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Adapt a ``LinkExtractor`` to accept a different input type.

    Every input is first converted with ``transform`` and then handed to
    the wrapped extractor.
    """

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        self._inner = underlying
        self._convert = transform

    def extract_one(self, input: InputT) -> Set[Link]:  # noqa: A002
        # Convert the single input, then delegate.
        converted = self._convert(input)
        return self._inner.extract_one(converted)

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        # Convert lazily and delegate the whole batch, so any batching the
        # wrapped extractor implements still applies.
        converted = (self._convert(item) for item in inputs)
        return self._inner.extract_many(converted)
|
@ -0,0 +1,117 @@
|
||||
import pytest
|
||||
from langchain_core.graph_vectorstores import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors import (
|
||||
HtmlInput,
|
||||
HtmlLinkExtractor,
|
||||
)
|
||||
|
||||
PAGE_1 = """
|
||||
<html>
|
||||
<body>
|
||||
Hello.
|
||||
<a href="relative">Relative</a>
|
||||
<a href="/relative-base">Relative base.</a>
|
||||
<a href="http://cnn.com">Aboslute</a>
|
||||
<a href="//same.foo">Test</a>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
PAGE_2 = """
|
||||
<html>
|
||||
<body>
|
||||
Hello.
|
||||
<a href="/bar/#fragment">Relative</a>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_one_from_str() -> None:
    """Links resolve against the base URL, matching its scheme."""
    extractor = HtmlLinkExtractor()

    expected_https = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    links = extractor.extract_one(HtmlInput(PAGE_1, base_url="https://foo.com/bar/"))
    assert links == expected_https

    # Same page under an http base URL: relative and scheme-relative links
    # follow the base scheme; the fully-qualified cnn.com link is untouched.
    expected_http = {
        Link.incoming(kind="hyperlink", tag="http://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="http://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="http://same.foo"),
    }
    links = extractor.extract_one(HtmlInput(PAGE_1, base_url="http://foo.com/bar/"))
    assert links == expected_http
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_one_from_beautiful_soup() -> None:
    """A pre-parsed BeautifulSoup input yields the same links as a string."""
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(PAGE_1, "html.parser")
    extractor = HtmlLinkExtractor()
    links = extractor.extract_one(HtmlInput(parsed, base_url="https://foo.com/bar/"))

    expected = {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert links == expected
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_drop_fragments() -> None:
    """With drop_fragments=True, fragments are stripped from every URL."""
    extractor = HtmlLinkExtractor(drop_fragments=True)
    page = HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    links = extractor.extract_one(page)

    # Both the page's own URL and the outgoing link lose their fragments.
    assert links == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_include_fragments() -> None:
    """With drop_fragments=False, fragments survive in every URL."""
    extractor = HtmlLinkExtractor(drop_fragments=False)
    page = HtmlInput(PAGE_2, base_url="https://foo.com/baz/#fragment")
    links = extractor.extract_one(page)

    # Both the page's own URL and the outgoing link keep their fragments.
    assert links == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/#fragment"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/#fragment"),
    }
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4")
def test_batch_from_str() -> None:
    """extract_many yields one link set per input, in order."""
    extractor = HtmlLinkExtractor()
    batch = [
        HtmlInput(PAGE_1, base_url="https://foo.com/bar/"),
        HtmlInput(PAGE_2, base_url="https://foo.com/baz/"),
    ]
    first, second = extractor.extract_many(batch)

    assert first == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/bar/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/relative"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/relative-base"),
        Link.outgoing(kind="hyperlink", tag="http://cnn.com"),
        Link.outgoing(kind="hyperlink", tag="https://same.foo"),
    }
    assert second == {
        Link.incoming(kind="hyperlink", tag="https://foo.com/baz/"),
        Link.outgoing(kind="hyperlink", tag="https://foo.com/bar/"),
    }
|
Loading…
Reference in New Issue
Block a user