Merge pull request #24315

* community: Add Hierarchy link extractor

* add example

* lint
This commit is contained in:
Ben Chambers 2024-07-19 06:42:26 -07:00 committed by GitHub
parent c3308f31bc
commit 242b085be7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 196 additions and 0 deletions

View File

@ -1,3 +1,7 @@
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
HierarchyInput,
HierarchyLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
@ -12,6 +16,8 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
__all__ = [
"LinkExtractor",
"LinkExtractorAdapter",
"HierarchyInput",
"HierarchyLinkExtractor",
"HtmlInput",
"HtmlLinkExtractor",
]

View File

@ -0,0 +1,106 @@
from typing import Callable, List, Set
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
LinkExtractorAdapter,
)
# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
# A hierarchy path: the ordered segments leading to a node,
# e.g. ["Root", "H1", "a"].
HierarchyInput = List[str]
# Prefixes prepended to the "/"-joined path when building link tags, so that
# parent, child and sibling links sharing one `kind` get distinct tags.
_PARENT: str = "p:"  # prefix for tags of parent links
_CHILD: str = "c:"  # prefix for tags of child links
_SIBLING: str = "s:"  # prefix for tags of sibling links
class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    """Link extractor that connects nodes based on their hierarchy path.

    A path is a list of segments (e.g. ``["Root", "H1", "a"]``). Segments are
    joined with ``"/"`` and prefixed to form the link tags.
    """

    def __init__(
        self,
        *,
        kind: str = "hierarchy",
        parent_links: bool = True,
        child_links: bool = False,
        sibling_links: bool = False,
    ) -> None:
        """Extract links from a document hierarchy.

        Example:

            .. code-block:: python

                # Given three paths (in this case, within the "Root" document):
                h1 = ["Root", "H1"]
                h1a = ["Root", "H1", "a"]
                h1b = ["Root", "H1", "b"]

                # Parent links `h1a` and `h1b` to `h1`.
                # Child links `h1` to `h1a` and `h1b`.
                # Sibling links `h1a` and `h1b` together (both directions).

        Example use with documents:

            .. code-block:: python

                transformer = LinkExtractorTransformer([
                    HierarchyLinkExtractor().as_document_extractor(
                        # Assumes the "path" to each document is in the metadata.
                        # Could split strings, etc.
                        lambda doc: doc.metadata.get("path", [])
                    )
                ])
                linked = transformer.transform_documents(docs)

        Args:
            kind: Kind of links to produce with this extractor.
            parent_links: Link from a section to its parent.
            child_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the same
                parent.
        """
        self._kind = kind
        self._parent_links = parent_links
        self._child_links = child_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Create a LinkExtractor from `Document`.

        Args:
            hierarchy: Function that returns the path for the given document.

        Returns:
            A `LinkExtractor[Document]` suitable for application to `Documents`
            directly or with `LinkExtractorTransformer`.
        """
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,
    ) -> Set[Link]:
        """Build the set of hierarchy links for a single path.

        Args:
            input: Path segments of the node, outermost first.

        Returns:
            The links enabled by the constructor flags. An empty path has no
            parent, so it never yields links that refer to a parent path
            (including sibling links).
        """
        this_path = "/".join(input)
        links: Set[Link] = set()

        if self._parent_links:
            # This is linked from everything with this parent path.
            links.add(Link.incoming(kind=self._kind, tag=_PARENT + this_path))
        if self._child_links:
            # This is linked to every child with this as its "parent" path.
            links.add(Link.outgoing(kind=self._kind, tag=_CHILD + this_path))

        if not input:
            # No segments means no parent path. Previously this fell through to
            # the sibling branch and failed with `str + None` (TypeError).
            return links

        parent_path = "/".join(input[:-1])

        # Top-level nodes (a single segment) have no parent node to link with.
        if len(input) > 1:
            if self._parent_links:
                # This is linked to the nodes with the given parent path.
                links.add(Link.outgoing(kind=self._kind, tag=_PARENT + parent_path))
            if self._child_links:
                # This is linked from every node with the given parent path.
                links.add(Link.incoming(kind=self._kind, tag=_CHILD + parent_path))

        if self._sibling_links:
            # This is a sibling of everything with the same parent. Top-level
            # nodes share the empty parent path and are siblings of each other.
            links.add(Link.bidir(kind=self._kind, tag=_SIBLING + parent_path))

        return links

View File

@ -0,0 +1,84 @@
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.extractors import HierarchyLinkExtractor
# Sample hierarchy paths, deepest first: PATH_2 is the parent path of PATH_1,
# and PATH_3 is the parent path of PATH_2.
PATH_1 = ["Root", "H1", "h2"]
PATH_2 = ["Root", "H1"]
PATH_3 = ["Root"]
def test_up_only() -> None:
    """With default flags only parent ("p:") links are produced."""
    extractor = HierarchyLinkExtractor()

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 is linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
    }
def test_up_and_down() -> None:
    """With `child_links=True` both parent ("p:") and child ("c:") links appear."""
    extractor = HierarchyLinkExtractor(child_links=True)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
        # Path1 links down to things under Root/H1/h2.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1/h2"),
        # Path1 is linked down to by Root/H1
        Link.incoming(kind="hierarchy", tag="c:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
        # Path2 links down to things under Root/H1.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1"),
        # Path2 is linked down to by Root
        Link.incoming(kind="hierarchy", tag="c:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 is linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
        # Path3 links down to things under Root.
        Link.outgoing(kind="hierarchy", tag="c:Root"),
    }
def test_sibling() -> None:
    """With only `sibling_links` enabled, one bidirectional "s:" link is produced."""
    extractor = HierarchyLinkExtractor(sibling_links=True, parent_links=False)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links with anything else in Root/H1
        Link.bidir(kind="hierarchy", tag="s:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links with anything else in Root
        Link.bidir(kind="hierarchy", tag="s:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 links with anything else at the top level (empty parent path)
        Link.bidir(kind="hierarchy", tag="s:"),
    }