mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
Merge pull request #24315
* community: Add Hierarchy link extractor * add example * lint
This commit is contained in:
parent
c3308f31bc
commit
242b085be7
@ -1,3 +1,7 @@
|
||||
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
|
||||
HierarchyInput,
|
||||
HierarchyLinkExtractor,
|
||||
)
|
||||
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
|
||||
HtmlInput,
|
||||
HtmlLinkExtractor,
|
||||
@ -12,6 +16,8 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
|
||||
# Names re-exported from the ``extractors`` package.
__all__ = [
    "LinkExtractor",
    "LinkExtractorAdapter",
    "HierarchyInput",
    "HierarchyLinkExtractor",
    "HtmlInput",
    "HtmlLinkExtractor",
]
|
||||
|
@ -0,0 +1,106 @@
|
||||
from typing import Callable, List, Set
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||
LinkExtractor,
|
||||
)
|
||||
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
||||
LinkExtractorAdapter,
|
||||
)
|
||||
|
||||
# A hierarchy path is a list of path segments, e.g. ``["Root", "H1", "a"]``.
# This is a plain assignment because `typing.TypeAlias` is not available in
# Python 3.9, and neither is the newer `type` statement.
HierarchyInput = List[str]

# Prefixes prepended to a "/"-joined path to build the tag of each kind of
# link produced by `HierarchyLinkExtractor`.
_PARENT: str = "p:"
_CHILD: str = "c:"
_SIBLING: str = "s:"
|
||||
|
||||
|
||||
class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    """Link extractor that links items based on their path in a hierarchy.

    The input is a list of path segments. Segments are joined with ``"/"`` and
    prefixed with a marker for the relationship (parent, child or sibling) to
    form the tag of each produced link.

    Note: because segments are joined with ``"/"``, a segment that itself
    contains ``"/"`` produces the same tag as the equivalent nested path.
    """

    def __init__(
        self,
        *,
        kind: str = "hierarchy",
        parent_links: bool = True,
        child_links: bool = False,
        sibling_links: bool = False,
    ):
        """Extract links from a document hierarchy.

        Example:

            .. code-block:: python

                # Given three paths (in this case, within the "Root" document):
                h1 = ["Root", "H1"]
                h1a = ["Root", "H1", "a"]
                h1b = ["Root", "H1", "b"]

                # Parent links `h1a` and `h1b` to `h1`.
                # Child links `h1` to `h1a` and `h1b`.
                # Sibling links `h1a` and `h1b` together (both directions).

        Example use with documents:

            .. code-block:: python

                transformer = LinkExtractorTransformer([
                    HierarchyLinkExtractor().as_document_extractor(
                        # Assumes the "path" to each document is in the metadata.
                        # Could split strings, etc.
                        lambda doc: doc.metadata.get("path", [])
                    )
                ])
                linked = transformer.transform_documents(docs)

        Args:
            kind: Kind of links to produce with this extractor.
            parent_links: Link from a section to its parent.
            child_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the
                same parent.
        """
        self._kind = kind
        self._parent_links = parent_links
        self._child_links = child_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Create a LinkExtractor from `Document`.

        Args:
            hierarchy: Function that returns the path for the given document.

        Returns:
            A `LinkExtractor[Document]` suitable for application to `Documents`
            directly or with `LinkExtractorTransformer`.
        """
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,
    ) -> Set[Link]:
        """Compute the hierarchy links for a single path.

        Args:
            input: Path segments from the root to this item. An empty path
                has no parent path, so it yields only the links tagged with
                the item's own (empty) path.

        Returns:
            The set of links enabled by the constructor flags. Parent and
            child links towards the parent are only produced when the path
            has more than one segment; sibling links are produced for any
            non-empty path (top-level items share the empty parent path).
        """
        this_path = "/".join(input)
        links: Set[Link] = set()

        if self._parent_links:
            # This is linked from everything with this parent path.
            links.add(Link.incoming(kind=self._kind, tag=_PARENT + this_path))
        if self._child_links:
            # This is linked to every child with this as its "parent" path.
            links.add(Link.outgoing(kind=self._kind, tag=_CHILD + this_path))

        if input:
            parent_path = "/".join(input[:-1])
            # A single-segment path is a root: it has siblings (other roots)
            # but no parent node to link to or from.
            has_parent = len(input) > 1

            if self._parent_links and has_parent:
                # This is linked to the nodes with the given parent path.
                links.add(Link.outgoing(kind=self._kind, tag=_PARENT + parent_path))
            if self._child_links and has_parent:
                # This is linked from every node with the given parent path.
                links.add(Link.incoming(kind=self._kind, tag=_CHILD + parent_path))
            if self._sibling_links:
                # This is a sibling of everything with the same parent.
                links.add(Link.bidir(kind=self._kind, tag=_SIBLING + parent_path))

        return links
|
@ -0,0 +1,84 @@
|
||||
from langchain_core.graph_vectorstores.links import Link
|
||||
|
||||
from langchain_community.graph_vectorstores.extractors import HierarchyLinkExtractor
|
||||
|
||||
# Test fixtures: three nested paths, each one level deeper than the next.
PATH_3 = ["Root"]
PATH_2 = [*PATH_3, "H1"]
PATH_1 = [*PATH_2, "h2"]
|
||||
|
||||
|
||||
def test_up_only() -> None:
    """The default extractor produces parent links only (tag prefix ``p:``)."""
    extractor = HierarchyLinkExtractor()

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 is linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
    }
|
||||
|
||||
|
||||
def test_up_and_down() -> None:
    """With ``child_links=True`` both parent (``p:``) and child (``c:``) links appear."""
    extractor = HierarchyLinkExtractor(child_links=True)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
        # Path1 links down to things under Root/H1/h2.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1/h2"),
        # Path1 is linked down to by Root/H1
        Link.incoming(kind="hierarchy", tag="c:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
        # Path2 links down to things under Root/H1.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1"),
        # Path2 is linked down to by Root
        Link.incoming(kind="hierarchy", tag="c:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 is linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
        # Path3 links down to things under Root.
        Link.outgoing(kind="hierarchy", tag="c:Root"),
    }
|
||||
|
||||
|
||||
def test_sibling() -> None:
    """With only ``sibling_links`` enabled, one bidirectional ``s:`` link is produced."""
    extractor = HierarchyLinkExtractor(sibling_links=True, parent_links=False)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links with anything else in Root/H1
        Link.bidir(kind="hierarchy", tag="s:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links with anything else in Root
        Link.bidir(kind="hierarchy", tag="s:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 links with anything else at the top level (empty parent path)
        Link.bidir(kind="hierarchy", tag="s:"),
    }
|
Loading…
Reference in New Issue
Block a user