community[minor]: add document transformer for extracting links (#24186)

- **Description:** Add a DocumentTransformer for executing one or more
`LinkExtractor`s and adding the extracted links to each document.
- **Issue:** n/a
- **Dependencies:** none

---------

Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
Ben Chambers 2024-07-22 19:01:21 -07:00 committed by GitHub
parent 3c4652c906
commit 5ac936a284
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 174 additions and 8 deletions

View File

@ -6,20 +6,24 @@ from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor
HierarchyInput,
HierarchyLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
KeybertInput,
KeybertLinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor import (
from .html_link_extractor import (
HtmlInput,
HtmlLinkExtractor,
)
from .link_extractor import (
LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
from .link_extractor_adapter import (
LinkExtractorAdapter,
)
from .link_extractor_transformer import (
LinkExtractorTransformer,
)
__all__ = [
"GLiNERInput",
@ -34,4 +38,5 @@ __all__ = [
"LinkExtractor",
"LinkExtractorAdapter",
"LinkExtractorAdapter",
"LinkExtractorTransformer",
]

View File

@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
@abstractmethod
def extract_one(self, input: InputT) -> set[Link]: # noqa: A002
def extract_one(self, input: InputT) -> Set[Link]:
"""Add edges from each `input` to the corresponding documents.
Args:

View File

@ -0,0 +1,43 @@
from typing import Any, Iterable, Sequence
from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer
from langchain_core.graph_vectorstores.links import copy_with_links
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The extractors to run over each document. A one-shot
                iterable (e.g. a generator) is materialized so the transformer can
                be applied more than once.
        """
        # Materialize once: a generator would be exhausted after the first
        # transform_documents call and silently extract nothing afterwards.
        self.link_extractors = list(link_extractors)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Return copies of `documents` with the extracted links added.

        Implement `transform_documents` directly (rather than per-document), so
        that LinkExtractors which operate better in batch (`extract_many`) get a
        chance to do so.
        """
        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        if not links_per_extractor:
            # zip(*[]) yields nothing, which would silently drop every document.
            # With no extractors there are simply no links to add.
            return [copy_with_links(document) for document in documents]

        # Transpose the list of lists to pair each document with its tuple of links.
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]

View File

@ -0,0 +1,92 @@
from typing import Set
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, get_links
from langchain_community.graph_vectorstores.extractors import (
LinkExtractor,
LinkExtractorTransformer,
)
TEXT1 = "Text1"
TEXT2 = "Text2"
class FakeKeywordExtractor(LinkExtractor[Document]):
    """Test double emitting bidirectional keyword links for the two known texts."""

    def extract_one(self, input: Document) -> Set[Link]:
        # Known page contents map to fixed keyword sets; anything else -> no links.
        keywords_by_text = {
            TEXT1: {"a", "b"},
            TEXT2: {"b", "c"},
        }
        keywords = keywords_by_text.get(input.page_content, set())
        return {Link.bidir(kind="fakekw", tag=keyword) for keyword in keywords}
class FakeHyperlinkExtractor(LinkExtractor[Document]):
    """Test double emitting directed hyperlink links for the two known texts."""

    def extract_one(self, input: Document) -> Set[Link]:
        content = input.page_content
        # Guard-clause style: return the fixed link set for each known text.
        if content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        if content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        # Unknown content is a test setup error, not a silent no-op.
        raise ValueError(
            f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
        )
def test_one_extractor() -> None:
    """A single extractor's links are attached to the corresponding documents."""
    transformer = LinkExtractorTransformer([FakeKeywordExtractor()])

    transformed = transformer.transform_documents(
        [Document(TEXT1), Document(TEXT2)]
    )

    expected_links = [
        {
            Link.bidir(kind="fakekw", tag="a"),
            Link.bidir(kind="fakekw", tag="b"),
        },
        {
            Link.bidir(kind="fakekw", tag="b"),
            Link.bidir(kind="fakekw", tag="c"),
        },
    ]
    for doc, expected in zip(transformed, expected_links):
        assert set(get_links(doc)) == expected
def test_multiple_extractors() -> None:
    """Links from all extractors are merged onto each document."""
    transformer = LinkExtractorTransformer(
        [FakeKeywordExtractor(), FakeHyperlinkExtractor()]
    )

    transformed = transformer.transform_documents(
        [Document(TEXT1), Document(TEXT2)]
    )

    expected_links = [
        {
            Link.bidir(kind="fakekw", tag="a"),
            Link.bidir(kind="fakekw", tag="b"),
            Link.incoming(kind="fakehref", tag="http://text1"),
            Link.outgoing(kind="fakehref", tag="http://text2"),
            Link.outgoing(kind="fakehref", tag="http://text3"),
        },
        {
            Link.bidir(kind="fakekw", tag="b"),
            Link.bidir(kind="fakekw", tag="c"),
            Link.incoming(kind="fakehref", tag="http://text2"),
            Link.outgoing(kind="fakehref", tag="http://text3"),
        },
    ]
    for doc, expected in zip(transformed, expected_links):
        assert set(get_links(doc)) == expected

View File

@ -12,7 +12,7 @@ class Link:
"""
kind: str
"""The kind of link. Allows different extractors to use the same tag name without
"""The kind of link. Allows different extractors to use the same tag name without
creating collisions between extractors. For example keyword vs url."""
direction: Literal["in", "out", "bidir"]
"""The direction of the link."""
@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
links_in_metadata.extend(link)
else:
links_in_metadata.append(link)
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
    """Return a document with the given links added.

    Args:
        doc: The document to add the links to.
        *links: The links to add to the document.

    Returns:
        A document with a shallow-copy of the metadata with the links added.
    """
    # Start from the document's existing links and fold in the new ones;
    # a set de-duplicates repeated links.
    combined = set(get_links(doc))
    for item in links:
        if isinstance(item, Iterable):
            combined.update(item)
        else:
            combined.add(item)

    # Shallow-copy the metadata so the original document is left untouched.
    metadata = dict(doc.metadata)
    metadata[METADATA_LINKS_KEY] = list(combined)
    return Document(page_content=doc.page_content, metadata=metadata)