Mirror of https://github.com/hwchase17/langchain (synced 2024-11-10 01:10:59 +00:00)
community[minor]: add document transformer for extracting links (#24186)

- **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document.
- **Issue:** n/a
- **Dependencies:** none

Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
Parent: 3c4652c906
Commit: 5ac936a284
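As a quick orientation before the diff, here is a hedged usage sketch of the new transformer, adapted from the class docstring added in this commit; the sample document content is invented for illustration:

```python
from langchain_core.documents import Document

from langchain_community.graph_vectorstores.extractors import (
    HtmlLinkExtractor,
    LinkExtractorTransformer,
)

# Wrap the HTML extractor so it operates on Documents (as in the docstring
# example), then run it over a batch; links land in each returned copy's
# metadata rather than mutating the inputs.
extract_links = LinkExtractorTransformer(
    [HtmlLinkExtractor().as_document_extractor()]
)
docs = [Document(page_content="<a href='https://example.com/'>example</a>")]
docs_with_links = extract_links.transform_documents(docs)
```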
langchain_community/graph_vectorstores/extractors/__init__.py

```diff
@@ -6,20 +6,24 @@ from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor
     HierarchyInput,
     HierarchyLinkExtractor,
 )
-from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
+from .html_link_extractor import (
     HtmlInput,
     HtmlLinkExtractor,
 )
-from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
+from .keybert_link_extractor import (
     KeybertInput,
     KeybertLinkExtractor,
 )
-from langchain_community.graph_vectorstores.extractors.link_extractor import (
+from .link_extractor import (
     LinkExtractor,
 )
-from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
+from .link_extractor_adapter import (
     LinkExtractorAdapter,
 )
+from .link_extractor_transformer import (
+    LinkExtractorTransformer,
+)
 
 __all__ = [
     "GLiNERInput",
@@ -34,4 +38,5 @@ __all__ = [
     "LinkExtractor",
     "LinkExtractorAdapter",
+    "LinkExtractorTransformer",
 ]
```
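With the `__init__.py` change above, the transformer is re-exported from the package root. A quick illustrative check (not part of the commit):

```python
from langchain_community.graph_vectorstores import extractors

# The new name is part of the package's public __all__.
assert "LinkExtractorTransformer" in extractors.__all__
```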
langchain_community/graph_vectorstores/extractors/link_extractor.py

```diff
@@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
     """Interface for extracting links (incoming, outgoing, bidirectional)."""
 
     @abstractmethod
-    def extract_one(self, input: InputT) -> set[Link]:  # noqa: A002
+    def extract_one(self, input: InputT) -> Set[Link]:
         """Add edges from each `input` to the corresponding documents.
 
         Args:
```
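The abstract `extract_one` above is the only method a concrete extractor must supply; the transformer added in this commit calls `extract_many`, which the interface presumably derives from `extract_one` for batch input. A hypothetical extractor as a sketch (the name `MentionExtractor` and the `@`-prefix rule are invented for illustration):

```python
from typing import Set

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors import LinkExtractor


class MentionExtractor(LinkExtractor[Document]):
    """Hypothetical extractor: one bidirectional link per @mention token."""

    def extract_one(self, input: Document) -> Set[Link]:
        return {
            Link.bidir(kind="mention", tag=word)
            for word in input.page_content.split()
            if word.startswith("@")
        }
```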
langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py (new file, @@ -0,0 +1,43 @@)

```python
from typing import Any, Iterable, Sequence

from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer
from langchain_core.graph_vectorstores.links import copy_with_links

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)


class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document."""
        self.link_extractors = link_extractors

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        # Implement `transform_documents` directly, so that LinkExtractors which
        # operate better in batch (`extract_many`) get a chance to do so.

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # Transpose the list of lists to pair each document with the tuple of links.
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]
```
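The transpose in `transform_documents` is easy to misread, so here is the same `zip(*...)` trick on plain data (strings stand in for `Link` objects; this snippet is illustrative only, not part of the commit):

```python
# Rows: one list of link-sets per extractor (the shape extract_many returns).
links_per_extractor = [
    [{"kw:a", "kw:b"}, {"kw:b", "kw:c"}],  # keyword extractor over doc1, doc2
    [{"href:1"}, {"href:2"}],              # hyperlink extractor over doc1, doc2
]

# Columns: one tuple of link-sets per document, ready for copy_with_links.
links_per_document = list(zip(*links_per_extractor))
assert links_per_document == [
    ({"kw:a", "kw:b"}, {"href:1"}),  # everything extracted from doc1
    ({"kw:b", "kw:c"}, {"href:2"}),  # everything extracted from doc2
]
```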
New unit tests for `LinkExtractorTransformer` (new file, @@ -0,0 +1,92 @@)

```python
from typing import Set

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, get_links

from langchain_community.graph_vectorstores.extractors import (
    LinkExtractor,
    LinkExtractorTransformer,
)

TEXT1 = "Text1"
TEXT2 = "Text2"


class FakeKeywordExtractor(LinkExtractor[Document]):
    def extract_one(self, input: Document) -> Set[Link]:
        kws: Set[str] = set()
        if input.page_content == TEXT1:
            kws = {"a", "b"}
        elif input.page_content == TEXT2:
            kws = {"b", "c"}

        return {Link.bidir(kind="fakekw", tag=kw) for kw in kws}


class FakeHyperlinkExtractor(LinkExtractor[Document]):
    def extract_one(self, input: Document) -> Set[Link]:
        if input.page_content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        elif input.page_content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        else:
            raise ValueError(
                f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
            )


def test_one_extractor() -> None:
    transformer = LinkExtractorTransformer(
        [
            FakeKeywordExtractor(),
        ]
    )
    doc1 = Document(TEXT1)
    doc2 = Document(TEXT2)
    results = transformer.transform_documents([doc1, doc2])

    assert set(get_links(results[0])) == {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
    }

    assert set(get_links(results[1])) == {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
    }


def test_multiple_extractors() -> None:
    transformer = LinkExtractorTransformer(
        [
            FakeKeywordExtractor(),
            FakeHyperlinkExtractor(),
        ]
    )

    doc1 = Document(TEXT1)
    doc2 = Document(TEXT2)

    results = transformer.transform_documents([doc1, doc2])

    assert set(get_links(results[0])) == {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
        Link.incoming(kind="fakehref", tag="http://text1"),
        Link.outgoing(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }

    assert set(get_links(results[1])) == {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
        Link.incoming(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
```
langchain_core/graph_vectorstores/links.py

```diff
@@ -12,7 +12,7 @@ class Link:
     """
 
     kind: str
     """The kind of link. Allows different extractors to use the same tag name without
     creating collisions between extractors. For example “keyword” vs “url”."""
     direction: Literal["in", "out", "bidir"]
     """The direction of the link."""
```
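A small illustration of what the `kind` field buys (assuming `Link` compares by its fields, as its dataclass-style declaration suggests): the same tag under different kinds yields distinct links, so extractors cannot collide on tag names.

```python
from langchain_core.graph_vectorstores.links import Link

keyword = Link.bidir(kind="keyword", tag="python")
url = Link.outgoing(kind="url", tag="python")

# Identical tags do not collide across extractors because kind differs.
assert keyword != url
```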
```diff
@@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
             links_in_metadata.extend(link)
         else:
             links_in_metadata.append(link)
+
+
+def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
+    """Return a document with the given links added.
+
+    Args:
+        doc: The document to add the links to.
+        *links: The links to add to the document.
+
+    Returns:
+        A document with a shallow-copy of the metadata with the links added.
+    """
+    new_links = set(get_links(doc))
+    for link in links:
+        if isinstance(link, Iterable):
+            new_links.update(link)
+        else:
+            new_links.add(link)
+
+    return Document(
+        page_content=doc.page_content,
+        metadata={
+            **doc.metadata,
+            METADATA_LINKS_KEY: list(new_links),
+        },
+    )
```
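Unlike `add_links`, which appends to the document's metadata in place, `copy_with_links` leaves its input untouched. A sketch of that contract (assuming `get_links` returns an empty list for a document with no links yet):

```python
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import (
    Link,
    copy_with_links,
    get_links,
)

doc = Document(page_content="Text1")
linked = copy_with_links(doc, Link.bidir(kind="kw", tag="a"))

assert get_links(doc) == []  # original document unchanged
assert set(get_links(linked)) == {Link.bidir(kind="kw", tag="a")}
```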