community: Add docstring for KeybertLinkExtractor (#26210)

Co-authored-by: Erick Friis <erick@langchain.dev>
2024-11-10 01:10:59 +00:00 · 2024-09-10 02:26:29 +02:00 · 2024-09-10 02:26:29 +02:00 · e235a572a0
commit e235a572a0
parent bab9de581c
1 changed files with 99 additions and 7 deletions
--- a/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/keybert_link_extractor.py
@@ -20,22 +20,114 @@ class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
        embedding_model: str = "all-MiniLM-L6-v2",
        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
    ):
-        """Extract keywords using KeyBERT <https://maartengr.github.io/KeyBERT/>.
+        """Extract keywords using `KeyBERT <https://maartengr.github.io/KeyBERT/>`_.

-        Example:
+        KeyBERT is a minimal and easy-to-use keyword extraction technique that
+        leverages BERT embeddings to create keywords and keyphrases that are most
+        similar to a document.

-            .. code-block:: python
+        The KeybertLinkExtractor uses KeyBERT to create links between documents that
+        have keywords in common.

-                extractor = KeybertLinkExtractor()
+        Example::

-                results = extractor.extract_one(PAGE_1)
+            extractor = KeybertLinkExtractor()
+            results = extractor.extract_one("lorem ipsum...")
+
+        .. seealso::
+
+            - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
+            - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`
+
+        How to link Documents on common keywords using KeyBERT
+        ======================================================
+
+        Preliminaries
+        -------------
+
+        Install the keybert package:
+
+        .. code-block:: bash
+
+            pip install -q langchain_community keybert
+
+        Usage
+        -----
+
+        We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we
+        extract keyword links and add them to the chunk.
+
+        Using extract_one()
+        ^^^^^^^^^^^^^^^^^^^
+
+        We can use :meth:`extract_one` on a document to get the links and add the links
+        to the document metadata with
+        :func:`~langchain_core.graph_vectorstores.links.add_links`::
+
+            from langchain_community.document_loaders import TextLoader
+            from langchain_community.graph_vectorstores import CassandraGraphVectorStore
+            from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor
+            from langchain_core.graph_vectorstores.links import add_links
+            from langchain_text_splitters import CharacterTextSplitter
+
+            loader = TextLoader("state_of_the_union.txt")
+
+            raw_documents = loader.load()
+            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+
+            documents = text_splitter.split_documents(raw_documents)
+            keyword_extractor = KeybertLinkExtractor()
+
+            for document in documents:
+                links = keyword_extractor.extract_one(document)
+                add_links(document, links)
+
+            print(documents[0].metadata)
+
+        .. code-block:: output
+
+            {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]}
+
+        Using LinkExtractorTransformer
+        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+        Using the :class:`~langchain_community.graph_vectorstores.extractors.link_extractor_transformer.LinkExtractorTransformer`,
+        we can simplify the link extraction::
+
+            from langchain_community.document_loaders import TextLoader
+            from langchain_community.graph_vectorstores.extractors import (
+                KeybertLinkExtractor,
+                LinkExtractorTransformer,
+            )
+            from langchain_text_splitters import CharacterTextSplitter
+
+            loader = TextLoader("state_of_the_union.txt")
+            raw_documents = loader.load()
+
+            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+            documents = text_splitter.split_documents(raw_documents)
+
+            transformer = LinkExtractorTransformer([KeybertLinkExtractor()])
+            documents = transformer.transform_documents(documents)
+
+            print(documents[0].metadata)
+
+        .. code-block:: output
+
+            {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]}
+
+        The documents with keyword links can then be added to a :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::
+
+            from langchain_community.graph_vectorstores import CassandraGraphVectorStore
+
+            store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...)

        Args:
            kind: Kind of links to produce with this extractor.
            embedding_model: Name of the embedding model to use with KeyBERT.
            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
-                `extract_keywords` method.
-        """
+                ``extract_keywords`` method.
+        """  # noqa: E501
        try:
            import keybert