community[patch], docs: Add API reference doc for GraphVectorStore (#25751)

This commit is contained in:
Christophe Bornet 2024-08-31 02:42:00 +02:00 committed by GitHub
parent 28e2ec7603
commit 0a752a74cc
GPG Key ID: B5690EEEBB952194
5 changed files with 204 additions and 16 deletions

View File

@@ -1,3 +1,161 @@
from langchain_community.graph_vectorstores.cassandra import CassandraGraphVectorStore
__all__ = ["CassandraGraphVectorStore"]
"""**Graph Vector Store**
Sometimes embedding models don't capture all the important relationships between
documents.
Graph Vector Stores are an extension to both vector stores and retrievers that allow
documents to be explicitly connected to each other.
Graph vector store retrievers use both vector similarity and links to find documents
related to an unstructured query.
Graphs allow linking between documents.
Each document identifies tags that link to and from it.
For example, a paragraph of text may be linked to URLs based on the anchor tags in
its content and linked from the URL(s) at which it is published.
Link extractors can be used to extract links from documents.
Example:
.. code-block:: python
from langchain_community.graph_vectorstores import CassandraGraphVectorStore
from langchain_community.graph_vectorstores.extractors import HtmlInput, HtmlLinkExtractor
from langchain_community.graph_vectorstores.links import add_links

# `document` is an existing Document whose page_content is HTML.
graph_vector_store = CassandraGraphVectorStore()
link_extractor = HtmlLinkExtractor()
links = link_extractor.extract_one(HtmlInput(document.page_content, "http://mysite"))
add_links(document, links)
graph_vector_store.add_documents([document])
***********
Get started
***********
We load the State of the Union text and split it into chunked documents.
.. code-block:: python
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
raw_documents = TextLoader("state_of_the_union.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
Links can be added to documents manually, but it's easier to use a
:class:`~langchain_community.graph_vectorstores.extractors.LinkExtractor`.
Several common link extractors are available and you can build your own.
For this guide, we'll use the
:class:`~langchain_community.graph_vectorstores.extractors.KeybertLinkExtractor`,
which uses the KeyBERT model to tag documents with keywords and uses these keywords
to create links between documents.
.. code-block:: python
from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor
from langchain_community.graph_vectorstores.links import add_links
extractor = KeybertLinkExtractor()
for doc in documents:
add_links(doc, extractor.extract_one(doc))
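To see what the extractor produced, the links attached to a document can be read back
with ``get_links`` (an optional sanity check; this assumes ``get_links`` is exported
from the same ``links`` module as ``add_links``).

.. code-block:: python

from langchain_community.graph_vectorstores.links import get_links

# Each link carries a kind, a direction, and a tag (here: a keyword).
for link in get_links(documents[0]):
    print(link.kind, link.direction, link.tag)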
***********************************************
Create the graph vector store and add documents
***********************************************
We'll use an Apache Cassandra or Astra DB database as an example.
We create a :class:`~langchain_community.graph_vectorstores.CassandraGraphVectorStore`
from the documents and an :class:`~langchain_openai.OpenAIEmbeddings` model.
.. code-block:: python
import cassio
from langchain_community.graph_vectorstores import CassandraGraphVectorStore
from langchain_openai import OpenAIEmbeddings
# Initialize cassio and the Cassandra session from the environment variables
cassio.init(auto=True)
store = CassandraGraphVectorStore.from_documents(
embedding=OpenAIEmbeddings(),
documents=documents,
)
*****************
Similarity search
*****************
If we don't traverse the graph, a graph vector store behaves like a regular vector
store, so all methods available in a vector store are also available in a graph
vector store.
The :meth:`~langchain_community.graph_vectorstores.base.GraphVectorStore.similarity_search`
method returns documents similar to a query without considering
the links between documents.
.. code-block:: python
docs = store.similarity_search(
"What did the president say about Ketanji Brown Jackson?"
)
****************
Traversal search
****************
The :meth:`~langchain_community.graph_vectorstores.base.GraphVectorStore.traversal_search`
method returns documents similar to a query considering the links
between documents. It first does a similarity search and then traverses the graph to
find linked documents.
.. code-block:: python
docs = list(
store.traversal_search("What did the president say about Ketanji Brown Jackson?")
)
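The traversal can be tuned with keyword arguments. The sketch below assumes
``traversal_search`` accepts a ``depth`` keyword argument (the same option exposed
through the retriever's ``search_kwargs``); check the method signature for the exact
parameters.

.. code-block:: python

# Follow links up to two edges away from the initially retrieved documents.
docs = list(
    store.traversal_search(
        "What did the president say about Ketanji Brown Jackson?",
        depth=2,
    )
)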
*************
Async methods
*************
The graph vector store has async versions of its methods, prefixed with ``a``.
.. code-block:: python
docs = [
doc
async for doc in store.atraversal_search(
"What did the president say about Ketanji Brown Jackson?"
)
]
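The non-traversal methods follow the same pattern; for example, ``asimilarity_search``
is the async counterpart of ``similarity_search``.

.. code-block:: python

docs = await store.asimilarity_search(
    "What did the president say about Ketanji Brown Jackson?"
)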
****************************
Graph vector store retriever
****************************
The graph vector store can be converted to a retriever.
It is similar to the vector store retriever, but it also supports graph traversal
search types such as ``traversal`` and ``mmr_traversal``.
.. code-block:: python
retriever = store.as_retriever(search_type="mmr_traversal")
docs = retriever.invoke("What did the president say about Ketanji Brown Jackson?")
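The options listed in
:meth:`~langchain_community.graph_vectorstores.base.GraphVectorStore.as_retriever` can
be passed through ``search_kwargs``; the values below are only illustrative.

.. code-block:: python

# Tune the MMR traversal: number of results, traversal depth, MMR candidate
# pool, and diversity (parameter names from the as_retriever documentation).
retriever = store.as_retriever(
    search_type="mmr_traversal",
    search_kwargs={"k": 4, "depth": 2, "fetch_k": 20, "lambda_mult": 0.5},
)
docs = retriever.invoke("What did the president say about Ketanji Brown Jackson?")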
""" # noqa: E501
from langchain_community.graph_vectorstores.base import (
GraphVectorStore,
GraphVectorStoreRetriever,
Node,
)
from langchain_community.graph_vectorstores.cassandra import CassandraGraphVectorStore
from langchain_community.graph_vectorstores.links import (
Link,
)
__all__ = [
"GraphVectorStore",
"GraphVectorStoreRetriever",
"Node",
"Link",
"CassandraGraphVectorStore",
]

View File

@@ -0,0 +1,7 @@
from langchain_core.graph_vectorstores.base import (
GraphVectorStore,
GraphVectorStoreRetriever,
Node,
)
__all__ = ["GraphVectorStore", "GraphVectorStoreRetriever", "Node"]

View File

@@ -0,0 +1,8 @@
from langchain_core.graph_vectorstores.links import (
Link,
add_links,
copy_with_links,
get_links,
)
__all__ = ["Link", "add_links", "get_links", "copy_with_links"]

View File

@@ -38,10 +38,11 @@ class Node(Serializable):
Edges exist from nodes with an outgoing link to nodes with a matching incoming link.
For instance two nodes `a` and `b` connected over a hyperlink `https://some-url`
For instance two nodes `a` and `b` connected over a hyperlink ``https://some-url``
would look like:
.. code-block:: python
[
Node(
id="a",
@@ -118,6 +119,13 @@ def _documents_to_nodes(documents: Iterable[Document]) -> Iterator[Node]:
@beta()
def nodes_to_documents(nodes: Iterable[Node]) -> Iterator[Document]:
"""Convert nodes to documents.
Args:
nodes: The nodes to convert to documents.
Returns:
The documents generated from the nodes.
"""
for node in nodes:
metadata = node.metadata.copy()
metadata[METADATA_LINKS_KEY] = [
@@ -594,19 +602,24 @@ class GraphVectorStore(VectorStore):
"""Return GraphVectorStoreRetriever initialized from this GraphVectorStore.
Args:
search_type (Optional[str]): Defines the type of search that
the Retriever should perform.
Can be "traversal" (default), "similarity", "mmr", or
"similarity_score_threshold".
search_kwargs (Optional[Dict]): Keyword arguments to pass to the
search function. Can include things like:
k: Amount of documents to return (Default: 4)
depth: The maximum depth of edges to traverse (Default: 1)
score_threshold: Minimum relevance threshold
for similarity_score_threshold
fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
lambda_mult: Diversity of results returned by MMR;
1 for minimum diversity and 0 for maximum. (Default: 0.5)
**kwargs: Keyword arguments to pass to the search function.
Can include:
- search_type (Optional[str]): Defines the type of search that
the Retriever should perform.
Can be ``traversal`` (default), ``similarity``, ``mmr``, ``mmr_traversal``,
or ``similarity_score_threshold``.
- search_kwargs (Optional[Dict]): Keyword arguments to pass to the
search function. Can include things like:
- k (int): Number of documents to return (Default: 4).
- depth (int): The maximum depth of edges to traverse (Default: 1).
- score_threshold (float): Minimum relevance threshold
for similarity_score_threshold.
- fetch_k (int): Number of documents to pass to the MMR algorithm
(Default: 20).
- lambda_mult (float): Diversity of results returned by MMR;
1 for minimum diversity and 0 for maximum (Default: 0.5).
Returns:
Retriever for this GraphVectorStore.
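For example, a retriever restricted to high-scoring similarity matches could be
configured as follows (an illustrative sketch using the parameters listed above).

.. code-block:: python

# Only return documents whose relevance score is at least 0.5.
retriever = store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 4, "score_threshold": 0.5},
)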

View File

@@ -43,6 +43,7 @@ METADATA_LINKS_KEY = "links"
@beta()
def get_links(doc: Document) -> List[Link]:
"""Get the links from a document.
Args:
doc: The document to get the link tags from.
Returns:
@@ -60,6 +61,7 @@ def get_links(doc: Document) -> List[Link]:
@beta()
def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
"""Add links to the given metadata.
Args:
doc: The document to add the links to.
*links: The links to add to the document.
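As a usage sketch for these helpers (this assumes ``Link.bidir`` constructs a
bidirectional link alongside ``Link.incoming`` and ``Link.outgoing``; adjust if the
``Link`` constructors differ):

.. code-block:: python

from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link, add_links, get_links

doc = Document(page_content="NASA mission overview")
# Attach a keyword link by hand, then read the document's links back.
add_links(doc, Link.bidir(kind="keyword", tag="nasa"))
print(get_links(doc))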