Implement `max_marginal_relevance_search` in `VectorStore` of Pinecone (#6056)

This adds implementation of MMR search in pinecone; and I have two semi-related observations about this vector store class: - Maybe we should also have a `similarity_search_by_vector_returning_embeddings` like in supabase, but it's not in the base `VectorStore` class so I didn't implement - Talking about the base class, there's `similarity_search_with_relevance_scores`, but in pinecone it is called `similarity_search_with_score`; maybe we should consider renaming it to align with other `VectorStore` base and sub classes (or add that as an alias for backward compatibility) #### Who can review? Tag maintainers/contributors who might be interested: - VectorStores / Retrievers / Memory - @dev2049
1 year ago · f9edf76e7c
parent 970b2f9d38
commit f9edf76e7c
2 changed files with 124 additions and 4 deletions
--- a/docs/modules/indexes/vectorstores/examples/pinecone.ipynb
+++ b/docs/modules/indexes/vectorstores/examples/pinecone.ipynb
@ -24,7 +24,7 @@
   },
   "outputs": [],
   "source": [
-    "!pip install pinecone-client"
+    "!pip install pinecone-client openai tiktoken"
   ]
  },
  {
@ -70,7 +70,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "id": "aac9563e",
   "metadata": {
    "tags": []
@ -85,7 +85,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "id": "a3c3999a",
   "metadata": {},
   "outputs": [],
@ -135,13 +135,51 @@
    "print(docs[0].page_content)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "d46d1452",
   "metadata": {},
   "source": [
    "### Maximal Marginal Relevance Searches\n",
    "\n",
    "In addition to using similarity search in the retriever object, you can also use `mmr` as retriever.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a359ed74",
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
    "retriever = docsearch.as_retriever(search_type=\"mmr\")\n",
    "matched_docs = retriever.get_relevant_documents(query)\n",
    "for i, d in enumerate(matched_docs):\n",
    "    print(f\"\\n## Document {i}\\n\")\n",
    "    print(d.page_content)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "7c477287",
   "metadata": {},
   "source": [
    "Or use `max_marginal_relevance_search` directly:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ca82740",
   "metadata": {},
   "outputs": [],
   "source": [
    "found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)\n",
    "for i, doc in enumerate(found_docs):\n",
    "    print(f\"{i + 1}.\", doc.page_content, \"\\n\")"
   ]
  }
 ],
 "metadata": {
--- a/langchain/vectorstores/pinecone.py
+++ b/langchain/vectorstores/pinecone.py
@ -5,9 +5,12 @@ import logging
 import uuid
 from typing import Any, Callable, Iterable, List, Optional, Tuple
 import numpy as np
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.utils import maximal_marginal_relevance
 logger = logging.getLogger(__name__)
@ -157,6 +160,85 @@ class Pinecone(VectorStore):
        )
        return [doc for doc, _ in docs_and_scores]
    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.
        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        if namespace is None:
            namespace = self._namespace
        results = self._index.query(
            [embedding],
            top_k=fetch_k,
            include_values=True,
            include_metadata=True,
            namespace=namespace,
            filter=filter,
        )
        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32),
            [item["values"] for item in results["matches"]],
            k=k,
            lambda_mult=lambda_mult,
        )
        selected = [results["matches"][i]["metadata"] for i in mmr_selected]
        return [
            Document(page_content=metadata.pop((self._text_key)), metadata=metadata)
            for metadata in selected
        ]
    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.
        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.
        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        embedding = self._embedding_function(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding, k, fetch_k, lambda_mult, filter, namespace
        )
    @classmethod
    def from_texts(
        cls,