MMR example selector (#377)

implement max marginal relevance example selector
1 year ago · 46c428234f
parent ffed5e0056
commit 46c428234f
6 changed files with 258 additions and 5 deletions
--- a/docs/examples/prompts/prompt_management.ipynb
+++ b/docs/examples/prompts/prompt_management.ipynb
@ -572,8 +572,8 @@
     "text": [
      "Give the antonym of every input\n",
      "\n",
-      "Input: tall\n",
-      "Output: short\n",
+      "Input: happy\n",
+      "Output: sad\n",
      "\n",
      "Input: fat\n",
      "Output:\n"
@ -597,8 +597,8 @@
     "text": [
      "Give the antonym of every input\n",
      "\n",
-      "Input: enthusiastic\n",
-      "Output: apathetic\n",
+      "Input: happy\n",
+      "Output: sad\n",
      "\n",
      "Input: joyful\n",
      "Output:\n"
@ -611,6 +611,110 @@
    "print(similar_prompt.format(adjective=\"joyful\"))"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "bc35afd0",
+   "metadata": {},
+   "source": [
+    "### Maximal Marginal Relevance ExampleSelector\n",
+    "\n",
+    "The MaxMarginalRelevanceExampleSelector selects examples based on a combination of which examples are most similar to the inputs, while also optimizing for diversity. It does this by finding the examples with the embeddings that have the greatest cosine similarity with the inputs, and then iteratively adding them while penalizing them for closeness to already selected examples.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "ac95c968",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.prompts.example_selector import MaxMarginalRelevanceExampleSelector"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "db579bea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "example_selector = MaxMarginalRelevanceExampleSelector.from_examples(\n",
+    "    # This is the list of examples available to select from.\n",
+    "    examples, \n",
+    "    # This is the embedding class used to produce embeddings which are used to measure semantic similarity.\n",
+    "    OpenAIEmbeddings(), \n",
+    "    # This is the VectorStore class that is used to store the embeddings and do a similarity search over.\n",
+    "    FAISS, \n",
+    "    # This is the number of examples to produce.\n",
+    "    k=2\n",
+    ")\n",
+    "mmr_prompt = FewShotPromptTemplate(\n",
+    "    # We provide an ExampleSelector instead of examples.\n",
+    "    example_selector=example_selector,\n",
+    "    example_prompt=example_prompt,\n",
+    "    prefix=\"Give the antonym of every input\",\n",
+    "    suffix=\"Input: {adjective}\\nOutput:\", \n",
+    "    input_variables=[\"adjective\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "cd76e344",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Give the antonym of every input\n",
+      "\n",
+      "Input: happy\n",
+      "Output: sad\n",
+      "\n",
+      "Input: windy\n",
+      "Output: calm\n",
+      "\n",
+      "Input: worried\n",
+      "Output:\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Input is a feeling, so should select the happy/sad example as the first one\n",
+    "print(mmr_prompt.format(adjective=\"worried\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "cf82956b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Give the antonym of every input\n",
+      "\n",
+      "Input: happy\n",
+      "Output: sad\n",
+      "\n",
+      "Input: enthusiastic\n",
+      "Output: apathetic\n",
+      "\n",
+      "Input: worried\n",
+      "Output:\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Let's compare this to what we would just get if we went solely off of similarity\n",
+    "similar_prompt.example_selector.k = 2\n",
+    "print(similar_prompt.format(adjective=\"worried\"))"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "dbc32551",
--- a/langchain/prompts/example_selector/__init__.py
+++ b/langchain/prompts/example_selector/__init__.py
@ -1,7 +1,12 @@
 """Logic for selecting examples to include in prompts."""
 from langchain.prompts.example_selector.length_based import LengthBasedExampleSelector
 from langchain.prompts.example_selector.semantic_similarity import (
+    MaxMarginalRelevanceExampleSelector,
    SemanticSimilarityExampleSelector,
 )

-__all__ = ["LengthBasedExampleSelector", "SemanticSimilarityExampleSelector"]
+__all__ = [
+    "LengthBasedExampleSelector",
+    "SemanticSimilarityExampleSelector",
+    "MaxMarginalRelevanceExampleSelector",
+]
--- a/langchain/prompts/example_selector/semantic_similarity.py
+++ b/langchain/prompts/example_selector/semantic_similarity.py
@ -78,3 +78,59 @@ class SemanticSimilarityExampleSelector(BaseExampleSelector, BaseModel):
            string_examples, embeddings, metadatas=examples, **vectorstore_cls_kwargs
        )
        return cls(vectorstore=vectorstore, k=k)
+
+
+class MaxMarginalRelevanceExampleSelector(SemanticSimilarityExampleSelector, BaseModel):
+    """ExampleSelector that selects examples based on Max Marginal Relevance.
+
+    Examples are fetched from the vector store and reranked so that the
+    returned set trades off similarity to the query against diversity.
+
+    This was shown to improve performance in this paper:
+    https://arxiv.org/pdf/2211.13892.pdf
+    """
+
+    fetch_k: int = 20
+    """Number of examples to fetch to rerank."""
+
+    def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
+        """Select which examples to use based on max marginal relevance.
+
+        Args:
+            input_variables: Prompt inputs; their sorted values are joined
+                with spaces to form the search query.
+
+        Returns:
+            The metadata dict of each selected document, restricted to
+            ``self.example_keys`` when those are set.
+        """
+        search_text = " ".join(sorted_values(input_variables))
+        mmr_docs = self.vectorstore.max_marginal_relevance_search(
+            search_text, k=self.k, fetch_k=self.fetch_k
+        )
+        selected = []
+        for doc in mmr_docs:
+            # This assumes that examples are stored in metadata.
+            example = dict(doc.metadata)
+            if self.example_keys:
+                # Keep only the configured keys of each example.
+                example = {key: example[key] for key in self.example_keys}
+            selected.append(example)
+        return selected
+
+    @classmethod
+    def from_examples(
+        cls,
+        examples: List[dict],
+        embeddings: Embeddings,
+        vectorstore_cls: VectorStore,
+        k: int = 4,
+        fetch_k: int = 20,
+        **vectorstore_cls_kwargs: Any,
+    ) -> MaxMarginalRelevanceExampleSelector:
+        """Create a k-shot MMR example selector from an example list and embeddings.
+
+        Each example is turned into text by space-joining its sorted values and
+        is stored in the vector store with the example dict as its metadata.
+
+        Args:
+            examples: List of examples to use in the prompt.
+            embeddings: An initialized embedding API interface, e.g. OpenAIEmbeddings().
+            vectorstore_cls: A vector store DB interface class, e.g. FAISS.
+            k: Number of examples to select.
+            fetch_k: Number of examples to fetch to rerank.
+            vectorstore_cls_kwargs: Extra keyword arguments forwarded to
+                ``vectorstore_cls.from_texts``.
+
+        Returns:
+            The ExampleSelector instantiated, backed by a vector store.
+        """
+        example_texts = []
+        for example in examples:
+            example_texts.append(" ".join(sorted_values(example)))
+        store = vectorstore_cls.from_texts(
+            example_texts, embeddings, metadatas=examples, **vectorstore_cls_kwargs
+        )
+        return cls(vectorstore=store, k=k, fetch_k=fetch_k)
--- a/langchain/vectorstores/base.py
+++ b/langchain/vectorstores/base.py
@ -29,6 +29,24 @@ class VectorStore(ABC):
    def similarity_search(self, query: str, k: int = 4) -> List[Document]:
        """Return docs most similar to query."""

+    def max_marginal_relevance_search(
+        self, query: str, k: int = 4, fetch_k: int = 20
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        This method is not marked abstract here; a vector store that does not
+        override it raises ``NotImplementedError`` when it is called.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+                Defaults to 20.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+
+        Raises:
+            NotImplementedError: Always, in this base implementation.
+        """
+        raise NotImplementedError
+
    @classmethod
    @abstractmethod
    def from_texts(
--- a/langchain/vectorstores/faiss.py
+++ b/langchain/vectorstores/faiss.py
@ -11,6 +11,7 @@ from langchain.docstore.document import Document
 from langchain.docstore.in_memory import InMemoryDocstore
 from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.utils import maximal_marginal_relevance


 class FAISS(VectorStore):
@ -100,6 +101,37 @@ class FAISS(VectorStore):
            docs.append(doc)
        return docs

+    def max_marginal_relevance_search(
+        self, query: str, k: int = 4, fetch_k: int = 20
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Maximum number of Documents to return. Defaults to 4. Fewer are
+                returned when the index yields fewer than ``k`` candidates.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+
+        Raises:
+            ValueError: If the docstore lookup for a selected id does not
+                return a Document.
+        """
+        embedding = self.embedding_function(query)
+        _, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
+        # -1 happens when not enough docs are returned; keep only real ids so
+        # positions chosen by MMR map back to the same filtered list.
+        candidate_ids = [int(i) for i in indices[0] if i != -1]
+        embeddings = [self.index.reconstruct(i) for i in candidate_ids]
+        # Never request more picks than there are candidates, otherwise the
+        # selection would contain placeholder positions that resolve to -1.
+        mmr_selected = maximal_marginal_relevance(
+            embedding, embeddings, k=min(k, len(embeddings))
+        )
+        docs = []
+        for position in mmr_selected:
+            _id = self.index_to_docstore_id[candidate_ids[position]]
+            doc = self.docstore.search(_id)
+            if not isinstance(doc, Document):
+                raise ValueError(f"Could not find document for id {_id}, got {doc}")
+            docs.append(doc)
+        return docs
+
    @classmethod
    def from_texts(
        cls,
--- a/langchain/vectorstores/utils.py
+++ b/langchain/vectorstores/utils.py
@ -0,0 +1,38 @@
+"""Utility functions for working with vectors and vectorstores."""
+
+from typing import List
+
+import numpy as np
+
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """Calculate cosine similarity of two 1-D vectors with numpy.
+
+    Args:
+        a: First vector.
+        b: Second vector, same length as ``a``.
+
+    Returns:
+        The dot product divided by the product of the norms, as a Python
+        float. Returns 0.0 when either vector has zero norm, where the
+        similarity is undefined, instead of dividing by zero and yielding NaN.
+    """
+    denominator = np.linalg.norm(a) * np.linalg.norm(b)
+    if denominator == 0:
+        # NaN would make every later score comparison evaluate to False.
+        return 0.0
+    return float(np.dot(a, b) / denominator)
+
+
+def maximal_marginal_relevance(
+    query_embedding: np.ndarray,
+    embedding_list: list,
+    lambda_mult: float = 0.5,
+    k: int = 4,
+) -> List[int]:
+    """Calculate maximal marginal relevance.
+
+    Greedily picks candidates one at a time. Each round scores every unpicked
+    candidate as ``lambda_mult * relevance - (1 - lambda_mult) * redundancy``,
+    where relevance is its cosine similarity to the query and redundancy is
+    its highest cosine similarity to any already picked candidate (floored
+    at 0.0), and keeps the best one.
+
+    Args:
+        query_embedding: Embedding of the query.
+        embedding_list: Candidate embeddings to choose from.
+        lambda_mult: Weight given to relevance; ``1 - lambda_mult`` weights
+            the redundancy penalty.
+        k: Maximum number of candidates to pick.
+
+    Returns:
+        Indices into ``embedding_list`` in the order they were picked. Holds
+        at most ``min(k, len(embedding_list))`` entries and never contains
+        a placeholder index.
+    """
+    # Never ask for more picks than there are candidates; otherwise the loop
+    # below would pad the result with the invalid index -1.
+    num_to_select = min(k, len(embedding_list))
+    # Relevance to the query does not depend on what has been picked so far,
+    # so compute it once instead of once per selection round.
+    query_scores = [cosine_similarity(query_embedding, emb) for emb in embedding_list]
+    idxs: List[int] = []
+    while len(idxs) < num_to_select:
+        best_score = -np.inf
+        idx_to_add = -1
+        for i, relevance in enumerate(query_scores):
+            if i in idxs:
+                continue
+            redundancy = 0.0
+            for j in idxs:
+                cos_sim = cosine_similarity(embedding_list[i], embedding_list[j])
+                if cos_sim > redundancy:
+                    redundancy = cos_sim
+            equation_score = lambda_mult * relevance - (1 - lambda_mult) * redundancy
+            if equation_score > best_score:
+                best_score = equation_score
+                idx_to_add = i
+        if idx_to_add == -1:
+            # No remaining candidate produced a comparable score (e.g. NaN
+            # similarities); stop rather than emit an invalid index.
+            break
+        idxs.append(idx_to_add)
+    return idxs