Support similarity search by vector (in FAISS) (#961)

Alternate implementation to PR #960. Again, only FAISS is implemented.
If accepted, should this be added to the other vectorstores, or left
raising NotImplementedError for them? Suggestions welcome...
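
For reviewers, a minimal usage sketch of the new surface (mirroring the notebook cell and the added test; `OpenAIEmbeddings` is only an example and needs an API key, any `Embeddings` implementation works):

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(["foo", "bar", "baz"], embeddings)

# Embed the query yourself, then search with the raw vector instead of a string.
embedding_vector = embeddings.embed_query("foo")
docs = docsearch.similarity_search_by_vector(embedding_vector, k=2)

# A score-returning variant is exposed as well.
docs_and_scores = docsearch.similarity_search_with_score_by_vector(embedding_vector, k=2)
```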
seanaedmiston 2023-02-16 17:50:00 +11:00 committed by GitHub
parent 05ad399abe
commit f0a258555b
4 changed files with 144 additions and 23 deletions


@@ -297,6 +297,26 @@
"docs_and_scores[0]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "d5170563",
"metadata": {},
"source": [
"It is also possible to do a search for documents similar to a given embedding vector using `similarity_search_by_vector` which accepts an embedding vector as a parameter instead of a string."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7675b0aa",
"metadata": {},
"outputs": [],
"source": [
"embedding_vector = embeddings.embed_query(query)\n",
"docs_and_scores = docsearch.similarity_search_by_vector(embedding_vector)"
]
},
{
"cell_type": "markdown",
"id": "b386dbb8",


@@ -31,6 +31,20 @@ class VectorStore(ABC):
) -> List[Document]:
"""Return docs most similar to query."""
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the query vector.
"""
raise NotImplementedError
def max_marginal_relevance_search(
self, query: str, k: int = 4, fetch_k: int = 20
) -> List[Document]:
@@ -49,6 +63,24 @@ class VectorStore(ABC):
"""
raise NotImplementedError
def max_marginal_relevance_search_by_vector(
self, embedding: List[float], k: int = 4, fetch_k: int = 20
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Returns:
List of Documents selected by maximal marginal relevance.
"""
raise NotImplementedError
@classmethod
def from_documents(
cls,

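If other vectorstores later pick this up instead of inheriting the `NotImplementedError` stub above, an override could look roughly like this (a hypothetical sketch, not part of this PR; `MyStore` and its `self._backend.query(...)` call stand in for whatever index a store wraps):

```python
from typing import Any, List

from langchain.docstore.document import Document
from langchain.vectorstores.base import VectorStore


class MyStore(VectorStore):
    """Illustrative only; assumes the store keeps a queryable index in self._backend."""

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        # Hand the caller-supplied vector straight to the underlying index,
        # skipping the embed_query step used by the string-based search.
        hits = self._backend.query(vector=embedding, top_k=k)
        return [Document(page_content=hit.text, metadata=hit.metadata) for hit in hits]
```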

@@ -92,6 +92,31 @@ class FAISS(VectorStore):
self.index_to_docstore_id.update(index_to_id)
return [_id for _, _id, _ in full_info]
def similarity_search_with_score_by_vector(
self, embedding: List[float], k: int = 4
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the query and score for each
"""
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
docs = []
for j, i in enumerate(indices[0]):
if i == -1:
# This happens when not enough docs are returned.
continue
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append((doc, scores[0][j]))
return docs
def similarity_search_with_score(
self, query: str, k: int = 4
) -> List[Tuple[Document, float]]:
@@ -105,19 +130,24 @@ class FAISS(VectorStore):
List of Documents most similar to the query and score for each
"""
embedding = self.embedding_function(query)
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
docs = []
for j, i in enumerate(indices[0]):
if i == -1:
# This happens when not enough docs are returned.
continue
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append((doc, scores[0][j]))
docs = self.similarity_search_with_score_by_vector(embedding, k)
return docs
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to embedding vector.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the embedding.
"""
docs_and_scores = self.similarity_search_with_score_by_vector(embedding, k)
return [doc for doc, _ in docs_and_scores]
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
@@ -133,6 +163,38 @@ class FAISS(VectorStore):
docs_and_scores = self.similarity_search_with_score(query, k)
return [doc for doc, _ in docs_and_scores]
def max_marginal_relevance_search_by_vector(
self, embedding: List[float], k: int = 4, fetch_k: int = 20
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
Returns:
List of Documents selected by maximal marginal relevance.
"""
_, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
# -1 happens when not enough docs are returned.
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
mmr_selected = maximal_marginal_relevance(
np.array([embedding], dtype=np.float32), embeddings, k=k
)
selected_indices = [indices[0][i] for i in mmr_selected]
docs = []
for i in selected_indices:
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append(doc)
return docs
def max_marginal_relevance_search(
self, query: str, k: int = 4, fetch_k: int = 20
) -> List[Document]:
@@ -150,18 +212,7 @@ class FAISS(VectorStore):
List of Documents selected by maximal marginal relevance.
"""
embedding = self.embedding_function(query)
_, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
# -1 happens when not enough docs are returned.
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
mmr_selected = maximal_marginal_relevance(embedding, embeddings, k=k)
selected_indices = [indices[0][i] for i in mmr_selected]
docs = []
for i in selected_indices:
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
if not isinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
docs.append(doc)
docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
return docs
@classmethod

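Net effect of the FAISS changes above: the string-based entry points now embed the query and delegate to the new by-vector methods, so with a deterministic embedding function both call styles return the same documents. A small sketch, reusing the `embeddings` and `docsearch` objects from the example in the description (with a live API embedder the vectors, and therefore the results, could differ slightly between calls):

```python
query = "foo"
embedding_vector = embeddings.embed_query(query)

# similarity_search(query) now embeds the query and calls similarity_search_by_vector.
assert docsearch.similarity_search(query, k=1) == \
    docsearch.similarity_search_by_vector(embedding_vector, k=1)

# max_marginal_relevance_search(query) delegates the same way.
assert docsearch.max_marginal_relevance_search(query, k=1, fetch_k=3) == \
    docsearch.max_marginal_relevance_search_by_vector(embedding_vector, k=1, fetch_k=3)
```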

@@ -27,6 +27,24 @@ def test_faiss() -> None:
assert output == [Document(page_content="foo")]
def test_faiss_vector_sim() -> None:
"""Test vector similarity."""
texts = ["foo", "bar", "baz"]
docsearch = FAISS.from_texts(texts, FakeEmbeddings())
index_to_id = docsearch.index_to_docstore_id
expected_docstore = InMemoryDocstore(
{
index_to_id[0]: Document(page_content="foo"),
index_to_id[1]: Document(page_content="bar"),
index_to_id[2]: Document(page_content="baz"),
}
)
assert docsearch.docstore.__dict__ == expected_docstore.__dict__
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.similarity_search_by_vector(query_vec, k=1)
assert output == [Document(page_content="foo")]
def test_faiss_with_metadatas() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]