Support similarity search by vector (in FAISS) (#961)
Alternate implementation to PR #960. Again, only FAISS is implemented. If accepted, this can be added to the other vectorstores or left as NotImplemented. Suggestions welcome...
This commit is contained in:
parent 05ad399abe
commit f0a258555b
@@ -297,6 +297,26 @@
    "docs_and_scores[0]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "d5170563",
   "metadata": {},
   "source": [
    "It is also possible to search for documents similar to a given embedding vector using `similarity_search_by_vector`, which accepts an embedding vector as a parameter instead of a string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7675b0aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_vector = embeddings.embed_query(query)\n",
    "docs = docsearch.similarity_search_by_vector(embedding_vector)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b386dbb8",
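The notebook cell above uses the `embeddings`, `docsearch`, and `query` objects created earlier in the notebook. For readers of this page, here is a self-contained sketch of the same flow; `DummyEmbeddings` is an illustrative stand-in for a real embedding model (e.g. OpenAIEmbeddings), and faiss must be installed for `FAISS.from_texts` to work.

from typing import List

from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS


class DummyEmbeddings(Embeddings):
    """Illustrative, deterministic embeddings: one dimension per known word."""

    vocab = ["foo", "bar", "baz"]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.embed_query(t) for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return [float(text == word) for word in self.vocab]


embeddings = DummyEmbeddings()
docsearch = FAISS.from_texts(["foo", "bar", "baz"], embeddings)

query = "foo"
embedding_vector = embeddings.embed_query(query)

# The new entry point added in this commit: search by a raw embedding vector.
docs = docsearch.similarity_search_by_vector(embedding_vector, k=1)
assert docs == [Document(page_content="foo")]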
@@ -31,6 +31,20 @@ class VectorStore(ABC):
    ) -> List[Document]:
        """Return docs most similar to query."""

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """
        raise NotImplementedError

    def max_marginal_relevance_search(
        self, query: str, k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
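In the base class the new hook deliberately defaults to NotImplementedError, matching the PR description. As a hedged illustration of how another vectorstore could override it, here is a minimal in-memory subclass that brute-forces the search over stored vectors; the `InMemoryVectorStore` class and its internals are hypothetical and not part of this PR.

from typing import Any, Iterable, List, Optional

import numpy as np

from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore


class InMemoryVectorStore(VectorStore):
    """Toy store that keeps embeddings in a plain list and searches brute-force."""

    def __init__(self, embedding: Embeddings) -> None:
        self._embedding = embedding
        self._vectors: List[List[float]] = []
        self._docs: List[Document] = []

    def add_texts(
        self, texts: Iterable[str], metadatas: Optional[List[dict]] = None
    ) -> List[str]:
        texts = list(texts)
        self._vectors.extend(self._embedding.embed_documents(texts))
        self._docs.extend(Document(page_content=t) for t in texts)
        return texts  # using the raw texts as ids for simplicity

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        # Embed the query, then reuse the vector-based entry point.
        return self.similarity_search_by_vector(self._embedding.embed_query(query), k)

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        # Brute-force L2 distance against every stored vector.
        dists = np.linalg.norm(np.asarray(self._vectors) - np.asarray(embedding), axis=1)
        return [self._docs[i] for i in np.argsort(dists)[:k]]

    @classmethod
    def from_texts(cls, texts, embedding, metadatas=None, **kwargs):
        store = cls(embedding)
        store.add_texts(texts)
        return store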
@@ -49,6 +63,24 @@ class VectorStore(ABC):
        """
        raise NotImplementedError

    def max_marginal_relevance_search_by_vector(
        self, embedding: List[float], k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        raise NotImplementedError

    @classmethod
    def from_documents(
        cls,
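The docstring above only states the MMR criterion. As background, here is a small self-contained sketch of the idea, greedily picking items that are similar to the query while penalising similarity to items already picked. It is illustrative only and is not the library's `maximal_marginal_relevance` helper used by the FAISS implementation later in this diff.

from typing import List

import numpy as np


def mmr_select(
    query_vec: np.ndarray,
    candidate_vecs: np.ndarray,
    k: int = 4,
    lambda_mult: float = 0.5,
) -> List[int]:
    """Return indices of k candidates chosen by maximal marginal relevance."""

    def cosine(a: np.ndarray, rows: np.ndarray) -> np.ndarray:
        return (rows @ a) / (np.linalg.norm(rows, axis=1) * np.linalg.norm(a) + 1e-10)

    sim_to_query = cosine(query_vec, candidate_vecs)
    selected = [int(np.argmax(sim_to_query))]
    while len(selected) < min(k, len(candidate_vecs)):
        best_score, best_idx = -np.inf, -1
        for i in range(len(candidate_vecs)):
            if i in selected:
                continue
            # Redundancy = highest similarity to anything already selected.
            redundancy = float(cosine(candidate_vecs[i], candidate_vecs[selected]).max())
            score = lambda_mult * sim_to_query[i] - (1 - lambda_mult) * redundancy
            if score > best_score:
                best_score, best_idx = score, i
        selected.append(best_idx)
    return selected


vecs = np.array([[1.0, 0.0], [0.99, 0.1], [0.0, 1.0]])
print(mmr_select(np.array([1.0, 0.0]), vecs, k=2, lambda_mult=0.3))  # -> [0, 2]: the near-duplicate is skipped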
@@ -92,6 +92,31 @@ class FAISS(VectorStore):
        self.index_to_docstore_id.update(index_to_id)
        return [_id for _, _id, _ in full_info]

    def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to the embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the embedding and a score for each.
        """
        scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
        docs = []
        for j, i in enumerate(indices[0]):
            if i == -1:
                # This happens when not enough docs are returned.
                continue
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append((doc, scores[0][j]))
        return docs

    def similarity_search_with_score(
        self, query: str, k: int = 4
    ) -> List[Tuple[Document, float]]:
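For context on the `index.search` call above, here is a standalone sketch (assuming `faiss-cpu` and `numpy` are installed) of the `(scores, indices)` pair FAISS returns, including the `-1` padding that the loop above skips when fewer than `k` vectors are stored.

import faiss
import numpy as np

dim = 4
index = faiss.IndexFlatL2(dim)
index.add(np.array([[1.0, 0, 0, 0], [0, 2.0, 0, 0], [0, 0, 3.0, 0]], dtype=np.float32))

query = np.array([[1.0, 0, 0, 0]], dtype=np.float32)
scores, indices = index.search(query, 4)  # ask for more neighbours than are stored

print(scores)   # squared L2 distances, one row per query, e.g. [[ 0.  5. 10. ...]]
print(indices)  # e.g. [[ 0  1  2 -1]] -- the trailing -1 is the "not enough docs" case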
@@ -105,19 +130,24 @@ class FAISS(VectorStore):
             List of Documents most similar to the query and score for each
         """
         embedding = self.embedding_function(query)
-        scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
-        docs = []
-        for j, i in enumerate(indices[0]):
-            if i == -1:
-                # This happens when not enough docs are returned.
-                continue
-            _id = self.index_to_docstore_id[i]
-            doc = self.docstore.search(_id)
-            if not isinstance(doc, Document):
-                raise ValueError(f"Could not find document for id {_id}, got {doc}")
-            docs.append((doc, scores[0][j]))
+        docs = self.similarity_search_with_score_by_vector(embedding, k)
         return docs

+    def similarity_search_by_vector(
+        self, embedding: List[float], k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the embedding.
+        """
+        docs_and_scores = self.similarity_search_with_score_by_vector(embedding, k)
+        return [doc for doc, _ in docs_and_scores]
+
     def similarity_search(
         self, query: str, k: int = 4, **kwargs: Any
     ) -> List[Document]:
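A short usage sketch of the refactor above: the query-based and vector-based entry points now share one code path, so embedding a query yourself and calling the `*_by_vector` variant returns the same documents. `ToyEmbeddings` is illustrative only, and faiss must be installed.

from typing import List

from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS


class ToyEmbeddings(Embeddings):
    """Illustrative one-hot embeddings over a tiny vocabulary."""

    words = ["foo", "bar", "baz"]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.embed_query(t) for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return [float(text == w) for w in self.words]


emb = ToyEmbeddings()
docsearch = FAISS.from_texts(["foo", "bar", "baz"], emb)

via_query = docsearch.similarity_search_with_score("foo", k=2)
via_vector = docsearch.similarity_search_with_score_by_vector(emb.embed_query("foo"), k=2)

# Same documents either way; the scores are the FAISS L2 distances.
assert [d.page_content for d, _ in via_query] == [d.page_content for d, _ in via_vector]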
@@ -133,6 +163,38 @@ class FAISS(VectorStore):
        docs_and_scores = self.similarity_search_with_score(query, k)
        return [doc for doc, _ in docs_and_scores]

    def max_marginal_relevance_search_by_vector(
        self, embedding: List[float], k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        _, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
        # -1 happens when not enough docs are returned.
        embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32), embeddings, k=k
        )
        selected_indices = [indices[0][i] for i in mmr_selected]
        docs = []
        for i in selected_indices:
            _id = self.index_to_docstore_id[i]
            doc = self.docstore.search(_id)
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            docs.append(doc)
        return docs

    def max_marginal_relevance_search(
        self, query: str, k: int = 4, fetch_k: int = 20
    ) -> List[Document]:
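For reference, a minimal sketch (again assuming `faiss-cpu` and `numpy`) of the `index.reconstruct` call used above: for a flat index it returns the stored vector at a given position, which is how the MMR step recovers candidate embeddings without keeping a separate copy of them.

import faiss
import numpy as np

dim = 4
index = faiss.IndexFlatL2(dim)
stored = np.array([[1.0, 0, 0, 0], [0, 1.0, 0, 0]], dtype=np.float32)
index.add(stored)

vec = index.reconstruct(1)  # the vector that was added at position 1
assert np.allclose(vec, stored[1])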
@@ -150,18 +212,7 @@ class FAISS(VectorStore):
             List of Documents selected by maximal marginal relevance.
         """
         embedding = self.embedding_function(query)
-        _, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
-        # -1 happens when not enough docs are returned.
-        embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
-        mmr_selected = maximal_marginal_relevance(embedding, embeddings, k=k)
-        selected_indices = [indices[0][i] for i in mmr_selected]
-        docs = []
-        for i in selected_indices:
-            _id = self.index_to_docstore_id[i]
-            doc = self.docstore.search(_id)
-            if not isinstance(doc, Document):
-                raise ValueError(f"Could not find document for id {_id}, got {doc}")
-            docs.append(doc)
+        docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
         return docs

     @classmethod
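Usage sketch for the MMR entry points above, with the same kind of illustrative toy embeddings as before; the exact selection depends on the embedding model and on the `maximal_marginal_relevance` helper, so no particular output is asserted here.

from typing import List

from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS


class TinyEmbeddings(Embeddings):
    """Illustrative one-hot embeddings over a tiny vocabulary."""

    words = ["foo", "bar", "baz"]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.embed_query(t) for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return [float(text == w) for w in self.words]


emb = TinyEmbeddings()
docsearch = FAISS.from_texts(["foo", "bar", "baz"], emb)

# Query-based MMR embeds the query and delegates to the vector-based variant.
docs = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3)
docs_by_vec = docsearch.max_marginal_relevance_search_by_vector(
    emb.embed_query("foo"), k=2, fetch_k=3
)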
@@ -27,6 +27,24 @@ def test_faiss() -> None:
    assert output == [Document(page_content="foo")]


def test_faiss_vector_sim() -> None:
    """Test vector similarity."""
    texts = ["foo", "bar", "baz"]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings())
    index_to_id = docsearch.index_to_docstore_id
    expected_docstore = InMemoryDocstore(
        {
            index_to_id[0]: Document(page_content="foo"),
            index_to_id[1]: Document(page_content="bar"),
            index_to_id[2]: Document(page_content="baz"),
        }
    )
    assert docsearch.docstore.__dict__ == expected_docstore.__dict__
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.similarity_search_by_vector(query_vec, k=1)
    assert output == [Document(page_content="foo")]


def test_faiss_with_metadatas() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
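The tests above depend on `FakeEmbeddings` and `InMemoryDocstore` helpers defined elsewhere in the test suite. For readers of this page, a deterministic test embedding can be as simple as the sketch below; it is illustrative and not necessarily the repo's exact helper.

from typing import List

from langchain.embeddings.base import Embeddings


class SketchFakeEmbeddings(Embeddings):
    """Deterministic embeddings for tests: one-hot over a fixed vocabulary."""

    vocabulary = ["foo", "bar", "baz"]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.embed_query(t) for t in texts]

    def embed_query(self, text: str) -> List[float]:
        return [float(text == word) for word in self.vocabulary]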