mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Support similarity search by vector (in FAISS) (#961)
Alternate implementation to PR #960. Again, only FAISS is implemented. If accepted, this can be added to the other vectorstores, or they can be left as NotImplemented. Suggestions welcome.
This commit is contained in:
parent
05ad399abe
commit
f0a258555b
@ -297,6 +297,26 @@
|
|||||||
"docs_and_scores[0]"
|
"docs_and_scores[0]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"attachments": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d5170563",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"It is also possible to do a search for documents similar to a given embedding vector using `similarity_search_by_vector` which accepts an embedding vector as a parameter instead of a string."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7675b0aa",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"embedding_vector = embeddings.embed_query(query)\n",
|
||||||
|
"docs_and_scores = docsearch.similarity_search_by_vector(embedding_vector)"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "b386dbb8",
|
"id": "b386dbb8",
|
||||||
|
@ -31,6 +31,20 @@ class VectorStore(ABC):
|
|||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""Return docs most similar to query."""
|
"""Return docs most similar to query."""
|
||||||
|
|
||||||
|
def similarity_search_by_vector(
|
||||||
|
self, embedding: List[float], k: int = 4, **kwargs: Any
|
||||||
|
) -> List[Document]:
|
||||||
|
"""Return docs most similar to embedding vector.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
embedding: Embedding to look up documents similar to.
|
||||||
|
k: Number of Documents to return. Defaults to 4.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of Documents most similar to the query vector.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def max_marginal_relevance_search(
|
def max_marginal_relevance_search(
|
||||||
self, query: str, k: int = 4, fetch_k: int = 20
|
self, query: str, k: int = 4, fetch_k: int = 20
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
@ -49,6 +63,24 @@ class VectorStore(ABC):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def max_marginal_relevance_search_by_vector(
|
||||||
|
self, embedding: List[float], k: int = 4, fetch_k: int = 20
|
||||||
|
) -> List[Document]:
|
||||||
|
"""Return docs selected using the maximal marginal relevance.
|
||||||
|
|
||||||
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||||
|
among selected documents.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
embedding: Embedding to look up documents similar to.
|
||||||
|
k: Number of Documents to return. Defaults to 4.
|
||||||
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of Documents selected by maximal marginal relevance.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_documents(
|
def from_documents(
|
||||||
cls,
|
cls,
|
||||||
|
@ -92,6 +92,31 @@ class FAISS(VectorStore):
|
|||||||
self.index_to_docstore_id.update(index_to_id)
|
self.index_to_docstore_id.update(index_to_id)
|
||||||
return [_id for _, _id, _ in full_info]
|
return [_id for _, _id, _ in full_info]
|
||||||
|
|
||||||
|
def similarity_search_with_score_by_vector(
|
||||||
|
self, embedding: List[float], k: int = 4
|
||||||
|
) -> List[Tuple[Document, float]]:
|
||||||
|
"""Return docs most similar to embedding vector.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
embedding: Embedding to look up documents similar to.
|
||||||
|
k: Number of Documents to return. Defaults to 4.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of Documents most similar to the embedding and score for each
|
||||||
|
"""
|
||||||
|
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
|
||||||
|
docs = []
|
||||||
|
for j, i in enumerate(indices[0]):
|
||||||
|
if i == -1:
|
||||||
|
# This happens when not enough docs are returned.
|
||||||
|
continue
|
||||||
|
_id = self.index_to_docstore_id[i]
|
||||||
|
doc = self.docstore.search(_id)
|
||||||
|
if not isinstance(doc, Document):
|
||||||
|
raise ValueError(f"Could not find document for id {_id}, got {doc}")
|
||||||
|
docs.append((doc, scores[0][j]))
|
||||||
|
return docs
|
||||||
|
|
||||||
def similarity_search_with_score(
|
def similarity_search_with_score(
|
||||||
self, query: str, k: int = 4
|
self, query: str, k: int = 4
|
||||||
) -> List[Tuple[Document, float]]:
|
) -> List[Tuple[Document, float]]:
|
||||||
@ -105,19 +130,24 @@ class FAISS(VectorStore):
|
|||||||
List of Documents most similar to the query and score for each
|
List of Documents most similar to the query and score for each
|
||||||
"""
|
"""
|
||||||
embedding = self.embedding_function(query)
|
embedding = self.embedding_function(query)
|
||||||
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
|
docs = self.similarity_search_with_score_by_vector(embedding, k)
|
||||||
docs = []
|
|
||||||
for j, i in enumerate(indices[0]):
|
|
||||||
if i == -1:
|
|
||||||
# This happens when not enough docs are returned.
|
|
||||||
continue
|
|
||||||
_id = self.index_to_docstore_id[i]
|
|
||||||
doc = self.docstore.search(_id)
|
|
||||||
if not isinstance(doc, Document):
|
|
||||||
raise ValueError(f"Could not find document for id {_id}, got {doc}")
|
|
||||||
docs.append((doc, scores[0][j]))
|
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
def similarity_search_by_vector(
|
||||||
|
self, embedding: List[float], k: int = 4, **kwargs: Any
|
||||||
|
) -> List[Document]:
|
||||||
|
"""Return docs most similar to embedding vector.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
embedding: Embedding to look up documents similar to.
|
||||||
|
k: Number of Documents to return. Defaults to 4.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of Documents most similar to the embedding.
|
||||||
|
"""
|
||||||
|
docs_and_scores = self.similarity_search_with_score_by_vector(embedding, k)
|
||||||
|
return [doc for doc, _ in docs_and_scores]
|
||||||
|
|
||||||
def similarity_search(
|
def similarity_search(
|
||||||
self, query: str, k: int = 4, **kwargs: Any
|
self, query: str, k: int = 4, **kwargs: Any
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
@ -133,6 +163,38 @@ class FAISS(VectorStore):
|
|||||||
docs_and_scores = self.similarity_search_with_score(query, k)
|
docs_and_scores = self.similarity_search_with_score(query, k)
|
||||||
return [doc for doc, _ in docs_and_scores]
|
return [doc for doc, _ in docs_and_scores]
|
||||||
|
|
||||||
|
def max_marginal_relevance_search_by_vector(
|
||||||
|
self, embedding: List[float], k: int = 4, fetch_k: int = 20
|
||||||
|
) -> List[Document]:
|
||||||
|
"""Return docs selected using the maximal marginal relevance.
|
||||||
|
|
||||||
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||||
|
among selected documents.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
embedding: Embedding to look up documents similar to.
|
||||||
|
k: Number of Documents to return. Defaults to 4.
|
||||||
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of Documents selected by maximal marginal relevance.
|
||||||
|
"""
|
||||||
|
_, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
|
||||||
|
# -1 happens when not enough docs are returned.
|
||||||
|
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
|
||||||
|
mmr_selected = maximal_marginal_relevance(
|
||||||
|
np.array([embedding], dtype=np.float32), embeddings, k=k
|
||||||
|
)
|
||||||
|
selected_indices = [indices[0][i] for i in mmr_selected]
|
||||||
|
docs = []
|
||||||
|
for i in selected_indices:
|
||||||
|
_id = self.index_to_docstore_id[i]
|
||||||
|
doc = self.docstore.search(_id)
|
||||||
|
if not isinstance(doc, Document):
|
||||||
|
raise ValueError(f"Could not find document for id {_id}, got {doc}")
|
||||||
|
docs.append(doc)
|
||||||
|
return docs
|
||||||
|
|
||||||
def max_marginal_relevance_search(
|
def max_marginal_relevance_search(
|
||||||
self, query: str, k: int = 4, fetch_k: int = 20
|
self, query: str, k: int = 4, fetch_k: int = 20
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
@ -150,18 +212,7 @@ class FAISS(VectorStore):
|
|||||||
List of Documents selected by maximal marginal relevance.
|
List of Documents selected by maximal marginal relevance.
|
||||||
"""
|
"""
|
||||||
embedding = self.embedding_function(query)
|
embedding = self.embedding_function(query)
|
||||||
_, indices = self.index.search(np.array([embedding], dtype=np.float32), fetch_k)
|
docs = self.max_marginal_relevance_search_by_vector(embedding, k, fetch_k)
|
||||||
# -1 happens when not enough docs are returned.
|
|
||||||
embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1]
|
|
||||||
mmr_selected = maximal_marginal_relevance(embedding, embeddings, k=k)
|
|
||||||
selected_indices = [indices[0][i] for i in mmr_selected]
|
|
||||||
docs = []
|
|
||||||
for i in selected_indices:
|
|
||||||
_id = self.index_to_docstore_id[i]
|
|
||||||
doc = self.docstore.search(_id)
|
|
||||||
if not isinstance(doc, Document):
|
|
||||||
raise ValueError(f"Could not find document for id {_id}, got {doc}")
|
|
||||||
docs.append(doc)
|
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -27,6 +27,24 @@ def test_faiss() -> None:
|
|||||||
assert output == [Document(page_content="foo")]
|
assert output == [Document(page_content="foo")]
|
||||||
|
|
||||||
|
|
||||||
|
def test_faiss_vector_sim() -> None:
|
||||||
|
"""Test vector similarity."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings())
|
||||||
|
index_to_id = docsearch.index_to_docstore_id
|
||||||
|
expected_docstore = InMemoryDocstore(
|
||||||
|
{
|
||||||
|
index_to_id[0]: Document(page_content="foo"),
|
||||||
|
index_to_id[1]: Document(page_content="bar"),
|
||||||
|
index_to_id[2]: Document(page_content="baz"),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
assert docsearch.docstore.__dict__ == expected_docstore.__dict__
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.similarity_search_by_vector(query_vec, k=1)
|
||||||
|
assert output == [Document(page_content="foo")]
|
||||||
|
|
||||||
|
|
||||||
def test_faiss_with_metadatas() -> None:
|
def test_faiss_with_metadatas() -> None:
|
||||||
"""Test end to end construction and search."""
|
"""Test end to end construction and search."""
|
||||||
texts = ["foo", "bar", "baz"]
|
texts = ["foo", "bar", "baz"]
|
||||||
|
Loading…
Reference in New Issue
Block a user