mirror of https://github.com/hwchase17/langchain
This PR adds support for the Azure Cosmos DB MongoDB vCore Vector Store https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/ https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/vector-search Summary: - **Description:** added vector store integration for Azure Cosmos DB MongoDB vCore Vector Store, - **Issue:** the issue # it fixes #11627, - **Dependencies:** pymongo dependency, - **Tag maintainer:** @hwchase17, - **Twitter handle:** @izzyacademy --------- Co-authored-by: Israel Ekpo <israel.ekpo@gmail.com> Co-authored-by: Israel Ekpo <44282278+izzyacademy@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>pull/11685/head
parent
28ee6a7c12
commit
d0603c86b6
@ -0,0 +1,421 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Dict,
|
||||
Generator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
TypeVar,
|
||||
Union,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from pymongo.collection import Collection
|
||||
|
||||
from langchain.schema.embeddings import Embeddings
|
||||
|
||||
|
||||
# Before Python 3.11 native StrEnum is not available
class CosmosDBSimilarityType(str, Enum):
    """Similarity metrics accepted by the Cosmos DB ``vector-ivf`` index."""

    COS = "COS"  # CosineSimilarity
    IP = "IP"  # inner - product
    L2 = "L2"  # Euclidean distance


# Documents stored in the collection are arbitrary string-keyed mappings.
CosmosDBDocumentType = TypeVar("CosmosDBDocumentType", bound=Dict[str, Any])

logger = logging.getLogger(__name__)

# Number of documents sent per insert_many() call by add_texts().
DEFAULT_INSERT_BATCH_SIZE = 128
||||
class AzureCosmosDBVectorSearch(VectorStore):
    """`Azure Cosmos DB for MongoDB vCore` vector store.

    To use, you should have both:
    - the ``pymongo`` python package installed
    - a connection string associated with a MongoDB VCore Cluster

    Example:
        .. code-block:: python

            from langchain.vectorstores import AzureCosmosDBVectorSearch
            from langchain.embeddings.openai import OpenAIEmbeddings
            from pymongo import MongoClient

            mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
            collection = mongo_client["<db_name>"]["<collection_name>"]
            embeddings = OpenAIEmbeddings()
            vectorstore = AzureCosmosDBVectorSearch(collection, embeddings)
    """

    def __init__(
        self,
        collection: Collection[CosmosDBDocumentType],
        embedding: Embeddings,
        *,
        index_name: str = "vectorSearchIndex",
        text_key: str = "textContent",
        embedding_key: str = "vectorContent",
    ):
        """Constructor for AzureCosmosDBVectorSearch

        Args:
            collection: MongoDB collection to add the texts to.
            embedding: Text embedding model to use.
            index_name: Name of the vector search index created/used
                on the collection.
            text_key: MongoDB field that will contain the text
                for each document.
            embedding_key: MongoDB field that will contain the embedding
                for each document.
        """
        self._collection = collection
        self._embedding = embedding
        self._index_name = index_name
        self._text_key = text_key
        self._embedding_key = embedding_key
|
||||
|
||||
@property
def embeddings(self) -> Embeddings:
    """The embedding model this store uses for queries and inserts."""
    return self._embedding

def get_index_name(self) -> str:
    """Return the name of the vector search index.

    Returns:
        The index name that was supplied at construction time.
    """
    configured_name = self._index_name
    return configured_name
|
||||
|
||||
@classmethod
def from_connection_string(
    cls,
    connection_string: str,
    namespace: str,
    embedding: Embeddings,
    **kwargs: Any,
) -> AzureCosmosDBVectorSearch:
    """Creates an Instance of AzureCosmosDBVectorSearch from a Connection String

    Args:
        connection_string: The MongoDB vCore instance connection string
        namespace: The namespace (database.collection)
        embedding: The embedding utility
        **kwargs: Dynamic keyword arguments forwarded to the constructor

    Returns:
        an instance of the vector store
    """
    # pymongo is an optional dependency; report a clear install hint.
    try:
        from pymongo import MongoClient
    except ImportError:
        raise ImportError(
            "Could not import pymongo, please install it with "
            "`pip install pymongo`."
        )
    client: MongoClient = MongoClient(connection_string)
    database_name, collection_name = namespace.split(".")
    target_collection = client[database_name][collection_name]
    return cls(target_collection, embedding, **kwargs)
|
||||
|
||||
def index_exists(self) -> bool:
    """Checks whether the index configured at construction time exists.

    Returns:
        True when the collection already has an index whose name matches
        ``self._index_name``, False otherwise.
    """
    target_name = self._index_name
    # list_indexes() yields one spec per index; only the name matters here.
    return any(
        index_info["name"] == target_name
        for index_info in self._collection.list_indexes()
    )
|
||||
|
||||
def delete_index(self) -> None:
    """Deletes the index specified during instance construction if it exists"""
    # drop_index raises OperationFailure when the index is absent, so
    # only drop after confirming the index is actually present.
    if not self.index_exists():
        return
    self._collection.drop_index(self._index_name)
|
||||
|
||||
def create_index(
    self,
    num_lists: int = 100,
    dimensions: int = 1536,
    similarity: CosmosDBSimilarityType = CosmosDBSimilarityType.COS,
) -> dict[str, Any]:
    """Creates an index using the index name specified at
    instance construction

    Setting the numLists parameter correctly is important for achieving
    good accuracy and performance.
    Since the vector store uses IVF as the indexing strategy,
    you should create the index only after you
    have loaded a large enough sample documents to ensure that the
    centroids for the respective buckets are
    fairly distributed.

    We recommend that numLists is set to documentCount/1000 for up
    to 1 million documents
    and to sqrt(documentCount) for more than 1 million documents.
    As the number of items in your database grows, you should
    tune numLists to be larger
    in order to achieve good latency performance for vector search.

    If you're experimenting with a new scenario or creating a
    small demo, you can start with numLists
    set to 1 to perform a brute-force search across all vectors.
    This should provide you with the most
    accurate results from the vector search, however be aware that
    the search speed and latency will be slow.
    After your initial setup, you should go ahead and tune
    the numLists parameter using the above guidance.

    Args:
        num_lists: This integer is the number of clusters that the
            inverted file (IVF) index uses to group the vector data.
            We recommend that numLists is set to documentCount/1000
            for up to 1 million documents and to sqrt(documentCount)
            for more than 1 million documents.
            Using a numLists value of 1 is akin to performing
            brute-force search, which has limited performance
        dimensions: Number of dimensions for vector similarity.
            The maximum number of supported dimensions is 2000
        similarity: Similarity metric to use with the IVF index.

            Possible options are:
                - CosmosDBSimilarityType.COS (cosine distance),
                - CosmosDBSimilarityType.L2 (Euclidean distance), and
                - CosmosDBSimilarityType.IP (inner product).

    Returns:
        An object describing the created index
    """
    # prepare the command
    # BUGFIX: index the configured embedding field rather than the
    # hard-coded "vectorContent" literal, so instances constructed with a
    # custom ``embedding_key`` get their vectors indexed (previously the
    # index was always created on "vectorContent" regardless).
    create_index_commands = {
        "createIndexes": self._collection.name,
        "indexes": [
            {
                "name": self._index_name,
                "key": {self._embedding_key: "cosmosSearch"},
                "cosmosSearchOptions": {
                    "kind": "vector-ivf",
                    "numLists": num_lists,
                    "similarity": similarity,
                    "dimensions": dimensions,
                },
            }
        ],
    }

    # retrieve the database object
    current_database = self._collection.database

    # invoke the command from the database object
    create_index_responses: dict[str, Any] = current_database.command(
        create_index_commands
    )

    return create_index_responses
|
||||
|
||||
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[Dict[str, Any]]] = None,
    **kwargs: Any,
) -> List:
    """Embeds the given texts and inserts them into the collection in batches.

    Args:
        texts: Strings to embed and store. May be any iterable,
            including a single-pass iterator/generator.
        metadatas: Optional list of metadata dicts, parallel to ``texts``.
        **kwargs: ``batch_size`` (int) overrides DEFAULT_INSERT_BATCH_SIZE.

    Returns:
        The list of ids of the inserted documents.
    """
    from itertools import repeat

    batch_size = (
        kwargs["batch_size"] if "batch_size" in kwargs else DEFAULT_INSERT_BATCH_SIZE
    )
    # BUGFIX: the previous default, ``({} for _ in texts)``, iterated
    # ``texts`` a second time. When ``texts`` was a single-pass iterator,
    # zip() and that generator each consumed items from it, silently
    # dropping every other text. ``repeat({})`` never touches ``texts``;
    # sharing one dict is safe because it is only read (via ** unpacking).
    _metadatas: Union[List, Iterable] = metadatas or repeat({})
    texts_batch = []
    metadatas_batch = []
    result_ids = []
    for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
        texts_batch.append(text)
        metadatas_batch.append(metadata)
        # Flush a full batch to keep individual insert_many calls bounded.
        if (i + 1) % batch_size == 0:
            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
            texts_batch = []
            metadatas_batch = []
    # Flush the final partial batch, if any.
    if texts_batch:
        result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
    return result_ids
|
||||
|
||||
def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
|
||||
"""Used to Load Documents into the collection
|
||||
|
||||
Args:
|
||||
texts: The list of documents strings to load
|
||||
metadatas: The list of metadata objects associated with each document
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
# If the text is empty, then exit early
|
||||
if not texts:
|
||||
return []
|
||||
|
||||
# Embed and create the documents
|
||||
embeddings = self._embedding.embed_documents(texts)
|
||||
to_insert = [
|
||||
{self._text_key: t, self._embedding_key: embedding, **m}
|
||||
for t, m, embedding in zip(texts, metadatas, embeddings)
|
||||
]
|
||||
# insert the documents in Cosmos DB
|
||||
insert_result = self._collection.insert_many(to_insert) # type: ignore
|
||||
return insert_result.inserted_ids
|
||||
|
||||
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    collection: Optional[Collection[CosmosDBDocumentType]] = None,
    **kwargs: Any,
) -> AzureCosmosDBVectorSearch:
    """Builds the vector store and loads ``texts`` into it.

    ``collection`` is required and must be passed as a keyword argument.
    """
    if collection is None:
        raise ValueError("Must provide 'collection' named parameter.")
    store = cls(collection, embedding, **kwargs)
    store.add_texts(texts, metadatas=metadatas)
    return store
|
||||
|
||||
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
    """Deletes the documents with the given ids from the collection.

    Args:
        ids: Document id strings to remove; required.
        **kwargs: Unused; accepted for VectorStore interface compatibility.

    Returns:
        True once every requested document has been deleted.

    Raises:
        ValueError: If ``ids`` is None.
    """
    if ids is None:
        raise ValueError("No document ids provided to delete.")

    for document_id in ids:
        self.delete_document_by_id(document_id)
    return True
|
||||
|
||||
def delete_document_by_id(self, document_id: Optional[str] = None) -> None:
    """Removes a Specific Document by Id

    Args:
        document_id: The document identifier

    Raises:
        ValueError: If ``document_id`` is None.
        ImportError: If ``bson`` (vendored inside pymongo) is unavailable.
    """
    # Validate the argument before doing any import work.
    if document_id is None:
        raise ValueError("No document id provided to delete.")

    try:
        from bson.objectid import ObjectId
    except ImportError as e:
        # BUGFIX: ``bson`` ships inside pymongo; the standalone ``bson``
        # PyPI package is incompatible with pymongo, so the install hint
        # must point at pymongo, not bson.
        raise ImportError(
            "Unable to import bson, please install with `pip install pymongo`."
        ) from e

    self._collection.delete_one({"_id": ObjectId(document_id)})
|
||||
|
||||
def _similarity_search_with_score(
    self, embeddings: List[float], k: int = 4
) -> List[Tuple[Document, float]]:
    """Runs a cosmosSearch aggregation and pairs each match with its score.

    Args:
        embeddings: The query vector
        k: the number of documents to return

    Returns:
        The k documents closest to the query vector, each paired with
        its similarity score.
    """
    # Stage 1: vector search over the embedding field; returnStoredSource
    # makes the full stored document available downstream.
    search_stage = {
        "$search": {
            "cosmosSearch": {
                "vector": embeddings,
                "path": self._embedding_key,
                "k": k,
            },
            "returnStoredSource": True,
        }
    }
    # Stage 2: surface the search score alongside the stored document.
    projection_stage = {
        "$project": {
            "similarityScore": {"$meta": "searchScore"},
            "document": "$$ROOT",
        }
    }
    pipeline: List[dict[str, Any]] = [search_stage, projection_stage]

    results = []
    for record in self._collection.aggregate(pipeline):
        similarity = record.pop("similarityScore")
        stored_document = record.pop("document")
        # Everything except the text field becomes the document metadata.
        page_text = stored_document.pop(self._text_key)
        doc = Document(page_content=page_text, metadata=stored_document)
        results.append((doc, similarity))
    return results
|
||||
|
||||
def similarity_search_with_score(
    self, query: str, k: int = 4
) -> List[Tuple[Document, float]]:
    """Returns the k documents most similar to ``query``, with scores."""
    query_vector = self._embedding.embed_query(query)
    return self._similarity_search_with_score(embeddings=query_vector, k=k)

def similarity_search(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
    """Returns the k documents most similar to ``query`` (scores dropped)."""
    scored = self.similarity_search_with_score(query, k=k)
    return [document for document, _score in scored]
|
||||
|
||||
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> List[Document]:
    """Returns ``k`` docs re-ranked by maximal marginal relevance.

    Fetches the ``fetch_k`` nearest documents (sorted by similarity,
    descending) and re-ranks them with MMR, trading relevance against
    diversity via ``lambda_mult``.
    """
    # Candidate pool, already ordered by similarity score.
    scored_candidates = self._similarity_search_with_score(embedding, k=fetch_k)

    # The stored embedding for each candidate is available in its metadata
    # because the search stage sets returnStoredSource.
    candidate_vectors = [
        candidate.metadata[self._embedding_key]
        for candidate, _score in scored_candidates
    ]
    selected_indexes = maximal_marginal_relevance(
        np.array(embedding),
        candidate_vectors,
        k=k,
        lambda_mult=lambda_mult,
    )
    return [scored_candidates[index][0] for index in selected_indexes]
|
||||
|
||||
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    **kwargs: Any,
) -> List[Document]:
    """Returns docs for ``query`` selected by maximal marginal relevance."""
    # Embed the query string, then delegate to the vector-based variant.
    query_vector = self._embedding.embed_query(query)
    return self.max_marginal_relevance_search_by_vector(
        query_vector, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult
    )
|
@ -0,0 +1,435 @@
|
||||
"""Test AzureCosmosDBVectorSearch functionality."""
|
||||
import logging
|
||||
import os
|
||||
from time import sleep
|
||||
from typing import Any, Generator, Optional, Union
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores.azure_cosmos_db import (
|
||||
AzureCosmosDBVectorSearch,
|
||||
CosmosDBSimilarityType,
|
||||
)
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)

# Azure OpenAI deployment/model used to produce the test embeddings.
model_deployment = os.getenv(
    "OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
)
model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")

# Vector index and namespace shared by every test in this module.
INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")

# IVF index parameters; a tiny numLists is fine for a handful of test docs.
num_lists = 3
dimensions = 1536
similarity_algorithm = CosmosDBSimilarityType.COS
|
||||
|
||||
|
||||
def prepare_collection() -> Any:
    """Connects to the vCore cluster and returns the shared test collection."""
    from pymongo import MongoClient

    test_client: MongoClient = MongoClient(CONNECTION_STRING)
    return test_client[DB_NAME][COLLECTION_NAME]


@pytest.fixture()
def collection() -> Any:
    """Per-test pymongo collection handle."""
    return prepare_collection()


@pytest.fixture()
def azure_openai_embeddings() -> Any:
    """OpenAI embeddings client configured from the module-level settings."""
    openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
        deployment=model_deployment, model=model_name, chunk_size=1
    )
    return openai_embeddings
|
||||
|
||||
|
||||
"""
|
||||
This is how to run the integration tests:
|
||||
|
||||
cd libs/langchain
|
||||
pytest tests/integration_tests/vectorstores/test_azure_cosmos_db.py
|
||||
"""
|
||||
|
||||
|
||||
class TestAzureCosmosDBVectorSearch:
    """Integration tests for AzureCosmosDBVectorSearch.

    Requires a reachable MongoDB vCore cluster (MONGODB_VCORE_URI) and an
    OpenAI API key for embeddings.
    """

    @classmethod
    def setup_class(cls) -> None:
        # Embeddings are produced through the OpenAI API; fail fast if the
        # key is missing.
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError("OPENAI_API_KEY environment variable is not set")

        # ensure the test collection is empty
        collection = prepare_collection()
        assert collection.count_documents({}) == 0  # type: ignore[index] # noqa: E501

    @classmethod
    def teardown_class(cls) -> None:
        collection = prepare_collection()
        # delete all the documents in the collection
        collection.delete_many({})  # type: ignore[index]

    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        collection = prepare_collection()
        # delete all the documents in the collection so each test starts clean
        collection.delete_many({})  # type: ignore[index]

    @pytest.fixture(scope="class", autouse=True)
    def cosmos_db_url(self) -> Union[str, Generator[str, None, None]]:
        """Return a placeholder database url.

        NOTE(review): the original docstring said "elasticsearch url" — a
        copy/paste artifact; no test in this module visibly consumes this
        fixture's value.
        """
        return "805.555.1212"
|
||||
|
||||
def test_from_documents_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Test end to end construction and search."""
    documents = [
        Document(page_content="Dogs are tough.", metadata={"a": 1}),
        Document(page_content="Cats have fluff.", metadata={"b": 1}),
        Document(page_content="What is a sandwich?", metadata={"c": 1}),
        Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
    ]

    vectorstore = AzureCosmosDBVectorSearch.from_documents(
        documents,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )
    sleep(1)  # waits for Cosmos DB to save contents to the collection

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()

def test_from_documents_inner_product(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Test end to end construction and search."""
    documents = [
        Document(page_content="Dogs are tough.", metadata={"a": 1}),
        Document(page_content="Cats have fluff.", metadata={"b": 1}),
        Document(page_content="What is a sandwich?", metadata={"c": 1}),
        Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
    ]

    vectorstore = AzureCosmosDBVectorSearch.from_documents(
        documents,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )
    sleep(1)  # waits for Cosmos DB to save contents to the collection

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """End-to-end from_texts construction and cosine-distance search."""
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "That fence is purple.",
    ]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output[0].page_content == "What is a sandwich?"
    vectorstore.delete_index()

def test_from_texts_with_metadatas_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """from_texts with per-text metadata; metadata survives the round trip."""
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1

    vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_one(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Loads four texts, deletes the top match by id, verifies it is gone."""
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1

    # Mongo stores _id as an ObjectId; delete_document_by_id accepts the
    # string form and converts back.
    first_document_id_object = output[0].metadata["_id"]
    first_document_id = str(first_document_id_object)

    vectorstore.delete_document_by_id(first_document_id)
    sleep(2)  # waits for the index to be updated

    output2 = vectorstore.similarity_search("Sandwich", k=1)
    assert output2
    assert output2[0].page_content != "What is a sandwich?"

    vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_delete_multiple(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Loads four texts, deletes three by id, and checks exactly one remains."""
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=5)
    assert output
    assert len(output) == 4  # we should see all the four documents

    # CLEANUP: removed two dead bare-expression statements
    # (``output[i].metadata["_id"]``) and stringify every id consistently
    # (previously only the first id went through str()).
    document_ids = [str(result.metadata["_id"]) for result in output[:3]]
    vectorstore.delete(document_ids)
    sleep(2)  # waits for the index to be updated

    output_2 = vectorstore.similarity_search("Sandwich", k=5)
    assert output_2

    assert (
        len(output_2) == 1
    )  # we should see only one document left after three have been deleted

    vectorstore.delete_index()
|
||||
|
||||
def test_from_texts_with_metadatas_inner_product(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Same round trip as the cosine test, but with an inner-product index."""
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()

def test_from_texts_with_metadatas_euclidean_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Same round trip as the cosine test, but with a Euclidean (L2) index."""
    texts = [
        "Dogs are tough.",
        "Cats have fluff.",
        "What is a sandwich?",
        "The fence is purple.",
    ]
    metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        metadatas=metadatas,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.L2)
    sleep(2)  # waits for the index to be set up

    output = vectorstore.similarity_search("Sandwich", k=1)

    assert output
    assert output[0].page_content == "What is a sandwich?"
    assert output[0].metadata["c"] == 1
    vectorstore.delete_index()
|
||||
|
||||
def test_max_marginal_relevance_cosine_distance(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """MMR over a cosine index: top hit is exact, runner-up is diverse."""
    texts = ["foo", "foo", "fou", "foy"]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.COS)
    sleep(2)  # waits for the index to be set up

    query = "foo"
    # Low lambda_mult prioritizes diversity, so the duplicate "foo" should
    # not occupy the second slot.
    output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)

    assert len(output) == len(texts)
    assert output[0].page_content == "foo"
    assert output[1].page_content != "foo"
    vectorstore.delete_index()

def test_max_marginal_relevance_inner_product(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """MMR over an inner-product index: same expectations as the cosine case."""
    texts = ["foo", "foo", "fou", "foy"]
    vectorstore = AzureCosmosDBVectorSearch.from_texts(
        texts,
        azure_openai_embeddings,
        collection=collection,
        index_name=INDEX_NAME,
    )

    # Create the IVF index that will be leveraged later for vector search
    vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
    sleep(2)  # waits for the index to be set up

    query = "foo"
    output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)

    assert len(output) == len(texts)
    assert output[0].page_content == "foo"
    assert output[1].page_content != "foo"
    vectorstore.delete_index()
|
||||
|
||||
def invoke_delete_with_no_args(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> Optional[bool]:
    """Helper: calls delete() with no ids so its ValueError path fires."""
    vectorstore: AzureCosmosDBVectorSearch = (
        AzureCosmosDBVectorSearch.from_connection_string(
            CONNECTION_STRING,
            NAMESPACE,
            azure_openai_embeddings,
            index_name=INDEX_NAME,
        )
    )

    return vectorstore.delete()

def invoke_delete_by_id_with_no_args(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """Helper: calls delete_document_by_id() with no id to trigger its error."""
    vectorstore: AzureCosmosDBVectorSearch = (
        AzureCosmosDBVectorSearch.from_connection_string(
            CONNECTION_STRING,
            NAMESPACE,
            azure_openai_embeddings,
            index_name=INDEX_NAME,
        )
    )

    vectorstore.delete_document_by_id()

def test_invalid_arguments_to_delete(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """delete() without ids must raise the documented ValueError."""
    with pytest.raises(ValueError) as exception_info:
        self.invoke_delete_with_no_args(azure_openai_embeddings, collection)
    assert str(exception_info.value) == "No document ids provided to delete."

def test_no_arguments_to_delete_by_id(
    self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
) -> None:
    """delete_document_by_id() without an id must raise the documented error."""
    with pytest.raises(Exception) as exception_info:
        self.invoke_delete_by_id_with_no_args(azure_openai_embeddings, collection)
    assert str(exception_info.value) == "No document id provided to delete."
|
Loading…
Reference in New Issue