mirror of https://github.com/hwchase17/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
436 lines
15 KiB
Python
436 lines
15 KiB
Python
"""Test AzureCosmosDBVectorSearch functionality."""
|
|
import logging
|
|
import os
|
|
from time import sleep
|
|
from typing import Any, Generator, Optional, Union
|
|
|
|
import pytest
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.embeddings import OpenAIEmbeddings
|
|
from langchain_community.vectorstores.azure_cosmos_db import (
|
|
AzureCosmosDBVectorSearch,
|
|
CosmosDBSimilarityType,
|
|
)
|
|
|
|
logging.basicConfig(level=logging.DEBUG)
|
|
|
|
model_deployment = os.getenv(
|
|
"OPENAI_EMBEDDINGS_DEPLOYMENT", "smart-agent-embedding-ada"
|
|
)
|
|
model_name = os.getenv("OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
|
|
|
|
INDEX_NAME = "langchain-test-index"
|
|
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
|
CONNECTION_STRING: str = os.environ.get("MONGODB_VCORE_URI", "")
|
|
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
|
|
|
num_lists = 3
|
|
dimensions = 1536
|
|
similarity_algorithm = CosmosDBSimilarityType.COS
|
|
|
|
|
|
def prepare_collection() -> Any:
|
|
from pymongo import MongoClient
|
|
|
|
test_client: MongoClient = MongoClient(CONNECTION_STRING)
|
|
return test_client[DB_NAME][COLLECTION_NAME]
|
|
|
|
|
|
@pytest.fixture()
|
|
def collection() -> Any:
|
|
return prepare_collection()
|
|
|
|
|
|
@pytest.fixture()
|
|
def azure_openai_embeddings() -> Any:
|
|
openai_embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
|
|
deployment=model_deployment, model=model_name, chunk_size=1
|
|
)
|
|
return openai_embeddings
|
|
|
|
|
|
"""
|
|
This is how to run the integration tests:
|
|
|
|
cd libs/langchain
|
|
pytest tests/integration_tests/vectorstores/test_azure_cosmos_db.py
|
|
"""
|
|
|
|
|
|
class TestAzureCosmosDBVectorSearch:
|
|
@classmethod
|
|
def setup_class(cls) -> None:
|
|
if not os.getenv("OPENAI_API_KEY"):
|
|
raise ValueError("OPENAI_API_KEY environment variable is not set")
|
|
|
|
# insure the test collection is empty
|
|
collection = prepare_collection()
|
|
assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501
|
|
|
|
@classmethod
|
|
def teardown_class(cls) -> None:
|
|
collection = prepare_collection()
|
|
# delete all the documents in the collection
|
|
collection.delete_many({}) # type: ignore[index]
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def setup(self) -> None:
|
|
collection = prepare_collection()
|
|
# delete all the documents in the collection
|
|
collection.delete_many({}) # type: ignore[index]
|
|
|
|
@pytest.fixture(scope="class", autouse=True)
|
|
def cosmos_db_url(self) -> Union[str, Generator[str, None, None]]:
|
|
"""Return the elasticsearch url."""
|
|
return "805.555.1212"
|
|
|
|
def test_from_documents_cosine_distance(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
"""Test end to end construction and search."""
|
|
documents = [
|
|
Document(page_content="Dogs are tough.", metadata={"a": 1}),
|
|
Document(page_content="Cats have fluff.", metadata={"b": 1}),
|
|
Document(page_content="What is a sandwich?", metadata={"c": 1}),
|
|
Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
|
|
]
|
|
|
|
vectorstore = AzureCosmosDBVectorSearch.from_documents(
|
|
documents,
|
|
azure_openai_embeddings,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
sleep(1) # waits for Cosmos DB to save contents to the collection
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
|
|
|
assert output
|
|
assert output[0].page_content == "What is a sandwich?"
|
|
assert output[0].metadata["c"] == 1
|
|
vectorstore.delete_index()
|
|
|
|
def test_from_documents_inner_product(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
"""Test end to end construction and search."""
|
|
documents = [
|
|
Document(page_content="Dogs are tough.", metadata={"a": 1}),
|
|
Document(page_content="Cats have fluff.", metadata={"b": 1}),
|
|
Document(page_content="What is a sandwich?", metadata={"c": 1}),
|
|
Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
|
|
]
|
|
|
|
vectorstore = AzureCosmosDBVectorSearch.from_documents(
|
|
documents,
|
|
azure_openai_embeddings,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
sleep(1) # waits for Cosmos DB to save contents to the collection
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
|
|
|
assert output
|
|
assert output[0].page_content == "What is a sandwich?"
|
|
assert output[0].metadata["c"] == 1
|
|
vectorstore.delete_index()
|
|
|
|
def test_from_texts_cosine_distance(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = [
|
|
"Dogs are tough.",
|
|
"Cats have fluff.",
|
|
"What is a sandwich?",
|
|
"That fence is purple.",
|
|
]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
|
|
|
assert output[0].page_content == "What is a sandwich?"
|
|
vectorstore.delete_index()
|
|
|
|
def test_from_texts_with_metadatas_cosine_distance(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = [
|
|
"Dogs are tough.",
|
|
"Cats have fluff.",
|
|
"What is a sandwich?",
|
|
"The fence is purple.",
|
|
]
|
|
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
metadatas=metadatas,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
|
|
|
assert output
|
|
assert output[0].page_content == "What is a sandwich?"
|
|
assert output[0].metadata["c"] == 1
|
|
|
|
vectorstore.delete_index()
|
|
|
|
def test_from_texts_with_metadatas_delete_one(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = [
|
|
"Dogs are tough.",
|
|
"Cats have fluff.",
|
|
"What is a sandwich?",
|
|
"The fence is purple.",
|
|
]
|
|
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
metadatas=metadatas,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
|
|
|
assert output
|
|
assert output[0].page_content == "What is a sandwich?"
|
|
assert output[0].metadata["c"] == 1
|
|
|
|
first_document_id_object = output[0].metadata["_id"]
|
|
first_document_id = str(first_document_id_object)
|
|
|
|
vectorstore.delete_document_by_id(first_document_id)
|
|
sleep(2) # waits for the index to be updated
|
|
|
|
output2 = vectorstore.similarity_search("Sandwich", k=1)
|
|
assert output2
|
|
assert output2[0].page_content != "What is a sandwich?"
|
|
|
|
vectorstore.delete_index()
|
|
|
|
def test_from_texts_with_metadatas_delete_multiple(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = [
|
|
"Dogs are tough.",
|
|
"Cats have fluff.",
|
|
"What is a sandwich?",
|
|
"The fence is purple.",
|
|
]
|
|
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
metadatas=metadatas,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, similarity_algorithm)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=5)
|
|
|
|
first_document_id_object = output[0].metadata["_id"]
|
|
first_document_id = str(first_document_id_object)
|
|
|
|
output[1].metadata["_id"]
|
|
second_document_id = output[1].metadata["_id"]
|
|
|
|
output[2].metadata["_id"]
|
|
third_document_id = output[2].metadata["_id"]
|
|
|
|
document_ids = [first_document_id, second_document_id, third_document_id]
|
|
vectorstore.delete(document_ids)
|
|
sleep(2) # waits for the index to be updated
|
|
|
|
output_2 = vectorstore.similarity_search("Sandwich", k=5)
|
|
assert output
|
|
assert output_2
|
|
|
|
assert len(output) == 4 # we should see all the four documents
|
|
assert (
|
|
len(output_2) == 1
|
|
) # we should see only one document left after three have been deleted
|
|
|
|
vectorstore.delete_index()
|
|
|
|
def test_from_texts_with_metadatas_inner_product(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = [
|
|
"Dogs are tough.",
|
|
"Cats have fluff.",
|
|
"What is a sandwich?",
|
|
"The fence is purple.",
|
|
]
|
|
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
metadatas=metadatas,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
|
|
|
assert output
|
|
assert output[0].page_content == "What is a sandwich?"
|
|
assert output[0].metadata["c"] == 1
|
|
vectorstore.delete_index()
|
|
|
|
def test_from_texts_with_metadatas_euclidean_distance(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = [
|
|
"Dogs are tough.",
|
|
"Cats have fluff.",
|
|
"What is a sandwich?",
|
|
"The fence is purple.",
|
|
]
|
|
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
metadatas=metadatas,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.L2)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
|
|
|
assert output
|
|
assert output[0].page_content == "What is a sandwich?"
|
|
assert output[0].metadata["c"] == 1
|
|
vectorstore.delete_index()
|
|
|
|
def test_max_marginal_relevance_cosine_distance(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = ["foo", "foo", "fou", "foy"]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.COS)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
query = "foo"
|
|
output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)
|
|
|
|
assert len(output) == len(texts)
|
|
assert output[0].page_content == "foo"
|
|
assert output[1].page_content != "foo"
|
|
vectorstore.delete_index()
|
|
|
|
def test_max_marginal_relevance_inner_product(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
texts = ["foo", "foo", "fou", "foy"]
|
|
vectorstore = AzureCosmosDBVectorSearch.from_texts(
|
|
texts,
|
|
azure_openai_embeddings,
|
|
collection=collection,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
|
|
# Create the IVF index that will be leveraged later for vector search
|
|
vectorstore.create_index(num_lists, dimensions, CosmosDBSimilarityType.IP)
|
|
sleep(2) # waits for the index to be set up
|
|
|
|
query = "foo"
|
|
output = vectorstore.max_marginal_relevance_search(query, k=10, lambda_mult=0.1)
|
|
|
|
assert len(output) == len(texts)
|
|
assert output[0].page_content == "foo"
|
|
assert output[1].page_content != "foo"
|
|
vectorstore.delete_index()
|
|
|
|
def invoke_delete_with_no_args(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> Optional[bool]:
|
|
vectorstore: AzureCosmosDBVectorSearch = (
|
|
AzureCosmosDBVectorSearch.from_connection_string(
|
|
CONNECTION_STRING,
|
|
NAMESPACE,
|
|
azure_openai_embeddings,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
)
|
|
|
|
return vectorstore.delete()
|
|
|
|
def invoke_delete_by_id_with_no_args(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
vectorstore: AzureCosmosDBVectorSearch = (
|
|
AzureCosmosDBVectorSearch.from_connection_string(
|
|
CONNECTION_STRING,
|
|
NAMESPACE,
|
|
azure_openai_embeddings,
|
|
index_name=INDEX_NAME,
|
|
)
|
|
)
|
|
|
|
vectorstore.delete_document_by_id()
|
|
|
|
def test_invalid_arguments_to_delete(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
with pytest.raises(ValueError) as exception_info:
|
|
self.invoke_delete_with_no_args(azure_openai_embeddings, collection)
|
|
assert str(exception_info.value) == "No document ids provided to delete."
|
|
|
|
def test_no_arguments_to_delete_by_id(
|
|
self, azure_openai_embeddings: OpenAIEmbeddings, collection: Any
|
|
) -> None:
|
|
with pytest.raises(Exception) as exception_info:
|
|
self.invoke_delete_by_id_with_no_args(azure_openai_embeddings, collection)
|
|
assert str(exception_info.value) == "No document id provided to delete."
|