Refactor Milvus/Zilliz (#3047)

Refactor the Milvus and Zilliz vector stores to clean up the code and
provide a more consistent experience.

Signed-off-by: Filip Haltmayer <filip.haltmayer@zilliz.com>
fix_agent_callbacks
Filip Haltmayer 1 year ago committed by GitHub
parent 8191c6b81a
commit 215dcc2d26
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -13,6 +13,7 @@ from langchain.vectorstores.pinecone import Pinecone
from langchain.vectorstores.qdrant import Qdrant
from langchain.vectorstores.supabase import SupabaseVectorStore
from langchain.vectorstores.weaviate import Weaviate
from langchain.vectorstores.zilliz import Zilliz
__all__ = [
"ElasticVectorSearch",
@ -22,6 +23,7 @@ __all__ = [
"Weaviate",
"Qdrant",
"Milvus",
"Zilliz",
"Chroma",
"OpenSearchVectorSearch",
"AtlasDB",

File diff suppressed because it is too large Load Diff

@ -0,0 +1,106 @@
from __future__ import annotations
import logging
from typing import Any, List, Optional
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.milvus import Milvus
logger = logging.getLogger(__name__)
class Zilliz(Milvus):
    """`Milvus` vector store subclass with its own index-creation defaults.

    Index creation first tries an ``AUTOINDEX`` index and, if the server
    rejects it, falls back to an HNSW index.
    """

    def _create_index(self) -> None:
        """Create an index on the collection if it does not have one yet.

        Nothing happens unless ``self.col`` is a pymilvus ``Collection`` and
        ``self._get_index()`` returns ``None``.

        When ``self.index_params`` is ``None`` it is set to an ``AUTOINDEX``
        based default. If creating the index with ``self.index_params`` raises
        ``MilvusException``, ``self.index_params`` is overwritten with an HNSW
        configuration and creation is retried once.

        NOTE(review): the HNSW fallback also replaces index params supplied by
        the caller, not only the AUTOINDEX default - confirm this is intended.

        Raises:
            MilvusException: If the HNSW fallback fails as well.
        """
        from pymilvus import Collection, MilvusException

        # Only act on a real collection that has no index so far.
        if not isinstance(self.col, Collection) or self._get_index() is not None:
            return

        try:
            # If no index params, use a default AutoIndex based one
            if self.index_params is None:
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "AUTOINDEX",
                    "params": {},
                }

            try:
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )
            # If default did not work, most likely Milvus self-hosted
            except MilvusException:
                # Use HNSW based index
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "HNSW",
                    "params": {"M": 8, "efConstruction": 64},
                }
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )
            logger.debug(
                "Successfully created an index on collection: %s",
                self.collection_name,
            )
        except MilvusException:
            logger.error(
                "Failed to create an index on collection: %s", self.collection_name
            )
            # Bare raise keeps the original traceback intact.
            raise

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = "LangChainCollection",
        connection_args: Optional[dict[str, Any]] = None,
        consistency_level: str = "Session",
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        drop_old: bool = False,
        **kwargs: Any,
    ) -> Zilliz:
        """Create a Zilliz vector store and insert the given texts into it.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            collection_name (str, optional): Collection name to use. Defaults to
                "LangChainCollection".
            connection_args (Optional[dict[str, Any]], optional): Connection args
                to use. Defaults to None, in which case a new empty dict is
                passed to the constructor.
            consistency_level (str, optional): Which consistency level to use.
                Defaults to "Session".
            index_params (Optional[dict], optional): Which index_params to use.
                Defaults to None.
            search_params (Optional[dict], optional): Which search params to use.
                Defaults to None.
            drop_old (bool, optional): Whether to drop the collection with
                that name if it exists. Defaults to False.
            **kwargs (Any): Extra keyword arguments forwarded to the constructor.

        Returns:
            Zilliz: Zilliz Vector Store
        """
        # Build a fresh dict per call instead of sharing one mutable default
        # object between every invocation.
        if connection_args is None:
            connection_args = {}

        vector_db = cls(
            embedding_function=embedding,
            collection_name=collection_name,
            connection_args=connection_args,
            consistency_level=consistency_level,
            index_params=index_params,
            search_params=search_params,
            drop_old=drop_old,
            **kwargs,
        )
        vector_db.add_texts(texts=texts, metadatas=metadatas)
        return vector_db

@ -9,12 +9,15 @@ from tests.integration_tests.vectorstores.fake_embeddings import (
)
def _milvus_from_texts(metadatas: Optional[List[dict]] = None) -> Milvus:
def _milvus_from_texts(
    metadatas: Optional[List[dict]] = None, drop: bool = True
) -> Milvus:
    """Build a Milvus store from the shared fake texts and fake embeddings.

    Args:
        metadatas: Optional per-text metadata forwarded to ``Milvus.from_texts``.
        drop: Forwarded as ``drop_old``.

    Returns:
        The store returned by ``Milvus.from_texts``.
    """
    # Address of the Milvus server the integration tests talk to.
    server = {"host": "127.0.0.1", "port": "19530"}
    store = Milvus.from_texts(
        fake_texts,
        FakeEmbeddings(),
        metadatas=metadatas,
        connection_args=server,
        drop_old=drop,
    )
    return store
@ -51,3 +54,36 @@ def test_milvus_max_marginal_relevance_search() -> None:
Document(page_content="foo", metadata={"page": 0}),
Document(page_content="baz", metadata={"page": 2}),
]
def test_milvus_add_extra() -> None:
    """Test that texts added after construction are returned by search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": page} for page, _ in enumerate(texts)]

    docsearch = _milvus_from_texts(metadatas=metadatas)
    # Insert a second batch on top of what construction already stored.
    docsearch.add_texts(texts, metadatas)

    output = docsearch.similarity_search("foo", k=10)
    # Presumably 3 texts from construction plus the 3 added here - verify
    # against the size of fake_texts.
    assert len(output) == 6
def test_milvus_no_drop() -> None:
    """Test that building a store with ``drop=False`` keeps existing data."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": page} for page, _ in enumerate(texts)]

    # First build uses the helper's default drop=True.
    docsearch = _milvus_from_texts(metadatas=metadatas)
    del docsearch

    # Second build must not drop, so both batches should be searchable.
    docsearch = _milvus_from_texts(metadatas=metadatas, drop=False)

    output = docsearch.similarity_search("foo", k=10)
    # Presumably 3 texts per build - verify against the size of fake_texts.
    assert len(output) == 6
# if __name__ == "__main__":
# test_milvus()
# test_milvus_with_score()
# test_milvus_max_marginal_relevance_search()
# test_milvus_add_extra()
# test_milvus_no_drop()

@ -0,0 +1,94 @@
"""Test Zilliz functionality."""
from typing import List, Optional
from langchain.docstore.document import Document
from langchain.vectorstores import Zilliz
from tests.integration_tests.vectorstores.fake_embeddings import (
FakeEmbeddings,
fake_texts,
)
def _zilliz_from_texts(
    metadatas: Optional[List[dict]] = None, drop: bool = True
) -> Zilliz:
    """Build a Zilliz store from the shared fake texts and fake embeddings.

    Args:
        metadatas: Optional per-text metadata forwarded to ``Zilliz.from_texts``.
        drop: Forwarded as ``drop_old``.

    Returns:
        The store returned by ``Zilliz.from_texts``.
    """
    # NOTE(review): uri/user/password are blank here; presumably they must be
    # filled in before running against a real instance - confirm.
    credentials = {
        "uri": "",
        "user": "",
        "password": "",
        "secure": True,
    }
    store = Zilliz.from_texts(
        fake_texts,
        FakeEmbeddings(),
        metadatas=metadatas,
        connection_args=credentials,
        drop_old=drop,
    )
    return store
def test_zilliz() -> None:
    """Build a store and check that the top hit for "foo" is the "foo" doc."""
    store = _zilliz_from_texts()
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
def test_zilliz_with_score() -> None:
    """Build a store with metadata and check scored search order and scores."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": page} for page, _ in enumerate(texts)]
    store = _zilliz_from_texts(metadatas=metadatas)

    results = store.similarity_search_with_score("foo", k=3)
    # Each result is indexed as (document, score).
    docs = [entry[0] for entry in results]
    scores = [entry[1] for entry in results]

    expected_docs = [
        Document(page_content="foo", metadata={"page": 0}),
        Document(page_content="bar", metadata={"page": 1}),
        Document(page_content="baz", metadata={"page": 2}),
    ]
    assert docs == expected_docs
    # Scores must be strictly increasing across the three results.
    assert scores[0] < scores[1] < scores[2]
def test_zilliz_max_marginal_relevance_search() -> None:
    """Build a store with metadata and check max marginal relevance search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": page} for page, _ in enumerate(texts)]
    store = _zilliz_from_texts(metadatas=metadatas)

    picked = store.max_marginal_relevance_search("foo", k=2, fetch_k=3)

    expected = [
        Document(page_content="foo", metadata={"page": 0}),
        Document(page_content="baz", metadata={"page": 2}),
    ]
    assert picked == expected
def test_zilliz_add_extra() -> None:
    """Test that texts added after construction are returned by search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": page} for page, _ in enumerate(texts)]

    docsearch = _zilliz_from_texts(metadatas=metadatas)
    # Insert a second batch on top of what construction already stored.
    docsearch.add_texts(texts, metadatas)

    output = docsearch.similarity_search("foo", k=10)
    # Presumably 3 texts from construction plus the 3 added here - verify
    # against the size of fake_texts.
    assert len(output) == 6
def test_zilliz_no_drop() -> None:
    """Test that building a store with ``drop=False`` keeps existing data."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": page} for page, _ in enumerate(texts)]

    # First build uses the helper's default drop=True.
    docsearch = _zilliz_from_texts(metadatas=metadatas)
    del docsearch

    # Second build must not drop, so both batches should be searchable.
    docsearch = _zilliz_from_texts(metadatas=metadatas, drop=False)

    output = docsearch.similarity_search("foo", k=10)
    # Presumably 3 texts per build - verify against the size of fake_texts.
    assert len(output) == 6
# if __name__ == "__main__":
# test_zilliz()
# test_zilliz_with_score()
# test_zilliz_max_marginal_relevance_search()
# test_zilliz_add_extra()
# test_zilliz_no_drop()
Loading…
Cancel
Save