langchain/libs/community/langchain_community/vectorstores/zilliz.py
Jael Gu a1aa3a657c
community[patch]: Milvus supports add & delete texts by ids (#16256)
# Description

To support [langchain
indexing](https://python.langchain.com/docs/modules/data_connection/indexing)
as requested by users, vectorstore Milvus needs to support:
- document addition by id (`add_documents` method with `ids` argument)
- delete by id (`delete` method with `ids` argument)

Example usage:

```python
from langchain.indexes import SQLRecordManager, index
from langchain.schema import Document
from langchain_community.vectorstores import Milvus
from langchain_openai import OpenAIEmbeddings

collection_name = "test_index"
embedding = OpenAIEmbeddings()
vectorstore = Milvus(embedding_function=embedding, collection_name=collection_name)

namespace = f"milvus/{collection_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)
record_manager.create_schema()

doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"})
doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"})

index(
    [doc1, doc1, doc2],
    record_manager,
    vectorstore,
    cleanup="incremental",  # None, "incremental", or "full"
    source_id_key="source",
)
```

# Fix issues

Fix https://github.com/milvus-io/milvus/issues/30112

---------

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2024-01-29 11:19:50 -08:00

197 lines
8.1 KiB
Python

from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores.milvus import Milvus
# Module-level logger; Zilliz._create_index reports index-creation outcomes
# through it.
logger = logging.getLogger(__name__)
class Zilliz(Milvus):
    """`Zilliz` vector store.

    You need to have `pymilvus` installed and a
    running Zilliz database.

    See the following documentation for how to run a Zilliz instance:
    https://docs.zilliz.com/docs/create-cluster

    IF USING L2/IP metric IT IS HIGHLY SUGGESTED TO NORMALIZE YOUR DATA.

    Args:
        embedding_function (Embeddings): Function used to embed the text.
        collection_name (str): Which Zilliz collection to use. Defaults to
            "LangChainCollection".
        connection_args (Optional[dict[str, any]]): The connection args used for
            this class comes in the form of a dict.
        consistency_level (str): The consistency level to use for a collection.
            Defaults to "Session".
        index_params (Optional[dict]): Which index params to use. Defaults to
            HNSW/AUTOINDEX depending on service.
        search_params (Optional[dict]): Which search params to use. Defaults to
            default of index.
        drop_old (Optional[bool]): Whether to drop the current collection. Defaults
            to False.
        auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
            If False, you need to provide text ids (string less than 65535 bytes).
            If True, Milvus will generate unique integers as primary keys.

    The connection args used for this class comes in the form of a dict,
    here are a few of the options:
        address (str): The actual address of Zilliz
            instance. Example address: "localhost:19530"
        uri (str): The uri of Zilliz instance. Example uri:
            "https://in03-ba4234asae.api.gcp-us-west1.zillizcloud.com",
        host (str): The host of Zilliz instance. Default at "localhost",
            PyMilvus will fill in the default host if only port is provided.
        port (str/int): The port of Zilliz instance. Default at 19530, PyMilvus
            will fill in the default port if only host is provided.
        user (str): Use which user to connect to Zilliz instance. If user and
            password are provided, we will add related header in every RPC call.
        password (str): Required when user is provided. The password
            corresponding to the user.
        token (str): API key, for serverless clusters which can be used as
            replacements for user and password.
        secure (bool): Default is false. If set to true, tls will be enabled.
        client_key_path (str): If use tls two-way authentication, need to
            write the client.key path.
        client_pem_path (str): If use tls two-way authentication, need to
            write the client.pem path.
        ca_pem_path (str): If use tls two-way authentication, need to write
            the ca.pem path.
        server_pem_path (str): If use tls one-way authentication, need to
            write the server.pem path.
        server_name (str): If use tls, need to write the common name.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Zilliz
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            # Connect to a Zilliz instance
            zilliz_store = Zilliz(
                embedding_function=embedding,
                collection_name="LangChainCollection",
                connection_args={
                    "uri": "https://in03-ba4234asae.api.gcp-us-west1.zillizcloud.com",
                    "user": "temp",
                    "password": "temp",
                    "token": "temp",  # API key as replacements for user and password
                    "secure": True,
                },
                drop_old=True,
            )

    Raises:
        ValueError: If the pymilvus python package is not installed.
    """

    def _create_index(self) -> None:
        """Create an index on the collection's vector field if none exists.

        Does nothing unless ``self.col`` is a ``pymilvus.Collection`` and
        ``self._get_index()`` returns ``None``.

        When ``self.index_params`` is ``None`` it is set to an L2 ``AUTOINDEX``
        configuration before the first attempt. If that first attempt raises
        ``MilvusException`` -- whether the params were the default or supplied
        by the caller -- a warning is logged, ``self.index_params`` is replaced
        with an L2 ``HNSW`` configuration (``M=8``, ``efConstruction=64``) and
        index creation is retried once.

        Raises:
            MilvusException: If the HNSW retry fails as well.
        """
        from pymilvus import Collection, MilvusException

        # Only act on a real collection for which no index is reported.
        if not isinstance(self.col, Collection) or self._get_index() is not None:
            return

        try:
            # If no index params, use a default AutoIndex based one
            if self.index_params is None:
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "AUTOINDEX",
                    "params": {},
                }

            try:
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )
            # If the first attempt did not work, most likely Milvus self-hosted
            except MilvusException as first_error:
                # Surface the fallback: the params tried above (possibly
                # user-supplied) are about to be replaced on the instance.
                logger.warning(
                    "Index creation with params %s failed on collection: %s "
                    "(%s); retrying with an HNSW index",
                    self.index_params,
                    self.collection_name,
                    first_error,
                )
                # Use HNSW based index
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "HNSW",
                    "params": {"M": 8, "efConstruction": 64},
                }
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )
            logger.debug(
                "Successfully created an index on collection: %s",
                self.collection_name,
            )
        except MilvusException:
            logger.error(
                "Failed to create an index on collection: %s", self.collection_name
            )
            # Bare raise re-raises the active exception with its traceback intact.
            raise

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = "LangChainCollection",
        connection_args: Optional[Dict[str, Any]] = None,
        consistency_level: str = "Session",
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        drop_old: bool = False,
        *,
        ids: Optional[List[str]] = None,
        auto_id: bool = False,
        **kwargs: Any,
    ) -> Zilliz:
        """Construct a Zilliz vector store and insert the given texts into it.

        The store is built with ``cls(...)`` and the texts are then added via
        ``add_texts``. Index selection is left to the store; ``_create_index``
        in this class tries AUTOINDEX first and falls back to HNSW.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            collection_name (str, optional): Collection name to use. Defaults to
                "LangChainCollection".
            connection_args (dict[str, Any], optional): Connection args to use.
                ``None`` (or any falsy value) is passed to the constructor as
                an empty dict.
            consistency_level (str, optional): Which consistency level to use. Defaults
                to "Session".
            index_params (Optional[dict], optional): Which index_params to use.
                Defaults to None.
            search_params (Optional[dict], optional): Which search params to use.
                Defaults to None.
            drop_old (Optional[bool], optional): Whether to drop the collection with
                that name if it exists. Defaults to False.
            ids (Optional[List[str]]): List of text ids.
            auto_id (bool): Whether to enable auto id for primary key. Defaults to
                False. If False, you need to provide text ids (string less than 65535
                bytes). If True, Milvus will generate unique integers as primary keys.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            Zilliz: Zilliz Vector Store
        """
        vector_db = cls(
            embedding_function=embedding,
            collection_name=collection_name,
            # Normalise a missing mapping to an empty dict for the constructor.
            connection_args=connection_args or {},
            consistency_level=consistency_level,
            index_params=index_params,
            search_params=search_params,
            drop_old=drop_old,
            auto_id=auto_id,
            **kwargs,
        )
        vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        return vector_db