mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
a1aa3a657c
# Description

To support [langchain indexing](https://python.langchain.com/docs/modules/data_connection/indexing) as requested by users, the Milvus vector store needs to support:

- document addition by id (`add_documents` method with `ids` argument)
- delete by id (`delete` method with `ids` argument)

Example usage:

```python
from langchain.indexes import SQLRecordManager, index
from langchain.schema import Document
from langchain_community.vectorstores import Milvus
from langchain_openai import OpenAIEmbeddings

collection_name = "test_index"
embedding = OpenAIEmbeddings()
vectorstore = Milvus(embedding_function=embedding, collection_name=collection_name)

namespace = f"milvus/{collection_name}"
record_manager = SQLRecordManager(
    namespace, db_url="sqlite:///record_manager_cache.sql"
)
record_manager.create_schema()

doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"})
doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"})

index(
    [doc1, doc1, doc2],
    record_manager,
    vectorstore,
    cleanup="incremental",  # None, "incremental", or "full"
    source_id_key="source",
)
```

# Fix issues

Fix https://github.com/milvus-io/milvus/issues/30112

---------

Signed-off-by: Jael Gu <mengjia.gu@zilliz.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
197 lines
8.1 KiB
Python
197 lines
8.1 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from langchain_core.embeddings import Embeddings
|
|
|
|
from langchain_community.vectorstores.milvus import Milvus
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Zilliz(Milvus):
    """`Zilliz` vector store.

    You need to have `pymilvus` installed and a
    running Zilliz database.

    See the following documentation for how to run a Zilliz instance:
    https://docs.zilliz.com/docs/create-cluster

    IF USING L2/IP metric IT IS HIGHLY SUGGESTED TO NORMALIZE YOUR DATA.

    Args:
        embedding_function (Embeddings): Function used to embed the text.
        collection_name (str): Which Zilliz collection to use. Defaults to
            "LangChainCollection".
        connection_args (Optional[dict[str, Any]]): The connection args used for
            this class come in the form of a dict.
        consistency_level (str): The consistency level to use for a collection.
            Defaults to "Session".
        index_params (Optional[dict]): Which index params to use. Defaults to
            AUTOINDEX, falling back to HNSW if the service rejects it.
        search_params (Optional[dict]): Which search params to use. Defaults to
            default of index.
        drop_old (Optional[bool]): Whether to drop the current collection. Defaults
            to False.
        auto_id (bool): Whether to enable auto id for primary key. Defaults to False.
            If False, you need to provide text ids (string less than 65535 bytes).
            If True, Milvus will generate unique integers as primary keys.

    The connection args used for this class come in the form of a dict,
    here are a few of the options:
        address (str): The actual address of Zilliz
            instance. Example address: "localhost:19530"
        uri (str): The uri of Zilliz instance. Example uri:
            "https://in03-ba4234asae.api.gcp-us-west1.zillizcloud.com",
        host (str): The host of Zilliz instance. Default at "localhost",
            PyMilvus will fill in the default host if only port is provided.
        port (str/int): The port of Zilliz instance. Default at 19530, PyMilvus
            will fill in the default port if only host is provided.
        user (str): Use which user to connect to Zilliz instance. If user and
            password are provided, we will add related header in every RPC call.
        password (str): Required when user is provided. The password
            corresponding to the user.
        token (str): API key, for serverless clusters which can be used as
            replacements for user and password.
        secure (bool): Default is false. If set to true, tls will be enabled.
        client_key_path (str): If use tls two-way authentication, need to
            write the client.key path.
        client_pem_path (str): If use tls two-way authentication, need to
            write the client.pem path.
        ca_pem_path (str): If use tls two-way authentication, need to write
            the ca.pem path.
        server_pem_path (str): If use tls one-way authentication, need to
            write the server.pem path.
        server_name (str): If use tls, need to write the common name.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Zilliz
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            # Connect to a Zilliz instance
            zilliz_store = Zilliz(
                embedding_function=embedding,
                collection_name="LangChainCollection",
                connection_args={
                    "uri": "https://in03-ba4234asae.api.gcp-us-west1.zillizcloud.com",
                    "user": "temp",
                    "password": "temp",
                    "token": "temp",  # API key as replacements for user and password
                    "secure": True,
                },
                drop_old=True,
            )

    Raises:
        ValueError: If the pymilvus python package is not installed.
    """

    def _create_index(self) -> None:
        """Create an index on the collection's vector field if it has none.

        Nothing happens unless ``self.col`` is a pymilvus ``Collection`` and
        ``self._get_index()`` reports that no index exists yet.

        When ``self.index_params`` is ``None`` an AUTOINDEX/L2 configuration is
        used. If index creation with the current params raises a
        ``MilvusException``, the params are replaced by an HNSW/L2
        configuration and creation is retried once; the replacement is logged
        as a warning because it also overrides caller-supplied params.

        Raises:
            MilvusException: If index creation still fails after the HNSW
                fallback.
        """
        from pymilvus import Collection, MilvusException

        # Guard clause: only act on a real collection that has no index yet.
        if not isinstance(self.col, Collection) or self._get_index() is not None:
            return

        try:
            # If no index params, use a default AutoIndex based one
            if self.index_params is None:
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "AUTOINDEX",
                    "params": {},
                }

            try:
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )

            # If default did not work, most likely Milvus self-hosted
            except MilvusException:
                # Keep the best-effort fallback, but make it visible: the
                # params that were requested are about to be overwritten.
                logger.warning(
                    "Could not create an index on collection %s with index "
                    "params %s; falling back to an HNSW index",
                    self.collection_name,
                    self.index_params,
                )
                # Use HNSW based index
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "HNSW",
                    "params": {"M": 8, "efConstruction": 64},
                }
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )
            logger.debug(
                "Successfully created an index on collection: %s",
                self.collection_name,
            )

        except MilvusException:
            logger.error(
                "Failed to create an index on collection: %s", self.collection_name
            )
            # Bare raise re-raises the active exception with its traceback intact.
            raise

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = "LangChainCollection",
        connection_args: Optional[Dict[str, Any]] = None,
        consistency_level: str = "Session",
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        drop_old: bool = False,
        *,
        ids: Optional[List[str]] = None,
        auto_id: bool = False,
        **kwargs: Any,
    ) -> Zilliz:
        """Create a Zilliz vector store and insert the given texts into it.

        The store is constructed with the supplied settings and the texts are
        then added through ``add_texts``.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            collection_name (str, optional): Collection name to use. Defaults to
                "LangChainCollection".
            connection_args (dict[str, Any], optional): Connection args to use.
                Defaults to None, in which case an empty dict is passed to the
                constructor.
            consistency_level (str, optional): Which consistency level to use. Defaults
                to "Session".
            index_params (Optional[dict], optional): Which index_params to use.
                Defaults to None.
            search_params (Optional[dict], optional): Which search params to use.
                Defaults to None.
            drop_old (Optional[bool], optional): Whether to drop the collection with
                that name if it exists. Defaults to False.
            ids (Optional[List[str]]): List of text ids.
            auto_id (bool): Whether to enable auto id for primary key. Defaults to
                False. If False, you need to provide text ids (string less than 65535
                bytes). If True, Milvus will generate unique integers as primary keys.
            **kwargs (Any): Extra keyword arguments forwarded to the constructor.

        Returns:
            Zilliz: Zilliz Vector Store
        """
        vector_db = cls(
            embedding_function=embedding,
            collection_name=collection_name,
            connection_args=connection_args or {},
            consistency_level=consistency_level,
            index_params=index_params,
            search_params=search_params,
            drop_old=drop_old,
            auto_id=auto_id,
            **kwargs,
        )
        vector_db.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        return vector_db
|