mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
186 lines
7.4 KiB
Python
186 lines
7.4 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
import logging
|
||
|
from typing import Any, Dict, List, Optional
|
||
|
|
||
|
from langchain_core.embeddings import Embeddings
|
||
|
|
||
|
from langchain_community.vectorstores.milvus import Milvus
|
||
|
|
||
|
# Module-level logger, named after this module, used for index-creation messages.
logger = logging.getLogger(__name__)
|
||
|
|
||
|
|
||
|
class Zilliz(Milvus):
    """`Zilliz` vector store.

    You need to have `pymilvus` installed and a
    running Zilliz database.

    See the following documentation for how to run a Zilliz instance:
    https://docs.zilliz.com/docs/create-cluster


    IF USING L2/IP metric IT IS HIGHLY SUGGESTED TO NORMALIZE YOUR DATA.

    Args:
        embedding_function (Embeddings): Function used to embed the text.
        collection_name (str): Which Zilliz collection to use. Defaults to
            "LangChainCollection".
        connection_args (Optional[dict[str, Any]]): The connection args used for
            this class comes in the form of a dict.
        consistency_level (str): The consistency level to use for a collection.
            Defaults to "Session".
        index_params (Optional[dict]): Which index params to use. When omitted,
            an AUTOINDEX index is tried first and an HNSW index is used if
            that attempt fails.
        search_params (Optional[dict]): Which search params to use. Defaults to
            default of index.
        drop_old (Optional[bool]): Whether to drop the current collection. Defaults
            to False.

    The connection args used for this class comes in the form of a dict,
    here are a few of the options:
        address (str): The actual address of Zilliz
            instance. Example address: "localhost:19530"
        uri (str): The uri of Zilliz instance. Example uri:
            "https://in03-ba4234asae.api.gcp-us-west1.zillizcloud.com",
        host (str): The host of Zilliz instance. Default at "localhost",
            PyMilvus will fill in the default host if only port is provided.
        port (str/int): The port of Zilliz instance. Default at 19530, PyMilvus
            will fill in the default port if only host is provided.
        user (str): Use which user to connect to Zilliz instance. If user and
            password are provided, we will add related header in every RPC call.
        password (str): Required when user is provided. The password
            corresponding to the user.
        token (str): API key, for serverless clusters which can be used as
            replacements for user and password.
        secure (bool): Default is false. If set to true, tls will be enabled.
        client_key_path (str): If use tls two-way authentication, need to
            write the client.key path.
        client_pem_path (str): If use tls two-way authentication, need to
            write the client.pem path.
        ca_pem_path (str): If use tls two-way authentication, need to write
            the ca.pem path.
        server_pem_path (str): If use tls one-way authentication, need to
            write the server.pem path.
        server_name (str): If use tls, need to write the common name.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Zilliz
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            # Connect to a Zilliz instance
            zilliz_store = Zilliz(
                embedding_function=embedding,
                collection_name="LangChainCollection",
                connection_args={
                    "uri": "https://in03-ba4234asae.api.gcp-us-west1.zillizcloud.com",
                    "user": "temp",
                    "password": "temp",
                    "token": "temp",  # API key as replacements for user and password
                    "secure": True,
                },
                drop_old=True,
            )

    Raises:
        ValueError: If the pymilvus python package is not installed.
    """

    def _create_index(self) -> None:
        """Create an index on the collection's vector field if none exists.

        Does nothing unless ``self.col`` is a pymilvus ``Collection`` and
        ``self._get_index()`` returns ``None``. When ``self.index_params`` is
        ``None`` an L2 ``AUTOINDEX`` configuration is used. If the first
        ``create_index`` call raises ``MilvusException``, ``self.index_params``
        is replaced with an L2 ``HNSW`` configuration and creation is retried.

        Raises:
            MilvusException: If the retry with the HNSW configuration fails too.
        """
        # Imported lazily so the module can be imported without pymilvus.
        from pymilvus import Collection, MilvusException

        # Guard clause: nothing to do without a collection, or when an index
        # already exists. The isinstance check short-circuits _get_index().
        if not isinstance(self.col, Collection) or self._get_index() is not None:
            return

        try:
            # If no index params, use a default AutoIndex based one
            if self.index_params is None:
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "AUTOINDEX",
                    "params": {},
                }

            try:
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )
            # If default did not work, most likely Milvus self-hosted
            except MilvusException:
                # NOTE(review): this fallback also discards caller-supplied
                # index_params (including a non-L2 metric_type); kept as-is
                # for backward compatibility, but made visible in the logs.
                logger.debug(
                    "Index creation with %s failed on collection: %s; "
                    "retrying with an HNSW index",
                    self.index_params,
                    self.collection_name,
                )
                # Use HNSW based index
                self.index_params = {
                    "metric_type": "L2",
                    "index_type": "HNSW",
                    "params": {"M": 8, "efConstruction": 64},
                }
                self.col.create_index(
                    self._vector_field,
                    index_params=self.index_params,
                    using=self.alias,
                )
            logger.debug(
                "Successfully created an index on collection: %s",
                self.collection_name,
            )

        except MilvusException:
            logger.error(
                "Failed to create an index on collection: %s", self.collection_name
            )
            # Bare raise re-raises the active exception with its traceback intact.
            raise

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection_name: str = "LangChainCollection",
        connection_args: Optional[Dict[str, Any]] = None,
        consistency_level: str = "Session",
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        drop_old: bool = False,
        **kwargs: Any,
    ) -> Zilliz:
        """Create a Zilliz vector store and insert the given texts.

        Args:
            texts (List[str]): Text data.
            embedding (Embeddings): Embedding function.
            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
                Defaults to None.
            collection_name (str, optional): Collection name to use. Defaults to
                "LangChainCollection".
            connection_args (dict[str, Any], optional): Connection args to use.
                Defaults to None, in which case an empty dict is passed to
                the constructor.
            consistency_level (str, optional): Which consistency level to use. Defaults
                to "Session".
            index_params (Optional[dict], optional): Which index_params to use.
                Defaults to None.
            search_params (Optional[dict], optional): Which search params to use.
                Defaults to None.
            drop_old (Optional[bool], optional): Whether to drop the collection with
                that name if it exists. Defaults to False.
            **kwargs (Any): Extra keyword arguments forwarded to the constructor.

        Returns:
            Zilliz: Zilliz Vector Store
        """
        vector_db = cls(
            embedding_function=embedding,
            collection_name=collection_name,
            connection_args=connection_args or {},
            consistency_level=consistency_level,
            index_params=index_params,
            search_params=search_params,
            drop_old=drop_old,
            **kwargs,
        )
        vector_db.add_texts(texts=texts, metadatas=metadatas)
        return vector_db
|