diff --git a/libs/community/langchain_community/vectorstores/deeplake.py b/libs/community/langchain_community/vectorstores/deeplake.py index 659c24c6ca..7240d166f6 100644 --- a/libs/community/langchain_community/vectorstores/deeplake.py +++ b/libs/community/langchain_community/vectorstores/deeplake.py @@ -60,7 +60,7 @@ class DeepLake(VectorStore): embedding: Optional[Embeddings] = None, embedding_function: Optional[Embeddings] = None, read_only: bool = False, - ingestion_batch_size: int = 1000, + ingestion_batch_size: int = 1024, num_workers: int = 0, verbose: bool = True, exec_option: Optional[str] = None, @@ -85,8 +85,19 @@ class DeepLake(VectorStore): ... ) Args: - dataset_path (str): Path to existing dataset or where to create - a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH. + dataset_path (str): The full path for storing to the Deep Lake + Vector Store. It can be: + - a Deep Lake cloud path of the form ``hub://org_id/dataset_name``. + Requires registration with Deep Lake. + - an s3 path of the form ``s3://bucketname/path/to/dataset``. + Credentials are required in either the environment or passed to + the creds argument. + - a local file system path of the form ``./path/to/dataset`` + or ``~/path/to/dataset`` or ``path/to/dataset``. + - a memory path of the form ``mem://path/to/dataset`` which doesn't + save the dataset but keeps it in memory instead. + Should be used only for testing as it does not persist. + Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH. token (str, optional): Activeloop token, for fetching credentials to the dataset at path if it is a Deep Lake dataset. Tokens are normally autogenerated. Optional. @@ -98,25 +109,29 @@ class DeepLake(VectorStore): read_only (bool): Open dataset in read-only mode. Default is False. ingestion_batch_size (int): During data ingestion, data is divided into batches. Batch size is the size of each batch. - Default is 1000. + Default is 1024. 
num_workers (int): Number of workers to use during data ingestion. Default is 0. verbose (bool): Print dataset summary after each operation. Default is True. - exec_option (str, optional): DeepLakeVectorStore supports 3 ways to perform - searching - "python", "compute_engine", "tensor_db" and auto. - Default is None. + exec_option (str, optional): Default method for search execution. + It could be either ``"auto"``, ``"python"``, ``"compute_engine"`` + or ``"tensor_db"``. Defaults to ``None``. + If None, it's set to ``"auto"``. - ``auto``- Selects the best execution method based on the storage location of the Vector Store. It is the default option. - - ``python`` - Pure-python implementation that runs on the client. - WARNING: using this with big datasets can lead to memory - issues. Data can be stored anywhere. - - ``compute_engine`` - C++ implementation of the Deep Lake Compute - Engine that runs on the client. Can be used for any data stored in - or connected to Deep Lake. Not for in-memory or local datasets. - - ``tensor_db`` - Hosted Managed Tensor Database that is - responsible for storage and query execution. Only for data stored in - the Deep Lake Managed Database. Use runtime = {"db_engine": True} + - ``python`` - Pure-python implementation that runs on the client and + can be used for data stored anywhere. WARNING: using this option + with big datasets is discouraged because it can lead to + memory issues. + - ``compute_engine`` - Performant C++ implementation of the Deep Lake + Compute Engine that runs on the client and can be used for any data + stored in or connected to Deep Lake. It cannot be used with + in-memory or local datasets. + - ``tensor_db`` - Performant and fully-hosted Managed Tensor Database + that is responsible for storage and query execution. Only available + for data stored in the Deep Lake Managed Database. Store datasets + in this database by specifying runtime = {"tensor_db": True} during dataset creation. 
runtime (Dict, optional): Parameters for creating the Vector Store in Deep Lake's Managed Tensor Database. Not applicable when loading an