docs: Improved deeplake.py init documentation (#17549)

**Description:** Updated documentation for DeepLake init method. Especially the exec_option docs needed improvement, but did a general cleanup while I was looking at it.
**Issue:** n/a
**Dependencies:** None

---------

Co-authored-by: Nathan Voxland <nathan@voxland.net>
6 months ago · 9ece134d45
parent 29ee0496b6
commit 9ece134d45
1 changed files with 31 additions and 16 deletions
--- a/libs/community/langchain_community/vectorstores/deeplake.py
+++ b/libs/community/langchain_community/vectorstores/deeplake.py
@@ -60,7 +60,7 @@ class DeepLake(VectorStore):
        embedding: Optional[Embeddings] = None,
        embedding_function: Optional[Embeddings] = None,
        read_only: bool = False,
-        ingestion_batch_size: int = 1000,
+        ingestion_batch_size: int = 1024,
        num_workers: int = 0,
        verbose: bool = True,
        exec_option: Optional[str] = None,
@@ -85,8 +85,19 @@ class DeepLake(VectorStore):
            ... )
        Args:
-            dataset_path (str): Path to existing dataset or where to create
+            dataset_path (str): The full path for storing to the Deep Lake
-                a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
+                Vector Store. It can be:
                - a Deep Lake cloud path of the form ``hub://org_id/dataset_name``.
                    Requires registration with Deep Lake.
                - an s3 path of the form ``s3://bucketname/path/to/dataset``.
                    Credentials are required in either the environment or passed to
                    the creds argument.
                - a local file system path of the form ``./path/to/dataset``
                    or ``~/path/to/dataset`` or ``path/to/dataset``.
                - a memory path of the form ``mem://path/to/dataset`` which doesn't
                    save the dataset but keeps it in memory instead.
                    Should be used only for testing as it does not persist.
                    Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
            token (str, optional):  Activeloop token, for fetching credentials
                to the dataset at path if it is a Deep Lake dataset.
                Tokens are normally autogenerated. Optional.
@@ -98,25 +109,29 @@ class DeepLake(VectorStore):
            read_only (bool): Open dataset in read-only mode. Default is False.
            ingestion_batch_size (int): During data ingestion, data is divided
                into batches. Batch size is the size of each batch.
-                Default is 1000.
+                Default is 1024.
            num_workers (int): Number of workers to use during data ingestion.
                Default is 0.
            verbose (bool): Print dataset summary after each operation.
                Default is True.
-            exec_option (str, optional): DeepLakeVectorStore supports 3 ways to perform
+            exec_option (str, optional): Default method for search execution.
-                searching - "python", "compute_engine", "tensor_db" and auto.
+                It could be either ``"auto"``, ``"python"``, ``"compute_engine"``
-                Default is None.
+                or ``"tensor_db"``. Defaults to ``"auto"``.
                If None, it's set to "auto".
                - ``auto``- Selects the best execution method based on the storage
                    location of the Vector Store. It is the default option.
-                - ``python`` - Pure-python implementation that runs on the client.
+                - ``python`` - Pure-python implementation that runs on the client and
-                    WARNING: using this with big datasets can lead to memory
+                    can be used for data stored anywhere. WARNING: using this option
-                    issues. Data can be stored anywhere.
+                    with big datasets is discouraged because it can lead to
-                - ``compute_engine`` - C++ implementation of the Deep Lake Compute
+                    memory issues.
-                    Engine that runs on the client. Can be used for any data stored in
+                - ``compute_engine`` - Performant C++ implementation of the Deep Lake
-                    or connected to Deep Lake. Not for in-memory or local datasets.
+                    Compute Engine that runs on the client and can be used for any data
-                - ``tensor_db`` - Hosted Managed Tensor Database that is
+                    stored in or connected to Deep Lake. It cannot be used with
-                    responsible for storage and query execution. Only for data stored in
+                    in-memory or local datasets.
-                    the Deep Lake Managed Database. Use runtime = {"db_engine": True}
+                - ``tensor_db`` - Performant and fully-hosted Managed Tensor Database
                    that is responsible for storage and query execution. Only available
                    for data stored in the Deep Lake Managed Database. Store datasets
                    in this database by specifying runtime = {"tensor_db": True}
                    during dataset creation.
            runtime (Dict, optional): Parameters for creating the Vector Store in
                Deep Lake's Managed Tensor Database. Not applicable when loading an