docs: Improved deeplake.py init documentation (#17549)

**Description:** Updated documentation for DeepLake init method. Especially the exec_option docs needed improvement, but did a general cleanup while I was looking at it. **Issue:** n/a **Dependencies:** None --------- Co-authored-by: Nathan Voxland <nathan@voxland.net>
8 months ago · 9ece134d45
parent 29ee0496b6
commit 9ece134d45
1 changed files with 31 additions and 16 deletions
--- a/libs/community/langchain_community/vectorstores/deeplake.py
+++ b/libs/community/langchain_community/vectorstores/deeplake.py
@@ -60,7 +60,7 @@ class DeepLake(VectorStore):
        embedding: Optional[Embeddings] = None,
        embedding_function: Optional[Embeddings] = None,
        read_only: bool = False,
-        ingestion_batch_size: int = 1000,
+        ingestion_batch_size: int = 1024,
        num_workers: int = 0,
        verbose: bool = True,
        exec_option: Optional[str] = None,
@@ -85,8 +85,19 @@ class DeepLake(VectorStore):
            ... )

        Args:
-            dataset_path (str): Path to existing dataset or where to create
-                a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
+            dataset_path (str): The full path where the Deep Lake
+                Vector Store is stored. It can be:
+                - a Deep Lake cloud path of the form ``hub://org_id/dataset_name``.
+                    Requires registration with Deep Lake.
+                - an s3 path of the form ``s3://bucketname/path/to/dataset``.
+                    Credentials are required in either the environment or passed to
+                    the creds argument.
+                - a local file system path of the form ``./path/to/dataset``
+                    or ``~/path/to/dataset`` or ``path/to/dataset``.
+                - a memory path of the form ``mem://path/to/dataset`` which doesn't
+                    save the dataset but keeps it in memory instead.
+                    Should be used only for testing as it does not persist.
+                Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH.
            token (str, optional):  Activeloop token, for fetching credentials
                to the dataset at path if it is a Deep Lake dataset.
                Tokens are normally autogenerated. Optional.
@@ -98,25 +109,29 @@ class DeepLake(VectorStore):
            read_only (bool): Open dataset in read-only mode. Default is False.
            ingestion_batch_size (int): During data ingestion, data is divided
                into batches. Batch size is the size of each batch.
-                Default is 1000.
+                Default is 1024.
            num_workers (int): Number of workers to use during data ingestion.
                Default is 0.
            verbose (bool): Print dataset summary after each operation.
                Default is True.
-            exec_option (str, optional): DeepLakeVectorStore supports 3 ways to perform
-                searching - "python", "compute_engine", "tensor_db" and auto.
-                Default is None.
+            exec_option (str, optional): Default method for search execution.
+                It could be either ``"auto"``, ``"python"``, ``"compute_engine"``
+                or ``"tensor_db"``. Defaults to None, which is treated as
+                ``"auto"``.
                - ``auto`` - Selects the best execution method based on the storage
                    location of the Vector Store. It is the default option.
-                - ``python`` - Pure-python implementation that runs on the client.
-                    WARNING: using this with big datasets can lead to memory
-                    issues. Data can be stored anywhere.
-                - ``compute_engine`` - C++ implementation of the Deep Lake Compute
-                    Engine that runs on the client. Can be used for any data stored in
-                    or connected to Deep Lake. Not for in-memory or local datasets.
-                - ``tensor_db`` - Hosted Managed Tensor Database that is
-                    responsible for storage and query execution. Only for data stored in
-                    the Deep Lake Managed Database. Use runtime = {"db_engine": True}
+                - ``python`` - Pure-python implementation that runs on the client and
+                    can be used for data stored anywhere. WARNING: using this option
+                    with big datasets is discouraged because it can lead to
+                    memory issues.
+                - ``compute_engine`` - Performant C++ implementation of the Deep Lake
+                    Compute Engine that runs on the client and can be used for any data
+                    stored in or connected to Deep Lake. It cannot be used with
+                    in-memory or local datasets.
+                - ``tensor_db`` - Performant and fully-hosted Managed Tensor Database
+                    that is responsible for storage and query execution. Only available
+                    for data stored in the Deep Lake Managed Database. Store datasets
+                    in this database by specifying runtime = {"tensor_db": True}
                    during dataset creation.
            runtime (Dict, optional): Parameters for creating the Vector Store in
                Deep Lake's Managed Tensor Database. Not applicable when loading an