diff --git a/docs/docs/integrations/vectorstores/astradb.ipynb b/docs/docs/integrations/vectorstores/astradb.ipynb index 1777788e3a..fc3ebac563 100644 --- a/docs/docs/integrations/vectorstores/astradb.ipynb +++ b/docs/docs/integrations/vectorstores/astradb.ipynb @@ -44,7 +44,7 @@ "metadata": {}, "source": [ "_Note: depending on your LangChain setup, you may need to install/upgrade other dependencies needed for this demo_\n", - "_(specifically, recent versions of `datasets` `openai` `pypdf` and `tiktoken` are required)._" + "_(specifically, recent versions of `datasets`, `openai`, `pypdf` and `tiktoken` are required)._" ] }, { @@ -65,7 +65,6 @@ "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.prompts import ChatPromptTemplate\n", "\n", - "# if not present yet, run: pip install \"datasets==2.14.6\"\n", "from langchain.schema import Document\n", "from langchain.schema.output_parser import StrOutputParser\n", "from langchain.schema.runnable import RunnablePassthrough\n", @@ -145,7 +144,7 @@ "outputs": [], "source": [ "ASTRA_DB_API_ENDPOINT = input(\"ASTRA_DB_API_ENDPOINT = \")\n", - "ASTRA_DB_TOKEN = getpass(\"ASTRA_DB_TOKEN = \")" + "ASTRA_DB_APPLICATION_TOKEN = getpass(\"ASTRA_DB_APPLICATION_TOKEN = \")" ] }, { @@ -159,7 +158,7 @@ " embedding=embe,\n", " collection_name=\"astra_vector_demo\",\n", " api_endpoint=ASTRA_DB_API_ENDPOINT,\n", - " token=ASTRA_DB_TOKEN,\n", + " token=ASTRA_DB_APPLICATION_TOKEN,\n", ")" ] }, @@ -171,6 +170,14 @@ "### Load a dataset" ] }, + { + "cell_type": "markdown", + "id": "552e56b0-301a-4b06-99c7-57ba6faa966f", + "metadata": {}, + "source": [ + "Convert each entry in the source dataset into a `Document`, then write them into the vector store:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -190,6 +197,16 @@ "print(f\"\\nInserted {len(inserted_ids)} documents.\")" ] }, + { + "cell_type": "markdown", + "id": "79d4f436-ef04-4288-8f79-97c9abb983ed", + "metadata": {}, + "source": [ + "In the above, 
`metadata` dictionaries are created from the source data and are part of the `Document`.\n", + "\n", + "_Note: check the [Astra DB API Docs](https://docs.datastax.com/en/astra-serverless/docs/develop/dev-with-json.html#_json_api_limits) for the valid metadata field names: some characters are reserved and cannot be used._" + ] + }, { "cell_type": "markdown", "id": "084d8802-ab39-4262-9a87-42eafb746f92", @@ -213,6 +230,16 @@ "print(f\"\\nInserted {len(inserted_ids_2)} documents.\")" ] }, + { + "cell_type": "markdown", + "id": "63840eb3-8b29-4017-bc2f-301bf5001f28", + "metadata": {}, + "source": [ + "_Note: you may want to speed up the execution of `add_texts` and `add_documents` by increasing the concurrency level for_\n", + "_these bulk operations - check out the `*_concurrency` parameters in the class constructor and the `add_texts` docstrings_\n", + "_for more details. Depending on the network and the client machine specifications, your best-performing choice of parameters may vary._" + ] + }, { "cell_type": "markdown", "id": "c031760a-1fc5-4855-adf2-02ed52fe2181", @@ -625,7 +652,7 @@ "outputs": [], "source": [ "ASTRA_DB_ID = input(\"ASTRA_DB_ID = \")\n", - "ASTRA_DB_TOKEN = getpass(\"ASTRA_DB_TOKEN = \")\n", + "ASTRA_DB_APPLICATION_TOKEN = getpass(\"ASTRA_DB_APPLICATION_TOKEN = \")\n", "\n", "desired_keyspace = input(\"ASTRA_DB_KEYSPACE (optional, can be left empty) = \")\n", "if desired_keyspace:\n", @@ -645,7 +672,7 @@ "\n", "cassio.init(\n", " database_id=ASTRA_DB_ID,\n", - " token=ASTRA_DB_TOKEN,\n", + " token=ASTRA_DB_APPLICATION_TOKEN,\n", " keyspace=ASTRA_DB_KEYSPACE,\n", ")" ] diff --git a/libs/langchain/langchain/vectorstores/astradb.py b/libs/langchain/langchain/vectorstores/astradb.py index ef36065ed0..ed972a479c 100644 --- a/libs/langchain/langchain/vectorstores/astradb.py +++ b/libs/langchain/langchain/vectorstores/astradb.py @@ -34,7 +34,7 @@ DocDict = Dict[str, Any] # dicts expressing entries to insert # (20 is the max batch size for the HTTP API 
at the time of writing) DEFAULT_BATCH_SIZE = 20 # Number of threads to insert batches concurrently: -DEFAULT_BULK_INSERT_BATCH_CONCURRENCY = 5 +DEFAULT_BULK_INSERT_BATCH_CONCURRENCY = 16 # Number of threads in a batch to insert pre-existing entries: DEFAULT_BULK_INSERT_OVERWRITE_CONCURRENCY = 10 # Number of threads (for deleting multiple rows concurrently): @@ -139,6 +139,20 @@ class AstraDB(VectorStore): threads in a batch to insert pre-existing entries. bulk_delete_concurrency (Optional[int]): Number of threads (for deleting multiple rows concurrently). + + A note on concurrency: as a rule of thumb, on a typical client machine + it is suggested to keep the quantity + bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency + well below 1000 to avoid exhausting the client multithreading/networking + resources. The hardcoded defaults are somewhat conservative to meet + most machines' specs, but a sensible choice to test may be: + bulk_insert_batch_concurrency = 80 + bulk_insert_overwrite_concurrency = 10 + A bit of experimentation is required to nail the best results here, + depending on both the machine/network specs and the expected workload + (specifically, how often a write is an update of an existing id). + Remember you can pass concurrency settings to individual calls to + add_texts and add_documents as well. """ # Conflicting-arg checks: @@ -330,6 +344,12 @@ class AstraDB(VectorStore): pre-existing documents in each batch (which require individual API calls). Defaults to instance-level setting if not provided. + A note on metadata: there are constraints on the allowed field names + in this dictionary, coming from the underlying Astra DB API. + For instance, the `$` (dollar sign) cannot be used in the dict keys. + See this document for details: + https://docs.datastax.com/en/astra-serverless/docs/develop/dev-with-json.html + Returns: List[str]: List of ids of the added texts. """