From f78ae1d932edbf9bea1d65c369ca362691d0f2c7 Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Tue, 4 Jun 2024 00:13:57 +0200 Subject: [PATCH] docs: Astra DB vectorstore, add automatic-embedding example (#22350) Description: Adding an example showcasing the newly-introduced API-side embedding computation option for the Astra DB vector store --- .../integrations/vectorstores/astradb.ipynb | 131 +++++++++++++----- 1 file changed, 98 insertions(+), 33 deletions(-) diff --git a/docs/docs/integrations/vectorstores/astradb.ipynb b/docs/docs/integrations/vectorstores/astradb.ipynb index b185df23f1..a4f0c7c1bc 100644 --- a/docs/docs/integrations/vectorstores/astradb.ipynb +++ b/docs/docs/integrations/vectorstores/astradb.ipynb @@ -23,8 +23,6 @@ "id": "d2d6ca14-fb7e-4172-9aa0-a3119a064b96", "metadata": {}, "source": [ - "You'll need to install `langchain-community` with `pip install -qU langchain-community` to use this integration\n", - "\n", "_Note: in addition to access to the database, an OpenAI API Key is required to run the full example._" ] }, @@ -51,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "pip install --upgrade langchain-astradb" + "pip install -qU langchain-astradb" ] }, { @@ -59,7 +57,7 @@ "id": "2453d83a-bc8f-41e1-a692-befe4dd90156", "metadata": {}, "source": [ - "_**Note.** the following are all packages required to run the full demo on this page. Depending on your LangChain setup, some of them may need to be installed:_" + "_Make sure you have installed the packages required to run all of this demo:_" ] }, { @@ -69,7 +67,7 @@ "metadata": {}, "outputs": [], "source": [ - "pip install langchain langchain-openai datasets pypdf" + "pip install -qU langchain langchain-community langchain-openai datasets pypdf" ] }, { @@ -90,9 +88,8 @@ "import os\n", "from getpass import getpass\n", "\n", - "from datasets import (\n", - " load_dataset,\n", - ")\n", + "from astrapy.info import CollectionVectorServiceOptions\n", + "from datasets import load_dataset\n", "from langchain_community.document_loaders import PyPDFLoader\n", "from langchain_core.documents import Document\n", "from langchain_core.output_parsers import StrOutputParser\n", @@ -102,26 +99,6 @@ "from langchain_text_splitters import RecursiveCharacterTextSplitter" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "1983f1da-0ae7-4a9b-bf4c-4ade328f7a3a", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OPENAI_API_KEY = \")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c656df06-e938-4bc5-b570-440b8b7a0189", - "metadata": {}, - "outputs": [], - "source": [ - "embe = OpenAIEmbeddings()" - ] - }, { "cell_type": "markdown", "id": "22866f09-e10d-4f05-a24b-b9420129462e", @@ -145,7 +122,7 @@ "id": "68f61b01-3e09-47c1-9d67-5d6915c86626", "metadata": {}, "source": [ - "## Connection parameters\n", + "## DB Connection parameters\n", "\n", "These are found on your Astra DB dashboard:\n", "\n", @@ -173,7 +150,53 @@ }, { "cell_type": "markdown", - "id": "196268bd-a950-41c3-bede-f5b55f6a0804", + "id": "84a1fe85-a42c-4f15-92e1-f79f1dd43ea2", + "metadata": {}, + "source": [ + "## Create the vector store\n", + "\n", + "There are two ways to create an Astra DB vector store, which differ in how the embeddings are computed.\n", + "\n", + "*Explicit embeddings*. You can separately instantiate a `langchain_core.embeddings.Embeddings` class and pass it to the `AstraDBVectorStore` constructor, just like with most other LangChain vector stores.\n", + "\n", + "*Integrated embedding computation*. Alternatively, you can use the [Vectorize](https://www.datastax.com/blog/simplifying-vector-embedding-generation-with-astra-vectorize) feature of Astra DB and simply specify the name of a supported embedding model when creating the store. The embedding computations are entirely handled within the database. (To proceed with this method, you must have enabled the desired embedding integration for your database, as described [in the docs](https://docs.datastax.com/en/astra-db-serverless/databases/embedding-generation.html).)\n", + "\n", + "**Please choose one method and run the corresponding cells only.**" + ] + }, + { + "cell_type": "markdown", + "id": "8c435386-e8d5-41f4-a9e5-7b609ef781f9", + "metadata": {}, + "source": [ + "### Method 1: provide embeddings explicitly\n", + "\n", + "This demo will use an OpenAI embedding model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfa5c005-9738-4c53-b8a8-8540fcbb8bad", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OPENAI_API_KEY = \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3accae6f-73e2-483a-83f7-76eb33558a1f", + "metadata": {}, + "outputs": [], + "source": [ + "my_embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "markdown", + "id": "465b1b16-5363-4c4f-9917-a49e02a86c14", "metadata": {}, "source": [ "Now you can create the vector store:" @@ -187,7 +210,7 @@ "outputs": [], "source": [ "vstore = AstraDBVectorStore(\n", - " embedding=embe,\n", + " embedding=my_embeddings,\n", " collection_name=\"astra_vector_demo\",\n", " api_endpoint=ASTRA_DB_API_ENDPOINT,\n", " token=ASTRA_DB_APPLICATION_TOKEN,\n", @@ -195,6 +218,46 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "5d5d2bfa-c071-4a5b-8b6e-3daa1b6de164", + "metadata": {}, + "source": [ + "### Method 2: use Astra Vectorize (embeddings integrated in Astra DB)\n", + "\n", + "Here it is assumed that you have\n", + "\n", + "- enabled the OpenAI integration in your Astra DB organization,\n", + "- added an API Key named `\"MY_OPENAI_API_KEY\"` to the integration, and\n", + "- scoped it to the database you are using.\n", + "\n", + "For more details please consult the [documentation](https://docs.datastax.com/en/astra-db-serverless/integrations/embedding-providers/openai.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d18455d-3fa6-4f9e-b687-3a2bc71c9a23", + "metadata": {}, + "outputs": [], + "source": [ + "openai_vectorize_options = CollectionVectorServiceOptions(\n", + " provider=\"openai\",\n", + " model_name=\"text-embedding-3-small\",\n", + " authentication={\n", + " \"providerKey\": \"MY_OPENAI_API_KEY.providerKey\",\n", + " },\n", + ")\n", + "\n", + "vstore = AstraDBVectorStore(\n", + " collection_name=\"astra_vectorize_demo\",\n", + " api_endpoint=ASTRA_DB_API_ENDPOINT,\n", + " token=ASTRA_DB_APPLICATION_TOKEN,\n", + " namespace=ASTRA_DB_KEYSPACE,\n", + " collection_vector_service_options=openai_vectorize_options,\n", + ")" + ] + }, { "cell_type": "markdown", "id": "9a348678-b2f6-46ca-9a0d-2eb4cc6b66b1", @@ -334,7 +397,9 @@ "id": "b14ea558-bfbe-41ce-807e-d70670060ada", "metadata": {}, "source": [ - "### MMR (Maximal-marginal-relevance) search" + "### MMR (Maximal-marginal-relevance) search\n", + "\n", + "_Note: the MMR search method is not (yet) supported for vector stores built with Astra Vectorize._" ] }, { @@ -537,7 +602,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.11.2" } }, "nbformat": 4,