Pushing updated Vector DB introduction notebook with PR changes

pull/81/head
Colin Jarvis 1 year ago
parent 73802cb7e1
commit f2567b62a5

@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 98,
"id": "5be94df6",
"metadata": {},
"outputs": [],
@ -60,7 +60,13 @@
"import weaviate\n",
"\n",
"# I've set this to our new embeddings model, this can be changed to the embedding model of your choice\n",
"MODEL = \"text-embedding-ada-002\""
"MODEL = \"text-embedding-ada-002\"\n",
"\n",
"# Ignore unclosed SSL socket warnings - optional in case you get these errors\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(action=\"ignore\", message=\"unclosed\", category=ResourceWarning)\n",
"warnings.filterwarnings(\"ignore\", category=DeprecationWarning) "
]
},
{
@ -77,13 +83,11 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 116,
"id": "bd99e08e",
"metadata": {},
"outputs": [],
"source": [
"@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
"\n",
"# Simple function to take in a list of text objects and return them as a list of embeddings\n",
"def get_embeddings(input: List):\n",
" response = openai.Embedding.create(\n",
@ -92,17 +96,19 @@
" )[\"data\"]\n",
" return [data[\"embedding\"] for data in response]\n",
"\n",
"def batchify(iterable, n=1):\n",
" l = len(iterable)\n",
" for ndx in range(0, l, n):\n",
" yield iterable[ndx : min(ndx + n, l)]\n",
"\n",
"# Function for batching and parallel processing the embeddings\n",
"@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
"def embed_corpus(\n",
" corpus: List[str],\n",
" batch_size=64,\n",
" num_workers=8,\n",
" max_context_len=8191,\n",
"):\n",
" def batchify(iterable, n=1):\n",
" l = len(iterable)\n",
" for ndx in range(0, l, n):\n",
" yield iterable[ndx : min(ndx + n, l)]\n",
"\n",
" # Encode the corpus, truncating to max_context_len\n",
" encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
@ -119,50 +125,37 @@
"\n",
" # Embed the corpus\n",
" with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:\n",
" futures = [\n",
" executor.submit(get_embeddings, text_batch)\n",
" for text_batch in batchify(encoded_corpus, batch_size)\n",
" ]\n",
" \n",
" try:\n",
" futures = [\n",
" executor.submit(get_embeddings, text_batch)\n",
" for text_batch in batchify(encoded_corpus, batch_size)\n",
" ]\n",
"\n",
" with tqdm(total=len(encoded_corpus)) as pbar:\n",
" for _ in concurrent.futures.as_completed(futures):\n",
" pbar.update(batch_size)\n",
" with tqdm(total=len(encoded_corpus)) as pbar:\n",
" for _ in concurrent.futures.as_completed(futures):\n",
" pbar.update(batch_size)\n",
"\n",
" embeddings = []\n",
" for future in futures:\n",
" data = future.result()\n",
" embeddings.extend(data)\n",
" return embeddings"
" embeddings = []\n",
" for future in futures:\n",
" data = future.result()\n",
" embeddings.extend(data)\n",
" \n",
" return embeddings\n",
" \n",
" except Exception as e:\n",
" print('Get embeddings failed, returning exception')\n",
" \n",
" return e\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "0c1c73cb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset wikipedia (/Users/colin.jarvis/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "edbff2615b964463be20d0a2ac33e4ab",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"# We'll use the datasets library to pull the Simple Wikipedia dataset for embedding\n",
"dataset = list(load_dataset(\"wikipedia\", \"20220301.simple\")[\"train\"])\n",
@ -172,7 +165,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 118,
"id": "e6ee90ce",
"metadata": {},
"outputs": [
@ -187,7 +180,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"50048it [03:05, 269.52it/s] \n"
"50048it [02:30, 332.26it/s] \n"
]
},
{
@ -201,15 +194,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"50048it [00:52, 957.36it/s] "
"50048it [00:53, 942.94it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 42.3 s, sys: 8.47 s, total: 50.8 s\n",
"Wall time: 4min 5s\n"
"CPU times: user 48.7 s, sys: 1min 19s, total: 2min 7s\n",
"Wall time: 5min 53s\n"
]
},
{
@ -230,7 +223,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 119,
"id": "1410daaa",
"metadata": {},
"outputs": [
@ -349,7 +342,7 @@
"4 [0.021524671465158463, 0.018522677943110466, -... 4 "
]
},
"execution_count": 13,
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
@ -383,7 +376,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 93,
"id": "92e6152a",
"metadata": {},
"outputs": [],
@ -404,28 +397,29 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 94,
"id": "0a71c575",
"metadata": {},
"outputs": [],
"source": [
"# Models a simple batch generator that make chunks out of an input DataFrame\n",
"class BatchGenerator:\n",
" \"\"\" Models a simple batch generator that make chunks out of an input DataFrame. \"\"\"\n",
" \n",
" \n",
" def __init__(self, batch_size: int = 10) -> None:\n",
" self.batch_size = batch_size\n",
" \n",
" # Makes chunks out of an input DataFrame\n",
" def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:\n",
" \"\"\" Makes chunks out of an input DataFrame. \"\"\"\n",
" splits = self.splits_num(df.shape[0])\n",
" if splits <= 1:\n",
" yield df\n",
" else:\n",
" for chunk in np.array_split(df, splits):\n",
" yield chunk\n",
" \n",
"\n",
" # Determines how many chunks DataFrame contains\n",
" def splits_num(self, elements: int) -> int:\n",
" \"\"\" Determines how many chunks DataFrame contians. \"\"\"\n",
" return round(elements / self.batch_size)\n",
" \n",
" __call__ = to_batches\n",
@ -435,112 +429,40 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 99,
"id": "7ea9ad46",
"metadata": {},
"outputs": [],
"source": [
"# Pick a name for the new index\n",
"index_name = 'wikipedia-articles'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3ff8eca1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/colin.jarvis/Documents/dev/vectordb_cookbook/vectordb/lib/python3.10/site-packages/pinecone/core/client/rest.py:45: DeprecationWarning: HTTPResponse.getheader() is deprecated and will be removed in urllib3 v2.1.0. Instead use HTTResponse.headers.get(name, default).\n",
" return self.urllib3_response.getheader(name, default)\n",
"/var/folders/bs/rjtxlzk512103d0h0b1t18b40000gp/T/ipykernel_13361/2813989476.py:2: ResourceWarning: unclosed <ssl.SSLSocket fd=98, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55526), raddr=('34.127.5.128', 443)>\n",
" if index_name in pinecone.list_indexes():\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/var/folders/bs/rjtxlzk512103d0h0b1t18b40000gp/T/ipykernel_13361/2813989476.py:3: ResourceWarning: unclosed <ssl.SSLSocket fd=98, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55527), raddr=('34.127.5.128', 443)>\n",
" pinecone.delete_index(index_name)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n"
]
}
],
"source": [
"# Check whether the index with the same name already exists\n",
"if index_name in pinecone.list_indexes():\n",
" pinecone.delete_index(index_name)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "35cb853d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/colin.jarvis/Documents/dev/vectordb_cookbook/vectordb/lib/python3.10/site-packages/pinecone/manage.py:133: ResourceWarning: unclosed <ssl.SSLSocket fd=99, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55531), raddr=('34.127.5.128', 443)>\n",
" status = _get_status(name)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/Users/colin.jarvis/Documents/dev/vectordb_cookbook/vectordb/lib/python3.10/site-packages/pinecone/manage.py:133: ResourceWarning: unclosed <ssl.SSLSocket fd=99, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55533), raddr=('34.127.5.128', 443)>\n",
" status = _get_status(name)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/Users/colin.jarvis/Documents/dev/vectordb_cookbook/vectordb/lib/python3.10/site-packages/pinecone/manage.py:133: ResourceWarning: unclosed <ssl.SSLSocket fd=99, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55534), raddr=('34.127.5.128', 443)>\n",
" status = _get_status(name)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/Users/colin.jarvis/Documents/dev/vectordb_cookbook/vectordb/lib/python3.10/site-packages/pinecone/manage.py:133: ResourceWarning: unclosed <ssl.SSLSocket fd=99, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55535), raddr=('34.127.5.128', 443)>\n",
" status = _get_status(name)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/Users/colin.jarvis/Documents/dev/vectordb_cookbook/vectordb/lib/python3.10/site-packages/pinecone/manage.py:133: ResourceWarning: unclosed <ssl.SSLSocket fd=99, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55537), raddr=('34.127.5.128', 443)>\n",
" status = _get_status(name)\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n",
"/var/folders/bs/rjtxlzk512103d0h0b1t18b40000gp/T/ipykernel_13361/3257515604.py:1: ResourceWarning: unclosed <ssl.SSLSocket fd=98, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55530), raddr=('34.127.5.128', 443)>\n",
" pinecone.create_index(name=index_name, dimension=len(article_df['content_vector'][0]))\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n"
]
}
],
"source": [
"pinecone.create_index(name=index_name, dimension=len(article_df['content_vector'][0]))\n",
"index = pinecone.Index(index_name=index_name)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "1328ddaf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/bs/rjtxlzk512103d0h0b1t18b40000gp/T/ipykernel_13361/2524688261.py:1: ResourceWarning: unclosed <ssl.SSLSocket fd=98, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('192.168.1.74', 55539), raddr=('34.127.5.128', 443)>\n",
" pinecone.list_indexes()\n",
"ResourceWarning: Enable tracemalloc to get the object allocation traceback\n"
]
},
{
"data": {
"text/plain": [
"['wikipedia-articles']"
]
},
"execution_count": 17,
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pick a name for the new index\n",
"index_name = 'wikipedia-articles'\n",
"\n",
"# Check whether the index with the same name already exists - if so, delete it\n",
"if index_name in pinecone.list_indexes():\n",
" pinecone.delete_index(index_name)\n",
" \n",
"# Creates new index\n",
"pinecone.create_index(name=index_name, dimension=len(article_df['content_vector'][0]))\n",
"index = pinecone.Index(index_name=index_name)\n",
"\n",
"# Confirm our index was created\n",
"pinecone.list_indexes()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 100,
"id": "5daeba00",
"metadata": {},
"outputs": [
@ -561,7 +483,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 101,
"id": "5fc1b083",
"metadata": {},
"outputs": [
@ -582,7 +504,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 102,
"id": "f90c7fba",
"metadata": {},
"outputs": [
@ -596,7 +518,7 @@
" 'total_vector_count': 100000}"
]
},
"execution_count": 20,
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
@ -618,7 +540,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 103,
"id": "d701b3c7",
"metadata": {},
"outputs": [],
@ -630,7 +552,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 104,
"id": "3c8c2aa1",
"metadata": {},
"outputs": [],
@ -676,7 +598,7 @@
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 105,
"id": "67b3584d",
"metadata": {},
"outputs": [
@ -687,32 +609,23 @@
"\n",
"Most similar results querying modern art in Europe in \"title\" namespace:\n",
"\n",
"Result 1 with a score of 0.891034067 is Early modern Europe\n",
"Result 2 with a score of 0.87504226 is Museum of Modern Art\n",
"Result 3 with a score of 0.867497 is Western Europe\n",
"Result 4 with a score of 0.864146471 is Renaissance art\n",
"Result 5 with a score of 0.860363305 is Pop art\n",
"Result 1 with a score of 0.890994787 is Early modern Europe\n",
"Result 2 with a score of 0.875286043 is Museum of Modern Art\n",
"Result 3 with a score of 0.867404044 is Western Europe\n",
"Result 4 with a score of 0.864250064 is Renaissance art\n",
"Result 5 with a score of 0.860506058 is Pop art\n",
"\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/colin.jarvis/Documents/dev/vectordb_cookbook/vectordb/lib/python3.10/site-packages/pinecone/core/client/rest.py:45: DeprecationWarning: HTTPResponse.getheader() is deprecated and will be removed in urllib3 v2.1.0. Instead use HTTResponse.headers.get(name, default).\n",
" return self.urllib3_response.getheader(name, default)\n"
]
}
],
"source": [
"query_output = query_article('modern art in Europe','title')\n",
"#query_output"
"query_output = query_article('modern art in Europe','title')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 106,
"id": "3e7ac79b",
"metadata": {},
"outputs": [
@ -734,8 +647,7 @@
}
],
"source": [
"content_query_output = query_article(\"Famous battles in Scottish history\",'content')\n",
"#content_query_output"
"content_query_output = query_article(\"Famous battles in Scottish history\",'content')"
]
},
{
@ -771,7 +683,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 107,
"id": "b9ea472d",
"metadata": {},
"outputs": [],
@ -781,7 +693,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 108,
"id": "13be220d",
"metadata": {},
"outputs": [
@ -791,7 +703,7 @@
"{'classes': []}"
]
},
"execution_count": 30,
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
@ -803,7 +715,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 109,
"id": "73d33184",
"metadata": {},
"outputs": [
@ -813,7 +725,7 @@
"True"
]
},
"execution_count": 31,
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
@ -838,42 +750,9 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 110,
"id": "e868d143",
"metadata": {},
"outputs": [],
"source": [
"class_obj = {\n",
" \"class\": \"Article\",\n",
" \"vectorizer\": \"none\", # explicitly tell Weaviate not to vectorize anything, we are providing the vectors ourselves through our BERT model\n",
" \"properties\": [{\n",
" \"name\": \"title\",\n",
" \"description\": \"Title of the article\",\n",
" \"dataType\": [\"text\"]\n",
" },\n",
" {\n",
" \"name\": \"content\",\n",
" \"description\": \"Contents of the article\",\n",
" \"dataType\": [\"text\"]\n",
" }]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "d8d430d0",
"metadata": {},
"outputs": [],
"source": [
"client.schema.create_class(class_obj)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "856f20f9",
"metadata": {},
"outputs": [
{
"data": {
@ -913,77 +792,37 @@
" 'vectorizer': 'none'}]}"
]
},
"execution_count": 34,
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_obj = {\n",
" \"class\": \"Article\",\n",
" \"vectorizer\": \"none\", # explicitly tell Weaviate not to vectorize anything, we are providing the vectors ourselves through our BERT model\n",
" \"properties\": [{\n",
" \"name\": \"title\",\n",
" \"description\": \"Title of the article\",\n",
" \"dataType\": [\"text\"]\n",
" },\n",
" {\n",
" \"name\": \"content\",\n",
" \"description\": \"Contents of the article\",\n",
" \"dataType\": [\"text\"]\n",
" }]\n",
"}\n",
"\n",
"# Create the schema in Weaviate\n",
"client.schema.create_class(class_obj)\n",
"\n",
"# Check that we've created it as intended\n",
"client.schema.get()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "e6f48f6f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<weaviate.batch.crud_batch.Batch at 0x16ad2fe20>"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.batch.configure(\n",
" # `batch_size` takes an `int` value to enable auto-batching\n",
" # (`None` is used for manual batching)\n",
" batch_size=100, \n",
" # dynamically update the `batch_size` based on import speed\n",
" dynamic=False,\n",
" # `timeout_retries` takes an `int` value to retry on time outs\n",
" timeout_retries=3,\n",
" # checks for batch-item creation errors\n",
" # this is the default in weaviate-client >= 3.6.0\n",
" callback=weaviate.util.check_batch_result,\n",
")\n",
"#result = client.batch.create_objects(batch)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "7c40c204",
"metadata": {},
"outputs": [],
"source": [
"# Make a list of tuples\n",
"data_objects = []\n",
"for k,v in article_df.iterrows():\n",
" data_objects.append((v['title'],v['text'],v['title_vector'],v['vector_id']))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "bb5eb2c1",
"metadata": {},
"outputs": [],
"source": [
"# Template function for setting up parallel upload process\n",
"def transcription_extractor(audio_filepath):\n",
" response = call_asr(openai.api_key,audio_filepath)\n",
" return(response)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 111,
"id": "786d437f",
"metadata": {},
"outputs": [
@ -991,11 +830,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Uploading vectors to article_schema..\n"
"Uploading vectors to article schema..\n"
]
}
],
"source": [
"# Convert DF into a list of tuples\n",
"data_objects = []\n",
"for k,v in article_df.iterrows():\n",
" data_objects.append((v['title'],v['text'],v['title_vector'],v['vector_id']))\n",
"\n",
"# Upsert into article schema\n",
"print(\"Uploading vectors to article schema..\")\n",
"uuids = []\n",
@ -1013,18 +857,18 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 112,
"id": "3658693c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'content': 'Sociedade Esportiva Palmeiras, usually called Palmeiras, is a Brazilian football team. They are from São Paulo, Brazil. The team was founded by an Italian-speaking community on August 26, 1914, as Palestra Itália. They changed to the name used now on September 14, 1942.\\n\\nThey play in green shirts, white shorts and green socks and are one of the most popular and traditional Brazilian clubs.\\n\\nPalmeiras plays at the Palestra Itália stadium, which has seats for 32,000. But in the past, local derbies against São Paulo or Corinthians were usually played in Morumbi stadium. However, the Arena Palestra Itália is under construction with capacity for 45,000 people, expected to be finalized in 2013.\\n\\nName \\n 19141942 S.S. Palestra Italia\\n 1942present S.E. Palmeiras\\n\\nMain titles \\n Copa Rio: 1951\\n Libertadores Cup: 1999 and 2020\\n Copa Mercosul: 1998\\n Campeonato Brasileiro: 1960, 1967, 1967, 1969, 1972, 1973, 1993, 1994, 2016 and 2018 greatest champion\\n Copa do Brasil: 1998, 2012, 2015 and 2020/21\\n Copa dos Campeões: 2000\\n Campeão do Século\\n Torneio Rio-SP: 1933, 1951, 1965, 1993 and 2000\\n Campeonato Paulista: 1920, 1926 (unbeaten), 1927, 1932 (unbeaten), 1933, 1934, 1936, 1940, 1942, 1944, 1947,1950, 1959 (super champions), 1963, 1966, 1972 (unbeaten), 1974, 1976, 1993, 1994, 1996, 2008 and 2020.\\n Campeonato Paulista Extra: 1926 (unbeaten) and 1938\\n\\nRelated pages\\n List of Brazilian football teams\\n\\nOther websites \\n Palmeiras official site \\n\\nFootball clubs in São Paulo (state)\\n1914 establishments in Brazil',\n",
" 'title': 'Sociedade Esportiva Palmeiras'}"
"{'content': 'Eddie Cantor (January 31, 1892 - October 10, 1964) was an American comedian, singer, actor, songwriter. Familiar to Broadway, radio and early television audiences, this \"Apostle of Pep\" was regarded almost as a family member by millions because his top-rated radio shows revealed intimate stories and amusing anecdotes about his wife Ida and five daughters. His eye-rolling song-and-dance routines eventually led to his nickname, Banjo Eyes, and in 1933, the artist Frederick J. Garner caricatured Cantor with large round and white eyes resembling the drum-like pot of a banjo. Cantor\\'s eyes became his trademark, often exaggerated in illustrations, and leading to his appearance on Broadway in the musical Banjo Eyes (1941). He was the original singer of 1929 hit song \"Makin\\' Whoopie\".\\n\\nReferences\\n\\nPresidents of the Screen Actors Guild\\nAmerican stage actors\\nComedians from New York City\\nAmerican Jews\\nActors from New York City\\nSingers from New York City\\nAmerican television actors\\nAmerican radio actors\\n1892 births\\n1964 deaths',\n",
" 'title': 'Eddie Cantor'}"
]
},
"execution_count": 48,
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
@ -1038,21 +882,21 @@
"id": "46050ca9",
"metadata": {},
"source": [
"### Search Data"
"### Search Data\n",
"\n",
"As above, we'll fire some queries at our new Index and get back results based on the closeness to our existing vectors"
]
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 113,
"id": "5acd5437",
"metadata": {},
"outputs": [],
"source": [
"def query_weaviate(query, schema, top_k=20):\n",
" '''Queries an article using its title in the specified\n",
" namespace and prints results.'''\n",
"\n",
" # Create vector embeddings based on the title column\n",
" # Creates embedding vector from user query\n",
" embedded_query = openai.Embedding.create(\n",
" input=query,\n",
" model=MODEL,\n",
@ -1060,19 +904,18 @@
" \n",
" near_vector = {\"vector\": embedded_query}\n",
"\n",
" # Query namespace passed as parameter using title vector\n",
" # Queries input schema with vectorised user query\n",
" query_result = client.query.get(schema,[\"title\",\"content\", \"_additional {certainty}\"]) \\\n",
" .with_near_vector(near_vector) \\\n",
" .with_limit(top_k) \\\n",
" .do()\n",
" \n",
" return query_result\n",
" # Print query results "
" return query_result"
]
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 114,
"id": "15def653",
"metadata": {},
"outputs": [
@ -1081,25 +924,25 @@
"output_type": "stream",
"text": [
"1. Title: Early modern Europe Certainty: 0.9454971551895142\n",
"2. Title: Museum of Modern Art Certainty: 0.9375567138195038\n",
"3. Title: Western Europe Certainty: 0.9336977899074554\n",
"4. Title: Renaissance art Certainty: 0.9321110248565674\n",
"5. Title: Pop art Certainty: 0.9302356243133545\n",
"6. Title: Art exhibition Certainty: 0.9281864166259766\n",
"7. Title: History of Europe Certainty: 0.9278470575809479\n",
"8. Title: Northern Europe Certainty: 0.9273118078708649\n",
"2. Title: Museum of Modern Art Certainty: 0.9376430511474609\n",
"3. Title: Western Europe Certainty: 0.9337018430233002\n",
"4. Title: Renaissance art Certainty: 0.932124525308609\n",
"5. Title: Pop art Certainty: 0.9302527010440826\n",
"6. Title: Art exhibition Certainty: 0.9282020926475525\n",
"7. Title: History of Europe Certainty: 0.927833616733551\n",
"8. Title: Northern Europe Certainty: 0.9273514151573181\n",
"9. Title: Concert of Europe Certainty: 0.9268475472927094\n",
"10. Title: Hellenistic art Certainty: 0.9264660775661469\n",
"11. Title: Piet Mondrian Certainty: 0.9235712587833405\n",
"10. Title: Hellenistic art Certainty: 0.9264959394931793\n",
"11. Title: Piet Mondrian Certainty: 0.9235787093639374\n",
"12. Title: Modernist literature Certainty: 0.9235587120056152\n",
"13. Title: European Capital of Culture Certainty: 0.9228664338588715\n",
"14. Title: Art film Certainty: 0.9217151403427124\n",
"15. Title: Europa Certainty: 0.9216068089008331\n",
"13. Title: European Capital of Culture Certainty: 0.9227772951126099\n",
"14. Title: Art film Certainty: 0.9217384457588196\n",
"15. Title: Europa Certainty: 0.9216940104961395\n",
"16. Title: Art rock Certainty: 0.9212885200977325\n",
"17. Title: Central Europe Certainty: 0.9212862849235535\n",
"18. Title: Art Certainty: 0.9208334386348724\n",
"19. Title: European Certainty: 0.92069211602211\n",
"20. Title: Byzantine art Certainty: 0.920437216758728\n"
"17. Title: Central Europe Certainty: 0.9212715923786163\n",
"18. Title: Art Certainty: 0.9207542240619659\n",
"19. Title: European Certainty: 0.9207191467285156\n",
"20. Title: Byzantine art Certainty: 0.9204496443271637\n"
]
}
],
@ -1113,7 +956,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 115,
"id": "93c4a696",
"metadata": {},
"outputs": [
@ -1121,24 +964,24 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1. Title: Historic Scotland Certainty: 0.9464837908744812\n",
"1. Title: Historic Scotland Certainty: 0.9465253949165344\n",
"2. Title: First War of Scottish Independence Certainty: 0.9461104869842529\n",
"3. Title: Battle of Bannockburn Certainty: 0.9455609619617462\n",
"3. Title: Battle of Bannockburn Certainty: 0.9455604553222656\n",
"4. Title: Wars of Scottish Independence Certainty: 0.944368839263916\n",
"5. Title: Second War of Scottish Independence Certainty: 0.9395008385181427\n",
"5. Title: Second War of Scottish Independence Certainty: 0.9394940435886383\n",
"6. Title: List of Scottish monarchs Certainty: 0.9366503059864044\n",
"7. Title: Kingdom of Scotland Certainty: 0.935274213552475\n",
"8. Title: Scottish Borders Certainty: 0.9317866265773773\n",
"7. Title: Kingdom of Scotland Certainty: 0.9353288412094116\n",
"8. Title: Scottish Borders Certainty: 0.9317235946655273\n",
"9. Title: List of rivers of Scotland Certainty: 0.9296278059482574\n",
"10. Title: Braveheart Certainty: 0.9294214248657227\n",
"11. Title: John of Scotland Certainty: 0.9292325675487518\n",
"12. Title: Duncan II of Scotland Certainty: 0.9291643798351288\n",
"13. Title: Bannockburn Certainty: 0.9291241466999054\n",
"14. Title: The Scotsman Certainty: 0.9280610680580139\n",
"13. Title: Bannockburn Certainty: 0.929103285074234\n",
"14. Title: The Scotsman Certainty: 0.9280981719493866\n",
"15. Title: Flag of Scotland Certainty: 0.9270428121089935\n",
"16. Title: Banff and Macduff Certainty: 0.9267247915267944\n",
"17. Title: Guardians of Scotland Certainty: 0.9260919094085693\n",
"18. Title: Scottish Parliament Certainty: 0.9252097904682159\n",
"17. Title: Guardians of Scotland Certainty: 0.9260668158531189\n",
"18. Title: Scottish Parliament Certainty: 0.9251855313777924\n",
"19. Title: Holyrood Abbey Certainty: 0.925055593252182\n",
"20. Title: Scottish Certainty: 0.9249534606933594\n"
]
@ -1157,9 +1000,7 @@
"id": "ad74202e",
"metadata": {},
"source": [
"Thanks for following along, you're now equipped to set up your own vector databases and use embeddings to do all kinds of cool things - enjoy! For more complex use cases please continue to work through the cookbook examples here:\n",
"\n",
"TODO: Make other cool things to link to"
"Thanks for following along, you're now equipped to set up your own vector databases and use embeddings to do all kinds of cool things - enjoy! For more complex use cases please continue to work through other cookbook examples in this repo"
]
}
],
