Pushing update to remove data loading

2024-11-04 06:00:33 +00:00 · 2023-02-06 03:48:31 -08:00 · 2023-02-06 03:48:31 -08:00 · 3ad0e718cb
commit 3ad0e718cb
parent befe771b2c
1 changed files with 81 additions and 515 deletions
--- a/examples/vector_databases/Using_vector_databases_for_embeddings_search.ipynb
+++ b/examples/vector_databases/Using_vector_databases_for_embeddings_search.ipynb
@ -58,12 +58,15 @@
    "# We'll need to install the clients for all vector databases\n",
    "!pip install pinecone-client\n",
    "!pip install weaviate-client\n",
-    "!pip install qdrant-client"
+    "!pip install qdrant-client\n",
+    "\n",
+    "# Install wget to pull the zip file\n",
+    "!pip install wget"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "id": "5be94df6",
   "metadata": {},
   "outputs": [],
@ -71,14 +74,12 @@
    "import openai\n",
    "\n",
    "import tiktoken\n",
-    "from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
    "from typing import List, Iterator\n",
-    "import concurrent\n",
-    "from tqdm import tqdm\n",
    "import pandas as pd\n",
-    "from datasets import load_dataset\n",
    "import numpy as np\n",
    "import os\n",
+    "import wget\n",
+    "from ast import literal_eval\n",
    "\n",
    "# Pinecone's client library for Python\n",
    "import pinecone\n",
@ -106,287 +107,71 @@
   "source": [
    "## Load data\n",
    "\n",
-    "In this section we'll source the data for this task, embed it and format it for insertion into a vector database"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "bd99e08e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Simple function to take in a list of text objects and return them as a list of embeddings\n",
-    "def get_embeddings(input: List):\n",
-    "    response = openai.Embedding.create(\n",
-    "        input=input,\n",
-    "        model=EMBEDDING_MODEL,\n",
-    "    )[\"data\"]\n",
-    "    return [data[\"embedding\"] for data in response]\n",
-    "\n",
-    "def batchify(iterable, n=1):\n",
-    "    l = len(iterable)\n",
-    "    for ndx in range(0, l, n):\n",
-    "        yield iterable[ndx : min(ndx + n, l)]\n",
-    "\n",
-    "# Function for batching and parallel processing the embeddings\n",
-    "def embed_corpus(\n",
-    "    corpus: List[str],\n",
-    "    batch_size=64,\n",
-    "    num_workers=8,\n",
-    "    max_context_len=8191,\n",
-    "):\n",
-    "\n",
-    "    # Encode the corpus, truncating to max_context_len\n",
-    "    encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
-    "    encoded_corpus = [\n",
-    "        encoded_article[:max_context_len] for encoded_article in encoding.encode_batch(corpus)\n",
-    "    ]\n",
-    "\n",
-    "    # Calculate corpus statistics: the number of inputs, the total number of tokens, and the estimated cost to embed\n",
-    "    num_tokens = sum(len(article) for article in encoded_corpus)\n",
-    "    cost_to_embed_tokens = num_tokens / 1_000 * 0.0004\n",
-    "    print(\n",
-    "        f\"num_articles={len(encoded_corpus)}, num_tokens={num_tokens}, est_embedding_cost={cost_to_embed_tokens:.2f} USD\"\n",
-    "    )\n",
-    "\n",
-    "    # Embed the corpus\n",
-    "    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:\n",
-    "        \n",
-    "        futures = [\n",
-    "            executor.submit(get_embeddings, text_batch)\n",
-    "            for text_batch in batchify(encoded_corpus, batch_size)\n",
-    "        ]\n",
-    "\n",
-    "        with tqdm(total=len(encoded_corpus)) as pbar:\n",
-    "            for _ in concurrent.futures.as_completed(futures):\n",
-    "                pbar.update(batch_size)\n",
-    "\n",
-    "        embeddings = []\n",
-    "        for future in futures:\n",
-    "            data = future.result()\n",
-    "            embeddings.extend(data)\n",
-    "\n",
-    "        return embeddings"
+    "In this section we'll load embedded data that we've prepared prior to this session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "0c1c73cb",
+   "id": "5dff8b55",
   "metadata": {},
   "outputs": [],
   "source": [
-    "# We'll use the datasets library to pull the Simple Wikipedia dataset for embedding\n",
-    "dataset = list(load_dataset(\"wikipedia\", \"20220301.simple\")[\"train\"])\n",
-    "# Limited to 25k articles for demo purposes\n",
-    "dataset = dataset[:25_000]  "
+    "embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n",
+    "\n",
+    "# Warning, the file is pretty big so this will take some time\n",
+    "wget.download(embeddings_url)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
-   "id": "e6ee90ce",
+   "execution_count": null,
+   "id": "21097972",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "num_articles=25000, num_tokens=12896881, est_embedding_cost=5.16 USD\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "25024it [01:06, 377.31it/s]                                                                                                                                           "
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CPU times: user 16.3 s, sys: 2.24 s, total: 18.5 s\n",
-      "Wall time: 1min 8s\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "%%time\n",
-    "# Embed the article text\n",
-    "dataset_embeddings = embed_corpus([article[\"text\"] for article in dataset])"
+    "import zipfile\n",
+    "with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\"r\") as zip_ref:\n",
+    "    zip_ref.extractall(\"../data\")\n",
+    "    \n",
+    "article_df = pd.read_csv('../data/vector_database_wikipedia_articles_embedded.csv')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "id": "850c7215",
+   "execution_count": null,
+   "id": "1721e45d",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "num_articles=25000, num_tokens=88300, est_embedding_cost=0.04 USD\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "25024it [00:36, 683.22it/s]                                                                                                                                           \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "# Embed the article titles separately\n",
-    "title_embeddings = embed_corpus([article[\"title\"] for article in dataset])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 122,
-   "id": "1410daaa",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>url</th>\n",
-       "      <th>title</th>\n",
-       "      <th>text</th>\n",
-       "      <th>title_vector</th>\n",
-       "      <th>content_vector</th>\n",
-       "      <th>vector_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>https://simple.wikipedia.org/wiki/April</td>\n",
-       "      <td>April</td>\n",
-       "      <td>April is the fourth month of the year in the J...</td>\n",
-       "      <td>[0.001009464613161981, -0.020700545981526375, ...</td>\n",
-       "      <td>[-0.011253940872848034, -0.013491976074874401,...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>https://simple.wikipedia.org/wiki/August</td>\n",
-       "      <td>August</td>\n",
-       "      <td>August (Aug.) is the eighth month of the year ...</td>\n",
-       "      <td>[0.0009286514250561595, 0.000820168002974242, ...</td>\n",
-       "      <td>[0.0003609954728744924, 0.007262262050062418, ...</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>6</td>\n",
-       "      <td>https://simple.wikipedia.org/wiki/Art</td>\n",
-       "      <td>Art</td>\n",
-       "      <td>Art is a creative activity that expresses imag...</td>\n",
-       "      <td>[0.003393713850528002, 0.0061537534929811954, ...</td>\n",
-       "      <td>[-0.004959689453244209, 0.015772193670272827, ...</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>8</td>\n",
-       "      <td>https://simple.wikipedia.org/wiki/A</td>\n",
-       "      <td>A</td>\n",
-       "      <td>A or a is the first letter of the English alph...</td>\n",
-       "      <td>[0.0153952119871974, -0.013759135268628597, 0....</td>\n",
-       "      <td>[0.024894846603274345, -0.022186409682035446, ...</td>\n",
-       "      <td>3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>9</td>\n",
-       "      <td>https://simple.wikipedia.org/wiki/Air</td>\n",
-       "      <td>Air</td>\n",
-       "      <td>Air refers to the Earth's atmosphere. Air is a...</td>\n",
-       "      <td>[0.02224554680287838, -0.02044147066771984, -0...</td>\n",
-       "      <td>[0.021524671465158463, 0.018522677943110466, -...</td>\n",
-       "      <td>4</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  id                                       url   title  \\\n",
-       "0  1   https://simple.wikipedia.org/wiki/April   April   \n",
-       "1  2  https://simple.wikipedia.org/wiki/August  August   \n",
-       "2  6     https://simple.wikipedia.org/wiki/Art     Art   \n",
-       "3  8       https://simple.wikipedia.org/wiki/A       A   \n",
-       "4  9     https://simple.wikipedia.org/wiki/Air     Air   \n",
-       "\n",
-       "                                                text  \\\n",
-       "0  April is the fourth month of the year in the J...   \n",
-       "1  August (Aug.) is the eighth month of the year ...   \n",
-       "2  Art is a creative activity that expresses imag...   \n",
-       "3  A or a is the first letter of the English alph...   \n",
-       "4  Air refers to the Earth's atmosphere. Air is a...   \n",
-       "\n",
-       "                                        title_vector  \\\n",
-       "0  [0.001009464613161981, -0.020700545981526375, ...   \n",
-       "1  [0.0009286514250561595, 0.000820168002974242, ...   \n",
-       "2  [0.003393713850528002, 0.0061537534929811954, ...   \n",
-       "3  [0.0153952119871974, -0.013759135268628597, 0....   \n",
-       "4  [0.02224554680287838, -0.02044147066771984, -0...   \n",
-       "\n",
-       "                                      content_vector vector_id  \n",
-       "0  [-0.011253940872848034, -0.013491976074874401,...         0  \n",
-       "1  [0.0003609954728744924, 0.007262262050062418, ...         1  \n",
-       "2  [-0.004959689453244209, 0.015772193670272827, ...         2  \n",
-       "3  [0.024894846603274345, -0.022186409682035446, ...         3  \n",
-       "4  [0.021524671465158463, 0.018522677943110466, -...         4  "
-      ]
-     },
-     "execution_count": 122,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# We will then store the result in another dataframe, and prep the data for insertion into a vector DB\n",
-    "article_df = pd.DataFrame(dataset)\n",
-    "article_df['title_vector'] = title_embeddings\n",
-    "article_df['content_vector'] = dataset_embeddings\n",
-    "article_df['vector_id'] = article_df.index\n",
-    "article_df['vector_id'] = article_df['vector_id'].apply(str)\n",
    "article_df.head()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "960b82af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read both embedding columns from their CSV string form back into Python lists\n",
+    "article_df['title_vector'] = article_df.title_vector.apply(literal_eval)\n",
+    "article_df['content_vector'] = article_df.content_vector.apply(literal_eval)\n",
+    "\n",
+    "# Set vector_id to be a string\n",
+    "article_df['vector_id'] = article_df['vector_id'].apply(str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a334ab8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(article_df['title_vector'][0])"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "ed32fc87",
@ -406,7 +191,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
   "id": "92e6152a",
   "metadata": {},
   "outputs": [],
@ -429,7 +214,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": null,
   "id": "0a71c575",
   "metadata": {},
   "outputs": [],
@ -461,21 +246,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": null,
   "id": "7ea9ad46",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['wikipedia-articles']"
-      ]
-     },
-     "execution_count": 124,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Pick a name for the new index\n",
    "index_name = 'wikipedia-articles'\n",
@ -494,18 +268,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": null,
   "id": "5daeba00",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Uploading vectors to content namespace..\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Upsert content vectors in content namespace - this can take a few minutes\n",
    "print(\"Uploading vectors to content namespace..\")\n",
@ -515,18 +281,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": null,
   "id": "5fc1b083",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Uploading vectors to title namespace..\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Upsert title vectors in title namespace - this can also take a few minutes\n",
    "print(\"Uploading vectors to title namespace..\")\n",
@ -536,25 +294,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": null,
   "id": "f90c7fba",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'dimension': 1536,\n",
-       " 'index_fullness': 0.1,\n",
-       " 'namespaces': {'content': {'vector_count': 25000},\n",
-       "                'title': {'vector_count': 25000}},\n",
-       " 'total_vector_count': 50000}"
-      ]
-     },
-     "execution_count": 128,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Check index size for each namespace to confirm all of our docs have loaded\n",
    "index.describe_index_stats()"
@ -584,7 +327,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": null,
   "id": "3c8c2aa1",
   "metadata": {},
   "outputs": [],
@ -681,7 +424,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": null,
   "id": "b9ea472d",
   "metadata": {},
   "outputs": [],
@ -691,21 +434,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": null,
   "id": "13be220d",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'classes': []}"
-      ]
-     },
-     "execution_count": 114,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "client.schema.delete_all()\n",
    "client.schema.get()"
@ -713,21 +445,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": null,
   "id": "73d33184",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 115,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "client.is_ready()"
   ]
@ -748,53 +469,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": null,
   "id": "e868d143",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'classes': [{'class': 'Article',\n",
-       "   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},\n",
-       "    'cleanupIntervalSeconds': 60,\n",
-       "    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},\n",
-       "   'properties': [{'dataType': ['text'],\n",
-       "     'description': 'Title of the article',\n",
-       "     'name': 'title',\n",
-       "     'tokenization': 'word'},\n",
-       "    {'dataType': ['text'],\n",
-       "     'description': 'Contents of the article',\n",
-       "     'name': 'content',\n",
-       "     'tokenization': 'word'}],\n",
-       "   'shardingConfig': {'virtualPerPhysical': 128,\n",
-       "    'desiredCount': 1,\n",
-       "    'actualCount': 1,\n",
-       "    'desiredVirtualCount': 128,\n",
-       "    'actualVirtualCount': 128,\n",
-       "    'key': '_id',\n",
-       "    'strategy': 'hash',\n",
-       "    'function': 'murmur3'},\n",
-       "   'vectorIndexConfig': {'skip': False,\n",
-       "    'cleanupIntervalSeconds': 300,\n",
-       "    'maxConnections': 64,\n",
-       "    'efConstruction': 128,\n",
-       "    'ef': -1,\n",
-       "    'dynamicEfMin': 100,\n",
-       "    'dynamicEfMax': 500,\n",
-       "    'dynamicEfFactor': 8,\n",
-       "    'vectorCacheMaxObjects': 2000000,\n",
-       "    'flatSearchCutoff': 40000,\n",
-       "    'distance': 'cosine'},\n",
-       "   'vectorIndexType': 'hnsw',\n",
-       "   'vectorizer': 'none'}]}"
-      ]
-     },
-     "execution_count": 116,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "class_obj = {\n",
    "    \"class\": \"Article\",\n",
@ -820,18 +498,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": null,
   "id": "786d437f",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Uploading vectors to article schema..\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Convert DF into a list of tuples\n",
    "data_objects = []\n",
@ -861,49 +531,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": null,
   "id": "3658693c",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Kim Jong-nam\n",
-      "Kim Jong-nam (May 10, 1971 - February 13, 2017) was the eldest son of Kim Jong-il, the former leader of North Korea.\n",
-      "\n",
-      "He tried to enter Japan using a fake passport in May 2001.  This was to visit Disneyland.  This caused his father to not approve of him. Kim Jong-nam's younger half-brother Kim Jong-un was made the heir in September 2010.\n",
-      "\n",
-      "In June 2010, Kim Jong-nam gave a brief interview to the Associated Press in Macau. He told the reporter that he had \"no plans\" to defect to Europe. The press had recently said this. Kim Jong-nam lived in an apartment on the southern tip of Macau's Coloane Island until 2007. An anonymous South Korean official reported in October 2010 that Jong-nam had not lived in Macau for \"months\", and now goes between China and \"another country.\"\n",
-      "\n",
-      "When his father died, Kim Jong-nam did not attend the funeral.  This was to avoid rumours on the succession.\n",
-      "\n",
-      "He was assassinated in Malaysia on February 13, 2017, which is believed to be ordered by his half-brother Kim Jong-un.\n",
-      "\n",
-      "Personal life\n",
-      "The South Korean newspaper The Chosun Ilbo said that Kim Jong-nam has two wives, at least one mistress, and several children. His first wife Shin Jong-hui (born c. 1980) and their son Kum-sol (born c. 1996) live at a home called Dragon Villa on the northern outskirts of Beijing. His second wife Lee Hye-kyong (born c. 1970), their son Han-sol (born c. 1995) and their daughter Sol-hui (born c. 1998) live in an apartment building in Macau. Jong-nam's mistress, former Air Koryo flight attendant So Yong-la (born c. 1980), also lives in Macau. \n",
-      "\n",
-      "Jong-nam is often given attention by the media for his gambling and extravagant spending.\n",
-      "\n",
-      "References\n",
-      "\n",
-      "1971 births\n",
-      "2017 deaths\n",
-      "Assassinated people\n",
-      "North Korean politicians\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'Aggregate': {'Article': [{'meta': {'count': 25000}}]}}"
-      ]
-     },
-     "execution_count": 118,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Test our insert has worked by checking one object\n",
    "print(client.data_object.get()['objects'][0]['properties']['title'])\n",
@ -928,7 +559,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": null,
   "id": "5acd5437",
   "metadata": {},
   "outputs": [],
@ -954,37 +585,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": null,
   "id": "15def653",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1. Museum of Modern Art (Score: 0.938)\n",
-      "2. Western Europe (Score: 0.934)\n",
-      "3. Renaissance art (Score: 0.932)\n",
-      "4. Pop art (Score: 0.93)\n",
-      "5. Northern Europe (Score: 0.927)\n",
-      "6. Hellenistic art (Score: 0.926)\n",
-      "7. Modernist literature (Score: 0.924)\n",
-      "8. Art film (Score: 0.922)\n",
-      "9. Central Europe (Score: 0.921)\n",
-      "10. Art (Score: 0.921)\n",
-      "11. European (Score: 0.921)\n",
-      "12. Byzantine art (Score: 0.92)\n",
-      "13. Postmodernism (Score: 0.92)\n",
-      "14. Eastern Europe (Score: 0.92)\n",
-      "15. Cubism (Score: 0.92)\n",
-      "16. Europe (Score: 0.919)\n",
-      "17. Impressionism (Score: 0.919)\n",
-      "18. Bauhaus (Score: 0.919)\n",
-      "19. Surrealism (Score: 0.919)\n",
-      "20. Expressionism (Score: 0.919)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "query_result = query_weaviate('modern art in Europe','Article')\n",
    "counter = 0\n",
@ -995,37 +599,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": null,
   "id": "93c4a696",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1. Historic Scotland (Score: 0.946)\n",
-      "2. First War of Scottish Independence (Score: 0.946)\n",
-      "3. Battle of Bannockburn (Score: 0.946)\n",
-      "4. Wars of Scottish Independence (Score: 0.944)\n",
-      "5. Second War of Scottish Independence (Score: 0.939)\n",
-      "6. List of Scottish monarchs (Score: 0.937)\n",
-      "7. Scottish Borders (Score: 0.932)\n",
-      "8. Braveheart (Score: 0.929)\n",
-      "9. John of Scotland (Score: 0.929)\n",
-      "10. Guardians of Scotland (Score: 0.926)\n",
-      "11. Holyrood Abbey (Score: 0.925)\n",
-      "12. Scottish (Score: 0.925)\n",
-      "13. Scots (Score: 0.925)\n",
-      "14. Robert I of Scotland (Score: 0.924)\n",
-      "15. Scottish people (Score: 0.924)\n",
-      "16. Alexander I of Scotland (Score: 0.924)\n",
-      "17. Edinburgh Castle (Score: 0.924)\n",
-      "18. Robert Burns (Score: 0.923)\n",
-      "19. Battle of Bosworth Field (Score: 0.922)\n",
-      "20. David II of Scotland (Score: 0.922)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "query_result = query_weaviate('Famous battles in Scottish history','Article')\n",
    "counter = 0\n",
@ -1063,7 +640,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 99,
+   "execution_count": null,
   "id": "76d697e9",
   "metadata": {
    "ExecuteTime": {
@ -1078,7 +655,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": null,
   "id": "1deeb539",
   "metadata": {
    "ExecuteTime": {
@ -1086,18 +663,7 @@
     "start_time": "2023-01-18T09:29:19.727897Z"
    }
   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "CollectionsResponse(collections=[])"
-      ]
-     },
-     "execution_count": 100,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "qdrant.get_collections()"
   ]
@ -1116,7 +682,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": null,
   "id": "1a84ee1d",
   "metadata": {
    "ExecuteTime": {
@ -1131,7 +697,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": null,
   "id": "00876f92",
   "metadata": {
    "ExecuteTime": {