Updated code cell numbers

pull/1220/head
Mandeep Singh 4 weeks ago
parent 30f87d7e85
commit 15d7bcd19b

@ -42,20 +42,18 @@
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8d8810f9",
"metadata": {},
"outputs": [],
"source": [
"# We'll need to install Qdrant client\n",
"!pip install qdrant-client\n",
"\n",
"#Install wget to pull zip file\n",
"!pip install wget"
],
"outputs": [],
"execution_count": null
"!pip install qdrant-client"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5be94df6",
"metadata": {
"ExecuteTime": {
@ -63,14 +61,12 @@
"start_time": "2024-05-21T23:49:06.923221Z"
}
},
"outputs": [],
"source": [
"import openai\n",
"\n",
"import pandas as pd\n",
"from ast import literal_eval\n",
"\n",
"# Qdrant's client library for Python\n",
"import qdrant_client\n",
"import qdrant_client # Qdrant's client library for Python\n",
"\n",
"# This can be changed to the embedding model of your choice. Make sure it's the same model that was used to generate the embeddings\n",
"EMBEDDING_MODEL = \"text-embedding-ada-002\"\n",
@ -80,9 +76,7 @@
"\n",
"warnings.filterwarnings(action=\"ignore\", message=\"unclosed\", category=ResourceWarning)\n",
"warnings.filterwarnings(\"ignore\", category=DeprecationWarning) "
],
"outputs": [],
"execution_count": 88
]
},
{
"cell_type": "markdown",
@ -96,6 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5dff8b55",
"metadata": {
"ExecuteTime": {
@ -103,6 +98,7 @@
"start_time": "2024-05-21T23:49:41.132888Z"
}
},
"outputs": [],
"source": [
"import requests\n",
"\n",
@ -112,12 +108,11 @@
"response = requests.get(embeddings_url, verify=True) # Set verify=False to bypass SSL verification\n",
"with open('vector_database_wikipedia_articles_embedded.zip', 'wb') as file:\n",
" file.write(response.content)"
],
"outputs": [],
"execution_count": 89
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "21097972",
"metadata": {
"ExecuteTime": {
@ -125,16 +120,16 @@
"start_time": "2024-05-21T23:50:53.171125Z"
}
},
"outputs": [],
"source": [
"import zipfile\n",
"with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\"r\") as zip_ref:\n",
" zip_ref.extractall(\"../data\")"
],
"outputs": [],
"execution_count": 90
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "70bbd8ba",
"metadata": {
"ExecuteTime": {
@ -142,14 +137,14 @@
"start_time": "2024-05-21T23:50:57.592940Z"
}
},
"outputs": [],
"source": [
"article_df = pd.read_csv('../data/vector_database_wikipedia_articles_embedded.csv')"
],
"outputs": [],
"execution_count": 91
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1721e45d",
"metadata": {
"ExecuteTime": {
@ -157,41 +152,9 @@
"start_time": "2024-05-21T23:51:13.700231Z"
}
},
"source": [
"article_df.head()"
],
"outputs": [
{
"data": {
"text/plain": [
" id url title \\\n",
"0 1 https://simple.wikipedia.org/wiki/April April \n",
"1 2 https://simple.wikipedia.org/wiki/August August \n",
"2 6 https://simple.wikipedia.org/wiki/Art Art \n",
"3 8 https://simple.wikipedia.org/wiki/A A \n",
"4 9 https://simple.wikipedia.org/wiki/Air Air \n",
"\n",
" text \\\n",
"0 April is the fourth month of the year in the J... \n",
"1 August (Aug.) is the eighth month of the year ... \n",
"2 Art is a creative activity that expresses imag... \n",
"3 A or a is the first letter of the English alph... \n",
"4 Air refers to the Earth's atmosphere. Air is a... \n",
"\n",
" title_vector \\\n",
"0 [0.001009464613161981, -0.020700545981526375, ... \n",
"1 [0.0009286514250561595, 0.000820168002974242, ... \n",
"2 [0.003393713850528002, 0.0061537534929811954, ... \n",
"3 [0.0153952119871974, -0.013759135268628597, 0.... \n",
"4 [0.02224554680287838, -0.02044147066771984, -0... \n",
"\n",
" content_vector vector_id \n",
"0 [-0.011253940872848034, -0.013491976074874401,... 0 \n",
"1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n",
"2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n",
"3 [0.024894846603274345, -0.022186409682035446, ... 3 \n",
"4 [0.021524671465158463, 0.018522677943110466, -... 4 "
],
"text/html": [
"<div>\n",
"<style scoped>\n",
@ -274,6 +237,35 @@
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id url title \\\n",
"0 1 https://simple.wikipedia.org/wiki/April April \n",
"1 2 https://simple.wikipedia.org/wiki/August August \n",
"2 6 https://simple.wikipedia.org/wiki/Art Art \n",
"3 8 https://simple.wikipedia.org/wiki/A A \n",
"4 9 https://simple.wikipedia.org/wiki/Air Air \n",
"\n",
" text \\\n",
"0 April is the fourth month of the year in the J... \n",
"1 August (Aug.) is the eighth month of the year ... \n",
"2 Art is a creative activity that expresses imag... \n",
"3 A or a is the first letter of the English alph... \n",
"4 Air refers to the Earth's atmosphere. Air is a... \n",
"\n",
" title_vector \\\n",
"0 [0.001009464613161981, -0.020700545981526375, ... \n",
"1 [0.0009286514250561595, 0.000820168002974242, ... \n",
"2 [0.003393713850528002, 0.0061537534929811954, ... \n",
"3 [0.0153952119871974, -0.013759135268628597, 0.... \n",
"4 [0.02224554680287838, -0.02044147066771984, -0... \n",
"\n",
" content_vector vector_id \n",
"0 [-0.011253940872848034, -0.013491976074874401,... 0 \n",
"1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n",
"2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n",
"3 [0.024894846603274345, -0.022186409682035446, ... 3 \n",
"4 [0.021524671465158463, 0.018522677943110466, -... 4 "
]
},
"execution_count": 92,
@ -281,10 +273,13 @@
"output_type": "execute_result"
}
],
"execution_count": 92
"source": [
"article_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "960b82af",
"metadata": {
"ExecuteTime": {
@ -292,6 +287,7 @@
"start_time": "2024-05-21T23:51:16.274336Z"
}
},
"outputs": [],
"source": [
"# Read vectors from strings back into a list\n",
"article_df['title_vector'] = article_df.title_vector.apply(literal_eval)\n",
@ -299,12 +295,11 @@
"\n",
"# Set vector_id to be a string\n",
"article_df['vector_id'] = article_df['vector_id'].apply(str)"
],
"outputs": [],
"execution_count": 93
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a334ab8b",
"metadata": {
"ExecuteTime": {
@ -312,9 +307,6 @@
"start_time": "2024-05-21T23:55:36.038710Z"
}
},
"source": [
"article_df.info(show_counts=True)"
],
"outputs": [
{
"name": "stdout",
@ -337,7 +329,9 @@
]
}
],
"execution_count": 94
"source": [
"article_df.info(show_counts=True)"
]
},
{
"cell_type": "markdown",
@ -370,6 +364,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"id": "76d697e9",
"metadata": {
"ExecuteTime": {
@ -377,12 +372,14 @@
"start_time": "2024-05-21T23:55:56.517724Z"
}
},
"source": "qdrant = qdrant_client.QdrantClient(host=\"localhost\", port=6333)",
"outputs": [],
"execution_count": 95
"source": [
"qdrant = qdrant_client.QdrantClient(host=\"localhost\", port=6333)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "1deeb539",
"metadata": {
"ExecuteTime": {
@ -390,9 +387,6 @@
"start_time": "2024-05-21T23:55:57.312830Z"
}
},
"source": [
"qdrant.get_collections()"
],
"outputs": [
{
"data": {
@ -405,7 +399,9 @@
"output_type": "execute_result"
}
],
"execution_count": 96
"source": [
"qdrant.get_collections()"
]
},
{
"cell_type": "markdown",
@ -421,6 +417,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"id": "1a84ee1d",
"metadata": {
"ExecuteTime": {
@ -428,14 +425,14 @@
"start_time": "2024-05-21T23:56:04.064878Z"
}
},
"outputs": [],
"source": [
"from qdrant_client.http import models as rest"
],
"outputs": [],
"execution_count": 97
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "00876f92",
"metadata": {
"ExecuteTime": {
@ -443,6 +440,18 @@
"start_time": "2024-05-21T23:56:05.247948Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the vector size from the first row to set up the collection\n",
"vector_size = len(article_df['content_vector'][0])\n",
@ -461,7 +470,18 @@
" ),\n",
" }\n",
")"
],
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "9f39a8c395554ca3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-21T23:56:21.577594Z",
"start_time": "2024-05-21T23:56:21.460740Z"
}
},
"outputs": [
{
"data": {
@ -469,21 +489,11 @@
"True"
]
},
"execution_count": 98,
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 98
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-21T23:56:21.577594Z",
"start_time": "2024-05-21T23:56:21.460740Z"
}
},
"cell_type": "code",
"source": [
"vector_size = len(article_df['content_vector'][0])\n",
"\n",
@ -500,30 +510,19 @@
" ),\n",
" }\n",
")"
],
"id": "9f39a8c395554ca3",
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 99
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "In addition to the vector configuration defined under `vector`, we can also define the `payload` configuration. The payload is an optional field that lets you store additional metadata alongside the vectors. In our case, we'll store the `id`, `title`, and `url` of each article. Because the search results return the titles of the nearest articles from the payload, we can also show the user the URL of each article (which is part of the same metadata).",
"id": "e95be6e0c9af4c21"
"id": "e95be6e0c9af4c21",
"metadata": {},
"source": [
"In addition to the vector configuration defined under `vector`, we can also define the `payload` configuration. The payload is an optional field that lets you store additional metadata alongside the vectors. In our case, we'll store the `id`, `title`, and `url` of each article. Because the search results return the titles of the nearest articles from the payload, we can also show the user the URL of each article (which is part of the same metadata)."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f24e76ab",
"metadata": {
"ExecuteTime": {
@ -531,6 +530,15 @@
"start_time": "2024-05-21T23:56:50.664145Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Upserting articles: 100%|██████████| 25000/25000 [01:34<00:00, 264.52it/s]\n"
]
}
],
"source": [
"from qdrant_client.models import PointStruct # Import the PointStruct to store the vector and payload\n",
"from tqdm import tqdm # Library to show the progress bar \n",
@ -556,20 +564,11 @@
" except Exception as e:\n",
" print(f\"Failed to upsert row {k}: {v}\")\n",
" print(f\"Exception: {e}\")"
],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Upserting articles: 100%|██████████| 25000/25000 [01:34<00:00, 264.52it/s]\n"
]
}
],
"execution_count": 100
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "d1188a12",
"metadata": {
"ExecuteTime": {
@ -577,10 +576,6 @@
"start_time": "2024-05-21T23:58:27.549740Z"
}
},
"source": [
"# Check the collection size to make sure all the points have been stored\n",
"qdrant.count(collection_name='Articles')"
],
"outputs": [
{
"data": {
@ -593,7 +588,10 @@
"output_type": "execute_result"
}
],
"execution_count": 101
"source": [
"# Check the collection size to make sure all the points have been stored\n",
"qdrant.count(collection_name='Articles')"
]
},
{
"cell_type": "markdown",
@ -607,6 +605,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"id": "f1bac4ef",
"metadata": {
"ExecuteTime": {
@ -614,6 +613,7 @@
"start_time": "2024-05-21T23:58:35.488963Z"
}
},
"outputs": [],
"source": [
"def query_qdrant(query, collection_name, vector_name='title', top_k=20):\n",
"\n",
@ -633,12 +633,11 @@
" )\n",
" \n",
" return query_results"
],
"outputs": [],
"execution_count": 102
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "aa92f3d3",
"metadata": {
"ExecuteTime": {
@ -646,11 +645,6 @@
"start_time": "2024-05-21T23:58:36.949491Z"
}
},
"source": [
"# Search on the title vectors and list each hit with its URL and similarity score\n",
"query_results = query_qdrant('modern art in Europe', 'Articles', 'title')\n",
"for rank, article in enumerate(query_results, start=1):\n",
"    # Use double quotes for the payload keys: reusing the f-string's own single quotes is a SyntaxError before Python 3.12\n",
"    print(f'{rank}. {article.payload[\"title\"]}, URL: {article.payload[\"url\"]} (Score: {round(article.score, 3)})')"
],
"outputs": [
{
"name": "stdout",
@ -679,10 +673,15 @@
]
}
],
"execution_count": 103
"source": [
"# Search on the title vectors and list each hit with its URL and similarity score\n",
"query_results = query_qdrant('modern art in Europe', 'Articles', 'title')\n",
"for rank, article in enumerate(query_results, start=1):\n",
"    # Use double quotes for the payload keys: reusing the f-string's own single quotes is a SyntaxError before Python 3.12\n",
"    print(f'{rank}. {article.payload[\"title\"]}, URL: {article.payload[\"url\"]} (Score: {round(article.score, 3)})')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7ed116b8",
"metadata": {
"ExecuteTime": {
@ -690,12 +689,6 @@
"start_time": "2024-05-21T23:58:52.924091Z"
}
},
"source": [
"# This time we'll query using the content vector and list each hit with its URL and similarity score\n",
"query_results = query_qdrant('Famous battles in Scottish history', 'Articles', 'content')\n",
"for rank, article in enumerate(query_results, start=1):\n",
"    # Use double quotes for the payload keys: reusing the f-string's own single quotes is a SyntaxError before Python 3.12\n",
"    print(f'{rank}. {article.payload[\"title\"]}, URL: {article.payload[\"url\"]} (Score: {round(article.score, 3)})')"
],
"outputs": [
{
"name": "stdout",
@ -724,15 +717,20 @@
]
}
],
"execution_count": 104
"source": [
"# This time we'll query using the content vector and list each hit with its URL and similarity score\n",
"query_results = query_qdrant('Famous battles in Scottish history', 'Articles', 'content')\n",
"for rank, article in enumerate(query_results, start=1):\n",
"    # Use double quotes for the payload keys: reusing the f-string's own single quotes is a SyntaxError before Python 3.12\n",
"    print(f'{rank}. {article.payload[\"title\"]}, URL: {article.payload[\"url\"]} (Score: {round(article.score, 3)})')"
]
},
{
"metadata": {},
"cell_type": "code",
"execution_count": 19,
"id": "cd4f750dc6daa2e8",
"metadata": {},
"outputs": [],
"execution_count": null,
"source": "",
"id": "cd4f750dc6daa2e8"
"source": []
}
],
"metadata": {

Loading…
Cancel
Save