You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
openai-cookbook/examples/vector_databases/kusto/Getting_started_with_kusto_...

2 lines
56 KiB
Plaintext

{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":[]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["# Kusto as a Vector database for AI embeddings"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["This Notebook provides step by step instructions on using Azure Data Explorer (Kusto) as a vector database with OpenAI embeddings. "]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["This notebook presents an end-to-end process of:\n","\n","1. Using precomputed embeddings created by OpenAI API.\n","2. Storing the embeddings in Kusto.\n","3. Converting raw text query to an embedding with OpenAI API.\n","4. Using Kusto to perform cosine similarity search in the stored embeddings\n"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["### Prerequisites"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["For the purposes of this exercise we need to prepare a couple of things:\n","\n","1. Azure Data Explorer (Kusto) server instance. https://azure.microsoft.com/en-us/products/data-explorer\n","2. 
Azure OpenAI credentials or OpenAI API key."]},{"cell_type":"code","execution_count":2,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:24:58.8253972Z","execution_start_time":"2023-05-10T09:24:58.8250545Z","livy_statement_state":"available","parent_msg_id":"affb2f05-b242-4152-99b2-f30e3f854c21","queued_time":"2023-05-10T09:24:43.3953963Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":-1},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, -1, Finished, Available)"]},"metadata":{},"output_type":"display_data"},{"data":{},"execution_count":2,"metadata":{},"output_type":"execute_result"},{"name":"stdout","output_type":"stream","text":["Collecting wget\n"," Downloading wget-3.2.zip (10 kB)\n"," Preparing metadata (setup.py) ... \u001b[?25ldone\n","\u001b[?25hBuilding wheels for collected packages: wget\n"," Building wheel for wget (setup.py) ... 
\u001b[?25l-\b \bdone\n","\u001b[?25h Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=10fd8aa1d20fd49c36389dc888acc721d0578c5a0635fc9fc5dc642c0f49522e\n"," Stored in directory: /home/trusted-service-user/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769\n","Successfully built wget\n","Installing collected packages: wget\n","Successfully installed wget-3.2\n","\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/nfs4/pyenv-27214bb4-edfd-4fdd-b888-8a99075a1416/bin/python -m pip install --upgrade pip\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]},{"data":{},"execution_count":2,"metadata":{},"output_type":"execute_result"},{"name":"stdout","output_type":"stream","text":["Warning: PySpark kernel has been restarted to use updated packages.\n","\n"]}],"source":["%pip install wget"]},{"cell_type":"code","execution_count":3,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:25:13.0187836Z","execution_start_time":"2023-05-10T09:25:13.0184873Z","livy_statement_state":"available","parent_msg_id":"d4c4cf5b-21c4-4804-9db2-1f18a6be7733","queued_time":"2023-05-10T09:24:43.5454914Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":-1},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, -1, Finished, 
Available)"]},"metadata":{},"output_type":"display_data"},{"data":{},"execution_count":3,"metadata":{},"output_type":"execute_result"},{"name":"stdout","output_type":"stream","text":["Collecting openai\n"," Downloading openai-0.27.6-py3-none-any.whl (71 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.9/71.9 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n","\u001b[?25hRequirement already satisfied: tqdm in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from openai) (4.65.0)\n","Requirement already satisfied: requests>=2.20 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from openai) (2.28.2)\n","Requirement already satisfied: aiohttp in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from openai) (3.8.4)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.20->openai) (1.26.14)\n","Requirement already satisfied: certifi>=2017.4.17 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.20->openai) (2022.12.7)\n","Requirement already satisfied: idna<4,>=2.5 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.20->openai) (3.4)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.20->openai) (2.1.1)\n","Requirement already satisfied: attrs>=17.3.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from aiohttp->openai) (22.2.0)\n","Requirement already satisfied: frozenlist>=1.1.1 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from aiohttp->openai) (1.3.3)\n","Requirement already satisfied: multidict<7.0,>=4.5 in 
/home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from aiohttp->openai) (6.0.4)\n","Requirement already satisfied: yarl<2.0,>=1.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from aiohttp->openai) (1.8.2)\n","Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from aiohttp->openai) (4.0.2)\n","Requirement already satisfied: aiosignal>=1.1.2 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from aiohttp->openai) (1.3.1)\n","Installing collected packages: openai\n","Successfully installed openai-0.27.6\n","\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/nfs4/pyenv-27214bb4-edfd-4fdd-b888-8a99075a1416/bin/python -m pip install --upgrade pip\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]},{"data":{},"execution_count":3,"metadata":{},"output_type":"execute_result"},{"name":"stdout","output_type":"stream","text":["Warning: PySpark kernel has been restarted to use updated packages.\n","\n"]}],"source":["%pip install 
openai"]},{"cell_type":"code","execution_count":17,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:40:42.8774236Z","execution_start_time":"2023-05-10T09:40:42.8771662Z","livy_statement_state":"available","parent_msg_id":"7a7dfea4-d420-4b03-a2c6-82c65dfe177e","queued_time":"2023-05-10T09:40:30.9990512Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":-1},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, -1, Finished, Available)"]},"metadata":{},"output_type":"display_data"},{"data":{},"execution_count":17,"metadata":{},"output_type":"execute_result"},{"name":"stdout","output_type":"stream","text":["Requirement already satisfied: azure-kusto-data in /nfs4/pyenv-27214bb4-edfd-4fdd-b888-8a99075a1416/lib/python3.10/site-packages (4.1.4)\n","Requirement already satisfied: msal<2,>=1.9.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-kusto-data) (1.21.0)\n","Requirement already satisfied: python-dateutil>=2.8.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-kusto-data) (2.8.2)\n","Requirement already satisfied: azure-core<2,>=1.11.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-kusto-data) (1.26.4)\n","Requirement already satisfied: requests>=2.13.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-kusto-data) (2.28.2)\n","Requirement already satisfied: ijson~=3.1 in /nfs4/pyenv-27214bb4-edfd-4fdd-b888-8a99075a1416/lib/python3.10/site-packages (from azure-kusto-data) (3.2.0.post0)\n","Requirement already 
satisfied: azure-identity<2,>=1.5.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-kusto-data) (1.12.0)\n","Requirement already satisfied: six>=1.11.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-core<2,>=1.11.0->azure-kusto-data) (1.16.0)\n","Requirement already satisfied: typing-extensions>=4.3.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-core<2,>=1.11.0->azure-kusto-data) (4.5.0)\n","Requirement already satisfied: cryptography>=2.5 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-identity<2,>=1.5.0->azure-kusto-data) (40.0.1)\n","Requirement already satisfied: msal-extensions<2.0.0,>=0.3.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from azure-identity<2,>=1.5.0->azure-kusto-data) (1.0.0)\n","Requirement already satisfied: PyJWT[crypto]<3,>=1.0.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from msal<2,>=1.9.0->azure-kusto-data) (2.6.0)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.13.0->azure-kusto-data) (1.26.14)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.13.0->azure-kusto-data) (2.1.1)\n","Requirement already satisfied: idna<4,>=2.5 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.13.0->azure-kusto-data) (3.4)\n","Requirement already satisfied: certifi>=2017.4.17 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from requests>=2.13.0->azure-kusto-data) (2022.12.7)\n","Requirement already satisfied: cffi>=1.12 in 
/home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from cryptography>=2.5->azure-identity<2,>=1.5.0->azure-kusto-data) (1.15.1)\n","Requirement already satisfied: portalocker<3,>=1.0 in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from msal-extensions<2.0.0,>=0.3.0->azure-identity<2,>=1.5.0->azure-kusto-data) (2.7.0)\n","Requirement already satisfied: pycparser in /home/trusted-service-user/cluster-env/trident_env/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=2.5->azure-identity<2,>=1.5.0->azure-kusto-data) (2.21)\n","\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n","\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49m/nfs4/pyenv-27214bb4-edfd-4fdd-b888-8a99075a1416/bin/python -m pip install --upgrade pip\u001b[0m\n","Note: you may need to restart the kernel to use updated packages.\n"]},{"data":{},"execution_count":17,"metadata":{},"output_type":"execute_result"},{"name":"stdout","output_type":"stream","text":["Warning: PySpark kernel has been restarted to use updated packages.\n","\n"]}],"source":["%pip install azure-kusto-data"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["### Download precomputed Embeddings\n","\n"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["In this section we are going to load prepared embedding data, so you don't have to recompute the embeddings of Wikipedia articles with your own 
credits.\n"]},{"cell_type":"code","execution_count":5,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:25:28.6496025Z","execution_start_time":"2023-05-10T09:25:23.4822208Z","livy_statement_state":"available","parent_msg_id":"2d10f66b-3875-426b-abdf-b88b2d64d7ec","queued_time":"2023-05-10T09:24:44.1318823Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":17},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 17, Finished, Available)"]},"metadata":{},"output_type":"display_data"},{"data":{"text/plain":["'vector_database_wikipedia_articles_embedded.zip'"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["import wget\n","\n","embeddings_url = \"https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip\"\n","\n","# The file is ~700 MB so this will take some time\n","wget.download(embeddings_url)"]},{"cell_type":"code","execution_count":6,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:25:50.6601867Z","execution_start_time":"2023-05-10T09:25:28.9912827Z","livy_statement_state":"available","parent_msg_id":"093e0086-0bc4-4fb8-b656-3484f31b675e","queued_time":"2023-05-10T09:24:44.6002568Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":18},"text/plain":["StatementMeta(, 
7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 18, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["\n","import zipfile\n","\n","with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\"r\") as zip_ref:\n"," zip_ref.extractall(\"/lakehouse/default/Files/data\")"]},{"cell_type":"code","execution_count":7,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:31:38.5924721Z","execution_start_time":"2023-05-10T09:25:50.9770732Z","livy_statement_state":"available","parent_msg_id":"fab21836-4966-4068-8725-8fac982ca969","queued_time":"2023-05-10T09:24:44.8734372Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":19},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 19, Finished, Available)"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>url</th>\n"," <th>title</th>\n"," <th>text</th>\n"," <th>title_vector</th>\n"," <th>content_vector</th>\n"," <th>vector_id</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1</td>\n"," <td>https://simple.wikipedia.org/wiki/April</td>\n"," <td>April</td>\n"," <td>April is the fourth month of the year in the J...</td>\n"," <td>[0.001009464613161981, -0.020700545981526375, ...</td>\n"," 
<td>[-0.011253940872848034, -0.013491976074874401,...</td>\n"," <td>0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>2</td>\n"," <td>https://simple.wikipedia.org/wiki/August</td>\n"," <td>August</td>\n"," <td>August (Aug.) is the eighth month of the year ...</td>\n"," <td>[0.0009286514250561595, 0.000820168002974242, ...</td>\n"," <td>[0.0003609954728744924, 0.007262262050062418, ...</td>\n"," <td>1</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>6</td>\n"," <td>https://simple.wikipedia.org/wiki/Art</td>\n"," <td>Art</td>\n"," <td>Art is a creative activity that expresses imag...</td>\n"," <td>[0.003393713850528002, 0.0061537534929811954, ...</td>\n"," <td>[-0.004959689453244209, 0.015772193670272827, ...</td>\n"," <td>2</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>8</td>\n"," <td>https://simple.wikipedia.org/wiki/A</td>\n"," <td>A</td>\n"," <td>A or a is the first letter of the English alph...</td>\n"," <td>[0.0153952119871974, -0.013759135268628597, 0....</td>\n"," <td>[0.024894846603274345, -0.022186409682035446, ...</td>\n"," <td>3</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>9</td>\n"," <td>https://simple.wikipedia.org/wiki/Air</td>\n"," <td>Air</td>\n"," <td>Air refers to the Earth's atmosphere. Air is a...</td>\n"," <td>[0.02224554680287838, -0.02044147066771984, -0...</td>\n"," <td>[0.021524671465158463, 0.018522677943110466, -...</td>\n"," <td>4</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" id url title \\\n","0 1 https://simple.wikipedia.org/wiki/April April \n","1 2 https://simple.wikipedia.org/wiki/August August \n","2 6 https://simple.wikipedia.org/wiki/Art Art \n","3 8 https://simple.wikipedia.org/wiki/A A \n","4 9 https://simple.wikipedia.org/wiki/Air Air \n","\n"," text \\\n","0 April is the fourth month of the year in the J... \n","1 August (Aug.) is the eighth month of the year ... \n","2 Art is a creative activity that expresses imag... \n","3 A or a is the first letter of the English alph... 
\n","4 Air refers to the Earth's atmosphere. Air is a... \n","\n"," title_vector \\\n","0 [0.001009464613161981, -0.020700545981526375, ... \n","1 [0.0009286514250561595, 0.000820168002974242, ... \n","2 [0.003393713850528002, 0.0061537534929811954, ... \n","3 [0.0153952119871974, -0.013759135268628597, 0.... \n","4 [0.02224554680287838, -0.02044147066771984, -0... \n","\n"," content_vector vector_id \n","0 [-0.011253940872848034, -0.013491976074874401,... 0 \n","1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n","2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n","3 [0.024894846603274345, -0.022186409682035446, ... 3 \n","4 [0.021524671465158463, 0.018522677943110466, -... 4 "]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","from ast import literal_eval\n","\n","article_df = pd.read_csv('/lakehouse/default/Files/data/vector_database_wikipedia_articles_embedded.csv')\n","# Read vectors from strings back into a list\n","article_df[\"title_vector\"] = article_df.title_vector.apply(literal_eval)\n","article_df[\"content_vector\"] = article_df.content_vector.apply(literal_eval)\n","article_df.head()\n"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["### Store vectors in a Kusto table\n"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["Create a table & load the vectors in Kusto based on the contents in the dataframe. 
The Spark option `CreateIfNotExist` will automatically create the table if it doesn't exist\n"]},{"cell_type":"code","execution_count":20,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:41:07.7982707Z","execution_start_time":"2023-05-10T09:41:07.4379451Z","livy_statement_state":"available","parent_msg_id":"0a5fca1c-3aae-4b95-86c8-e098d0369f95","queued_time":"2023-05-10T09:41:07.1656842Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":37},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 37, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["# replace with your AAD Tenant ID, Kusto Cluster URI, Kusto DB name and Kusto Table\n","AAD_TENANT_ID = \"\"\n","KUSTO_CLUSTER = \"\"\n","KUSTO_DATABASE = \"Vector\"\n","KUSTO_TABLE = \"Wiki\""]},{"cell_type":"code","execution_count":9,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:31:43.2629717Z","execution_start_time":"2023-05-10T09:31:39.5426716Z","livy_statement_state":"available","parent_msg_id":"ece408d2-5f62-4a31-acfb-530070444031","queued_time":"2023-05-10T09:24:45.2865763Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":21},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 21, Finished, 
Available)"]},"metadata":{},"output_type":"display_data"}],"source":["\n","kustoOptions = {\"kustoCluster\": KUSTO_CLUSTER, \"kustoDatabase\" :KUSTO_DATABASE, \"kustoTable\" : KUSTO_TABLE }\n","\n","# Replace the auth method based on your desired authentication mechanism - https://github.com/Azure/azure-kusto-spark/blob/master/docs/Authentication.md\n","access_token=mssparkutils.credentials.getToken(kustoOptions[\"kustoCluster\"])"]},{"cell_type":"code","execution_count":10,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:31:48.9063385Z","execution_start_time":"2023-05-10T09:31:43.5930348Z","livy_statement_state":"available","parent_msg_id":"421b9d33-f3ce-4f20-a8d4-5d0958f240c9","queued_time":"2023-05-10T09:24:45.5199822Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":22},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 22, Finished, Available)"]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["/opt/spark/python/lib/pyspark.zip/pyspark/sql/pandas/conversion.py:604: FutureWarning: iteritems is deprecated and will be removed in a future version. 
Use .items instead.\n"]}],"source":["#Pandas data frame to spark dataframe\n","sparkDF=spark.createDataFrame(article_df)"]},{"cell_type":"code","execution_count":11,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:34:35.1205454Z","execution_start_time":"2023-05-10T09:31:49.2428848Z","livy_statement_state":"available","parent_msg_id":"7e182f67-e09f-4aa4-9fca-d67b13f01f19","queued_time":"2023-05-10T09:24:45.7895854Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[{"completionTime":"2023-05-10T09:32:55.544GMT","dataRead":0,"dataWritten":0,"description":"Job group for statement 23:\n# Write data to a Kusto table\nsparkDF.write. format(\"com.microsoft.kusto.spark.synapse.datasource\"). option(\"kustoCluster\",kustoOptions[\"kustoCluster\"]). option(\"kustoDatabase\",kustoOptions[\"kustoDatabase\"]). option(\"kustoTable\", kustoOptions[\"kustoTable\"]). option(\"accessToken\", access_token). option(\"tableCreateOptions\", \"CreateIfNotExist\").mode(\"Append\"). save()\n","jobGroup":"23","jobId":7,"killedTasksSummary":{},"name":"foreachPartition at KustoWriter.scala:116","numActiveStages":0,"numActiveTasks":0,"numCompletedIndices":8,"numCompletedStages":1,"numCompletedTasks":8,"numFailedStages":0,"numFailedTasks":0,"numKilledTasks":0,"numSkippedStages":0,"numSkippedTasks":0,"numTasks":8,"rowCount":0,"stageIds":[10],"status":"SUCCEEDED","submissionTime":"2023-05-10T09:32:00.547GMT"}],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":1,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":23},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 23, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["# Write data to a Kusto table\n","sparkDF.write. 
\\\n","format(\"com.microsoft.kusto.spark.synapse.datasource\"). \\\n","option(\"kustoCluster\",kustoOptions[\"kustoCluster\"]). \\\n","option(\"kustoDatabase\",kustoOptions[\"kustoDatabase\"]). \\\n","option(\"kustoTable\", kustoOptions[\"kustoTable\"]). \\\n","option(\"accessToken\", access_token). \\\n","option(\"tableCreateOptions\", \"CreateIfNotExist\").\\\n","mode(\"Append\"). \\\n","save()\n"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["### Prepare your OpenAI API key\n"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["The OpenAI API key is used for vectorization of the documents and queries. You can follow these instructions to create and retrieve your Azure OpenAI key and endpoint: https://learn.microsoft.com/en-us/azure/cognitive-services/openai/tutorials/embeddings\n","\n","\n","Please make sure to use the `text-embedding-3-small` model. Since the precomputed embeddings were created with the `text-embedding-3-small` model, we also have to use it during search.\n"]},{"cell_type":"code","execution_count":26,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:41:51.4743874Z","execution_start_time":"2023-05-10T09:41:51.1273995Z","livy_statement_state":"available","parent_msg_id":"53239dc7-36b8-4cbc-a4db-ce263729cd4f","queued_time":"2023-05-10T09:41:50.8249073Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":43},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 43, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["import 
openai"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["#### If using Azure Open AI"]},{"cell_type":"code","execution_count":27,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:41:53.0029722Z","execution_start_time":"2023-05-10T09:41:52.6610261Z","livy_statement_state":"available","parent_msg_id":"9546736c-bebc-411d-a50a-6cf69eb5e70d","queued_time":"2023-05-10T09:41:52.3687239Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":44},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 44, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["openai.api_version = '2022-12-01'\n","openai.api_base = '' # Please add your endpoint here\n","openai.api_type = 'azure'\n","openai.api_key = '' # Please add your api key here\n","\n","def embed(query):\n"," # Creates embedding vector from user query\n"," embedded_query = openai.Embedding.create(\n"," input=query,\n"," deployment_id=\"embed\", #replace with your deployment id\n"," chunk_size=1\n"," )[\"data\"][0][\"embedding\"]\n"," return embedded_query"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["#### If using Open AI"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["Only run this cell if you plan to use Open AI for embedding"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["openai.api_key = \"\"\n","\n","\n","def embed(query):\n"," # Creates embedding vector from user query\n"," embedded_query = openai.Embedding.create(\n"," input=query,\n"," model=\"text-embedding-3-small\",\n"," )[\"data\"][0][\"embedding\"]\n"," return 
embedded_query"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Generate embedding for the search term"]},{"cell_type":"code","execution_count":28,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:41:56.2227563Z","execution_start_time":"2023-05-10T09:41:55.1887813Z","livy_statement_state":"available","parent_msg_id":"1001b7dc-2b09-421e-aca4-e7a2b17c7d0e","queued_time":"2023-05-10T09:41:54.8728344Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":45},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 45, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["\n","searchedEmbedding = embed(\"places where you worship\")\n","#print(searchedEmbedding)"]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["#### Semantic search in Kusto "]},{"attachments":{},"cell_type":"markdown","metadata":{"nteract":{"transient":{"deleting":false}}},"source":["We will search the Kusto table for the closest vectors.\n","\n","We will be using the series-cosine-similarity-fl UDF for similarity search. 
\n","\n","Please create the function in your database before proceeding -\n","https://learn.microsoft.com/en-us/azure/data-explorer/kusto/functions-library/series-cosine-similarity-fl?tabs=query-defined"]},{"cell_type":"code","execution_count":18,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:40:52.3506224Z","execution_start_time":"2023-05-10T09:40:51.397278Z","livy_statement_state":"available","parent_msg_id":"5c692f91-d611-4f22-a2f3-e008bb8d89d8","queued_time":"2023-05-10T09:40:46.211323Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":35},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 35, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["from azure.kusto.data import KustoClient, KustoConnectionStringBuilder\n","from azure.kusto.data.exceptions import KustoServiceError\n","from azure.kusto.data.helpers import dataframe_from_result_table\n","import pandas as 
pd"]},{"cell_type":"code","execution_count":21,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:41:14.0650747Z","execution_start_time":"2023-05-10T09:41:13.6897398Z","livy_statement_state":"available","parent_msg_id":"2e004b80-df5e-4873-8dc4-dc9b0fcdfd1b","queued_time":"2023-05-10T09:41:13.4128259Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":38},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 38, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["# Build the connection string for the cluster using AAD device authentication\n","KCSB = KustoConnectionStringBuilder.with_aad_device_authentication(KUSTO_CLUSTER)\n","# Point the authority at the configured AAD tenant\n","KCSB.authority_id = AAD_TENANT_ID"]},{"cell_type":"code","execution_count":22,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:41:17.1788945Z","execution_start_time":"2023-05-10T09:41:16.7709048Z","livy_statement_state":"available","parent_msg_id":"7b74117d-03a9-4963-9fbf-83dcf5eb33a8","queued_time":"2023-05-10T09:41:16.4790182Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":39},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 39, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["# Client used by the cells below to run queries\n","KUSTO_CLIENT = 
KustoClient(KCSB)"]},{"cell_type":"code","execution_count":31,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:45:56.5295656Z","execution_start_time":"2023-05-10T09:45:54.8312637Z","livy_statement_state":"available","parent_msg_id":"848897fc-69b2-4108-8cb7-844d8dd473d7","queued_time":"2023-05-10T09:45:54.458869Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":48},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 48, Finished, Available)"]},"metadata":{},"output_type":"display_data"}],"source":["# Rank rows by cosine similarity between the query embedding and content_vector,\n","# keeping the 10 highest-scoring rows\n","KUSTO_QUERY = f\"Wiki | extend similarity = series_cosine_similarity_fl(dynamic({searchedEmbedding}), content_vector,1,1) | top 10 by similarity desc \"\n","\n","RESPONSE = KUSTO_CLIENT.execute(KUSTO_DATABASE, KUSTO_QUERY)"]},{"cell_type":"code","execution_count":32,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:46:02.8573664Z","execution_start_time":"2023-05-10T09:46:02.496154Z","livy_statement_state":"available","parent_msg_id":"36d4dd76-ca76-4ce0-aa35-5cd8d7c26b6f","queued_time":"2023-05-10T09:46:02.2249009Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":49},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 49, Finished, 
Available)"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>url</th>\n"," <th>title</th>\n"," <th>text</th>\n"," <th>title_vector</th>\n"," <th>content_vector</th>\n"," <th>vector_id</th>\n"," <th>similarity</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>852</td>\n"," <td>https://simple.wikipedia.org/wiki/Temple</td>\n"," <td>Temple</td>\n"," <td>A temple is a building where people go to prac...</td>\n"," <td>[-0.021837441250681877, -0.007722342386841774,...</td>\n"," <td>[-0.0019541378132998943, 0.007151313126087189,...</td>\n"," <td>413</td>\n"," <td>0.834495</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>78094</td>\n"," <td>https://simple.wikipedia.org/wiki/Christian%20...</td>\n"," <td>Christian worship</td>\n"," <td>In Christianity, worship has been thought as b...</td>\n"," <td>[0.0017675267299637198, -0.008890199474990368,...</td>\n"," <td>[0.020530683919787407, 0.0024345638230443, -0....</td>\n"," <td>20320</td>\n"," <td>0.832132</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>59154</td>\n"," <td>https://simple.wikipedia.org/wiki/Service%20of...</td>\n"," <td>Service of worship</td>\n"," <td>A service of worship is a religious meeting wh...</td>\n"," <td>[-0.007969820871949196, 0.0004240311391185969,...</td>\n"," <td>[0.003784010885283351, -0.0030924836173653603,...</td>\n"," <td>15519</td>\n"," <td>0.831633</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>51910</td>\n"," <td>https://simple.wikipedia.org/wiki/Worship</td>\n"," <td>Worship</td>\n"," <td>Worship is a word often used in religion. 
It ...</td>\n"," <td>[0.0036036288365721703, -0.01276545226573944, ...</td>\n"," <td>[0.007925753481686115, -0.0110504487529397, 0....</td>\n"," <td>14010</td>\n"," <td>0.828185</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>29576</td>\n"," <td>https://simple.wikipedia.org/wiki/Altar</td>\n"," <td>Altar</td>\n"," <td>An altar is a place, often a table, where a re...</td>\n"," <td>[0.007887467741966248, -0.02706138789653778, -...</td>\n"," <td>[0.023901859298348427, -0.031175222247838977, ...</td>\n"," <td>8708</td>\n"," <td>0.824124</td>\n"," </tr>\n"," <tr>\n"," <th>5</th>\n"," <td>92507</td>\n"," <td>https://simple.wikipedia.org/wiki/Shrine</td>\n"," <td>Shrine</td>\n"," <td>A shrine is a holy or sacred place with someth...</td>\n"," <td>[-0.011601685546338558, 0.006366696208715439, ...</td>\n"," <td>[0.016423320397734642, -0.0015560361789539456,...</td>\n"," <td>23945</td>\n"," <td>0.823863</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>815</td>\n"," <td>https://simple.wikipedia.org/wiki/Synagogue</td>\n"," <td>Synagogue</td>\n"," <td>A synagogue is a place where Jews meet to wors...</td>\n"," <td>[-0.017317570745944977, 0.0022673190105706453,...</td>\n"," <td>[-0.004515442531555891, 0.003739549545571208, ...</td>\n"," <td>398</td>\n"," <td>0.819942</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>68080</td>\n"," <td>https://simple.wikipedia.org/wiki/Shinto%20shrine</td>\n"," <td>Shinto shrine</td>\n"," <td>A Shinto shrine is a sacred place or site wher...</td>\n"," <td>[0.0035740730818361044, 0.0028098472394049168,...</td>\n"," <td>[0.011014971882104874, 0.00042272370774298906,...</td>\n"," <td>18106</td>\n"," <td>0.818475</td>\n"," </tr>\n"," <tr>\n"," <th>8</th>\n"," <td>57790</td>\n"," <td>https://simple.wikipedia.org/wiki/Chapel</td>\n"," <td>Chapel</td>\n"," <td>A chapel is a place for Christian worship. 
The...</td>\n"," <td>[-0.01371884811669588, 0.0031672674231231213, ...</td>\n"," <td>[0.002526090247556567, 0.02482965588569641, 0....</td>\n"," <td>15260</td>\n"," <td>0.817608</td>\n"," </tr>\n"," <tr>\n"," <th>9</th>\n"," <td>142</td>\n"," <td>https://simple.wikipedia.org/wiki/Church%20%28...</td>\n"," <td>Church (building)</td>\n"," <td>A church is a building that was constructed to...</td>\n"," <td>[0.0021336888894438744, 0.0029748091474175453,...</td>\n"," <td>[0.016109377145767212, 0.022908871993422508, 0...</td>\n"," <td>74</td>\n"," <td>0.812636</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" id url \\\n","0 852 https://simple.wikipedia.org/wiki/Temple \n","1 78094 https://simple.wikipedia.org/wiki/Christian%20... \n","2 59154 https://simple.wikipedia.org/wiki/Service%20of... \n","3 51910 https://simple.wikipedia.org/wiki/Worship \n","4 29576 https://simple.wikipedia.org/wiki/Altar \n","5 92507 https://simple.wikipedia.org/wiki/Shrine \n","6 815 https://simple.wikipedia.org/wiki/Synagogue \n","7 68080 https://simple.wikipedia.org/wiki/Shinto%20shrine \n","8 57790 https://simple.wikipedia.org/wiki/Chapel \n","9 142 https://simple.wikipedia.org/wiki/Church%20%28... \n","\n"," title text \\\n","0 Temple A temple is a building where people go to prac... \n","1 Christian worship In Christianity, worship has been thought as b... \n","2 Service of worship A service of worship is a religious meeting wh... \n","3 Worship Worship is a word often used in religion. It ... \n","4 Altar An altar is a place, often a table, where a re... \n","5 Shrine A shrine is a holy or sacred place with someth... \n","6 Synagogue A synagogue is a place where Jews meet to wors... \n","7 Shinto shrine A Shinto shrine is a sacred place or site wher... \n","8 Chapel A chapel is a place for Christian worship. The... \n","9 Church (building) A church is a building that was constructed to... 
\n","\n"," title_vector \\\n","0 [-0.021837441250681877, -0.007722342386841774,... \n","1 [0.0017675267299637198, -0.008890199474990368,... \n","2 [-0.007969820871949196, 0.0004240311391185969,... \n","3 [0.0036036288365721703, -0.01276545226573944, ... \n","4 [0.007887467741966248, -0.02706138789653778, -... \n","5 [-0.011601685546338558, 0.006366696208715439, ... \n","6 [-0.017317570745944977, 0.0022673190105706453,... \n","7 [0.0035740730818361044, 0.0028098472394049168,... \n","8 [-0.01371884811669588, 0.0031672674231231213, ... \n","9 [0.0021336888894438744, 0.0029748091474175453,... \n","\n"," content_vector vector_id similarity \n","0 [-0.0019541378132998943, 0.007151313126087189,... 413 0.834495 \n","1 [0.020530683919787407, 0.0024345638230443, -0.... 20320 0.832132 \n","2 [0.003784010885283351, -0.0030924836173653603,... 15519 0.831633 \n","3 [0.007925753481686115, -0.0110504487529397, 0.... 14010 0.828185 \n","4 [0.023901859298348427, -0.031175222247838977, ... 8708 0.824124 \n","5 [0.016423320397734642, -0.0015560361789539456,... 23945 0.823863 \n","6 [-0.004515442531555891, 0.003739549545571208, ... 398 0.819942 \n","7 [0.011014971882104874, 0.00042272370774298906,... 18106 0.818475 \n","8 [0.002526090247556567, 0.02482965588569641, 0.... 15260 0.817608 \n","9 [0.016109377145767212, 0.022908871993422508, 0... 
74 0.812636 "]},"execution_count":33,"metadata":{},"output_type":"execute_result"}],"source":["# Convert the primary result table of the response into a pandas DataFrame\n","primary_table = RESPONSE.primary_results[0]\n","df = dataframe_from_result_table(primary_table)\n","df"]},{"cell_type":"code","execution_count":null,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":["# Embed a second search phrase\n","searchedEmbedding = embed(\n","    \"unfortunate events in history\"\n",")\n"]},{"cell_type":"code","execution_count":35,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[{"data":{"application/vnd.livy.statement-meta+json":{"execution_finish_time":"2023-05-10T09:48:43.6147167Z","execution_start_time":"2023-05-10T09:48:41.8371731Z","livy_statement_state":"available","parent_msg_id":"3dd276f3-a68e-44f7-8205-253d772aea18","queued_time":"2023-05-10T09:48:41.4581244Z","session_id":"7e5070d2-4560-4fb8-a3a8-6a594acd58ab","session_start_time":null,"spark_jobs":{"jobs":[],"limit":20,"numbers":{"FAILED":0,"RUNNING":0,"SUCCEEDED":0,"UNKNOWN":0},"rule":"ALL_DESC"},"spark_pool":null,"state":"finished","statement_id":52},"text/plain":["StatementMeta(, 7e5070d2-4560-4fb8-a3a8-6a594acd58ab, 52, Finished, Available)"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>url</th>\n"," <th>title</th>\n"," <th>text</th>\n"," <th>title_vector</th>\n"," <th>content_vector</th>\n"," <th>vector_id</th>\n"," <th>similarity</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>848</td>\n"," <td>https://simple.wikipedia.org/wiki/Tragedy</td>\n"," <td>Tragedy</td>\n"," <td>In 
theatre, a tragedy as defined by Aristotle ...</td>\n"," <td>[-0.019502468407154083, -0.010160734876990318,...</td>\n"," <td>[-0.012951433658599854, -0.018836138769984245,...</td>\n"," <td>410</td>\n"," <td>0.851848</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>4469</td>\n"," <td>https://simple.wikipedia.org/wiki/The%20Holocaust</td>\n"," <td>The Holocaust</td>\n"," <td>The Holocaust, sometimes called The Shoah (), ...</td>\n"," <td>[-0.030233195051550865, -0.024401605129241943,...</td>\n"," <td>[-0.016398731619119644, -0.013267949223518372,...</td>\n"," <td>1203</td>\n"," <td>0.847222</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>64216</td>\n"," <td>https://simple.wikipedia.org/wiki/List%20of%20...</td>\n"," <td>List of historical plagues</td>\n"," <td>This list contains famous or well documented o...</td>\n"," <td>[-0.010667890310287476, -0.0003575817099772393...</td>\n"," <td>[-0.010863155126571655, -0.0012196656316518784...</td>\n"," <td>16859</td>\n"," <td>0.844411</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>4397</td>\n"," <td>https://simple.wikipedia.org/wiki/List%20of%20...</td>\n"," <td>List of disasters</td>\n"," <td>This is a list of disasters, both natural and ...</td>\n"," <td>[-0.02713736332952976, -0.005278210621327162, ...</td>\n"," <td>[-0.023679986596107483, -0.006126823835074902,...</td>\n"," <td>1158</td>\n"," <td>0.843063</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>23073</td>\n"," <td>https://simple.wikipedia.org/wiki/Disaster</td>\n"," <td>Disaster</td>\n"," <td>A disaster is something very not good that hap...</td>\n"," <td>[-0.018235962837934497, -0.020034968852996823,...</td>\n"," <td>[-0.02504003793001175, 0.007415903266519308, 0...</td>\n"," <td>7251</td>\n"," <td>0.840334</td>\n"," </tr>\n"," <tr>\n"," <th>5</th>\n"," <td>4382</td>\n"," <td>https://simple.wikipedia.org/wiki/List%20of%20...</td>\n"," <td>List of terrorist incidents</td>\n"," <td>The following is a list by date of acts and fa...</td>\n"," 
<td>[-0.03989032283425331, -0.012808636762201786, ...</td>\n"," <td>[-0.045838188380002975, -0.01682935282588005, ...</td>\n"," <td>1149</td>\n"," <td>0.836162</td>\n"," </tr>\n"," <tr>\n"," <th>6</th>\n"," <td>13528</td>\n"," <td>https://simple.wikipedia.org/wiki/A%20Series%2...</td>\n"," <td>A Series of Unfortunate Events</td>\n"," <td>A Series of Unfortunate Events is a series of ...</td>\n"," <td>[0.0010618815431371331, -0.0267023965716362, -...</td>\n"," <td>[0.002801976166665554, -0.02904471382498741, -...</td>\n"," <td>4347</td>\n"," <td>0.835172</td>\n"," </tr>\n"," <tr>\n"," <th>7</th>\n"," <td>42874</td>\n"," <td>https://simple.wikipedia.org/wiki/History%20of...</td>\n"," <td>History of the world</td>\n"," <td>The history of the world (also called human hi...</td>\n"," <td>[0.0026915925554931164, -0.022206028923392296,...</td>\n"," <td>[0.013645033352077007, -0.005165994167327881, ...</td>\n"," <td>11672</td>\n"," <td>0.830243</td>\n"," </tr>\n"," <tr>\n"," <th>8</th>\n"," <td>4452</td>\n"," <td>https://simple.wikipedia.org/wiki/Accident</td>\n"," <td>Accident</td>\n"," <td>An accident is when something goes wrong when ...</td>\n"," <td>[-0.004075294826179743, -0.0059883203357458115...</td>\n"," <td>[0.00926120299845934, 0.013705797493457794, 0....</td>\n"," <td>1190</td>\n"," <td>0.826898</td>\n"," </tr>\n"," <tr>\n"," <th>9</th>\n"," <td>324</td>\n"," <td>https://simple.wikipedia.org/wiki/History</td>\n"," <td>History</td>\n"," <td>History is the study of past events. People kn...</td>\n"," <td>[0.006603690329939127, -0.011856242083013058, ...</td>\n"," <td>[0.0048830462619662285, 0.0032003086525946856,...</td>\n"," <td>170</td>\n"," <td>0.824645</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" id url \\\n","0 848 https://simple.wikipedia.org/wiki/Tragedy \n","1 4469 https://simple.wikipedia.org/wiki/The%20Holocaust \n","2 64216 https://simple.wikipedia.org/wiki/List%20of%20... 
\n","3 4397 https://simple.wikipedia.org/wiki/List%20of%20... \n","4 23073 https://simple.wikipedia.org/wiki/Disaster \n","5 4382 https://simple.wikipedia.org/wiki/List%20of%20... \n","6 13528 https://simple.wikipedia.org/wiki/A%20Series%2... \n","7 42874 https://simple.wikipedia.org/wiki/History%20of... \n","8 4452 https://simple.wikipedia.org/wiki/Accident \n","9 324 https://simple.wikipedia.org/wiki/History \n","\n"," title \\\n","0 Tragedy \n","1 The Holocaust \n","2 List of historical plagues \n","3 List of disasters \n","4 Disaster \n","5 List of terrorist incidents \n","6 A Series of Unfortunate Events \n","7 History of the world \n","8 Accident \n","9 History \n","\n"," text \\\n","0 In theatre, a tragedy as defined by Aristotle ... \n","1 The Holocaust, sometimes called The Shoah (), ... \n","2 This list contains famous or well documented o... \n","3 This is a list of disasters, both natural and ... \n","4 A disaster is something very not good that hap... \n","5 The following is a list by date of acts and fa... \n","6 A Series of Unfortunate Events is a series of ... \n","7 The history of the world (also called human hi... \n","8 An accident is when something goes wrong when ... \n","9 History is the study of past events. People kn... \n","\n"," title_vector \\\n","0 [-0.019502468407154083, -0.010160734876990318,... \n","1 [-0.030233195051550865, -0.024401605129241943,... \n","2 [-0.010667890310287476, -0.0003575817099772393... \n","3 [-0.02713736332952976, -0.005278210621327162, ... \n","4 [-0.018235962837934497, -0.020034968852996823,... \n","5 [-0.03989032283425331, -0.012808636762201786, ... \n","6 [0.0010618815431371331, -0.0267023965716362, -... \n","7 [0.0026915925554931164, -0.022206028923392296,... \n","8 [-0.004075294826179743, -0.0059883203357458115... \n","9 [0.006603690329939127, -0.011856242083013058, ... \n","\n"," content_vector vector_id similarity \n","0 [-0.012951433658599854, -0.018836138769984245,... 
410 0.851848 \n","1 [-0.016398731619119644, -0.013267949223518372,... 1203 0.847222 \n","2 [-0.010863155126571655, -0.0012196656316518784... 16859 0.844411 \n","3 [-0.023679986596107483, -0.006126823835074902,... 1158 0.843063 \n","4 [-0.02504003793001175, 0.007415903266519308, 0... 7251 0.840334 \n","5 [-0.045838188380002975, -0.01682935282588005, ... 1149 0.836162 \n","6 [0.002801976166665554, -0.02904471382498741, -... 4347 0.835172 \n","7 [0.013645033352077007, -0.005165994167327881, ... 11672 0.830243 \n","8 [0.00926120299845934, 0.013705797493457794, 0.... 1190 0.826898 \n","9 [0.0048830462619662285, 0.0032003086525946856,... 170 0.824645 "]},"execution_count":39,"metadata":{},"output_type":"execute_result"}],"source":["# Rank rows by cosine similarity between the query embedding and title_vector,\n","# keeping the 10 highest-scoring rows\n","KUSTO_QUERY = f\"Wiki | extend similarity = series_cosine_similarity_fl(dynamic({searchedEmbedding}), title_vector,1,1) | top 10 by similarity desc \"\n","RESPONSE = KUSTO_CLIENT.execute(KUSTO_DATABASE, KUSTO_QUERY)\n","\n","# Convert the primary result table of the response into a pandas DataFrame\n","primary_table = RESPONSE.primary_results[0]\n","df = dataframe_from_result_table(primary_table)\n","df"]}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Synapse PySpark","language":"Python","name":"synapse_pyspark"},"language_info":{"name":"python"},"notebook_environment":{},"save_output":true,"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{},"enableDebugMode":false}},"synapse_widget":{"state":{},"version":"0.1"},"trident":{"lakehouse":{"default_lakehouse":"7a5ec84c-e155-4d5f-a4e6-d49f4628d60b","default_lakehouse_name":"TempLH","default_lakehouse_workspace_id":"14a81bdb-d1c4-4808-9396-b12faab604c3","known_lakehouses":[{"id":"7a5ec84c-e155-4d5f-a4e6-d49f4628d60b"}]}},"widgets":{}},"nbformat":4,"nbformat_minor":0}