Add docker-compose and cleanup

pull/131/head
Sam Partee 1 year ago
parent 36cf06fc0a
commit d0dfca2b95

@ -1245,7 +1245,8 @@
"Start a version of Redis with RediSearch (Redis Stack) by running the following docker command\n",
"\n",
"```bash\n",
"$ docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest\n",
"$ cd redis\n",
"$ docker compose up -d\n",
"```\n",
"This also includes the [RedisInsight](https://redis.com/redis-enterprise/redis-insight/) GUI for managing your Redis database which you can view at [http://localhost:8001](http://localhost:8001) once you start the docker container.\n",
"\n",
@ -1254,7 +1255,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 134,
"id": "d2ce669a",
"metadata": {},
"outputs": [
@ -1264,7 +1265,7 @@
"True"
]
},
"execution_count": 12,
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
@ -1311,7 +1312,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 135,
"id": "a7c64cb9",
"metadata": {},
"outputs": [],
@ -1326,7 +1327,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 136,
"id": "d95fcd06",
"metadata": {},
"outputs": [],
@ -1356,7 +1357,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 137,
"id": "7418480d",
"metadata": {},
"outputs": [],
@ -1386,7 +1387,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 138,
"id": "e98d63ad",
"metadata": {},
"outputs": [],
@ -1409,7 +1410,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 139,
"id": "098d3c5a",
"metadata": {},
"outputs": [
@ -1442,7 +1443,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 140,
"id": "508d1f89",
"metadata": {},
"outputs": [],
@ -1483,7 +1484,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 142,
"id": "1f0eef07",
"metadata": {},
"outputs": [
@ -1494,25 +1495,25 @@
"0. Museum of Modern Art (Score: 0.875)\n",
"1. Western Europe (Score: 0.867)\n",
"2. Renaissance art (Score: 0.864)\n",
"3. Pop art (Score: 0.861)\n",
"3. Pop art (Score: 0.86)\n",
"4. Northern Europe (Score: 0.855)\n",
"5. Hellenistic art (Score: 0.853)\n",
"6. Modernist literature (Score: 0.847)\n",
"7. Art film (Score: 0.843)\n",
"8. Central Europe (Score: 0.843)\n",
"9. Art (Score: 0.842)\n"
"9. European (Score: 0.841)\n"
]
}
],
"source": [
"# For using OpenAI to generate query embedding\n",
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
"openai.api_key = os.getenv(\"OPENAI_API_KEY\", \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\")\n",
"results = search_redis(redis_client, 'modern art in Europe', k=10)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 143,
"id": "7b805a81",
"metadata": {},
"outputs": [

@ -88,6 +88,15 @@ RediSearch has a distributed cluster version that scales to billions of document
See [RediSearch on Redis Enterprise](https://redis.com/modules/redisearch/) for more information.
### Examples
- [Product Search](https://github.com/RedisVentures/redis-product-search) - eCommerce product search (with image and text)
- [Product Recommendations with DocArray / Jina](https://github.com/jina-ai/product-recommendation-redis-docarray) - Content-based product recommendations example with Redis and DocArray.
- [Redis VSS in RecSys](https://github.com/RedisVentures/Redis-Recsys) - 3 end-to-end Redis & NVIDIA Merlin Recommendation System Architectures.
- [Azure OpenAI Embeddings Q&A](https://github.com/ruoccofabrizio/azure-open-ai-embeddings-qna) - OpenAI and Redis as a Q&A service on Azure.
- [ArXiv Paper Search](https://github.com/RedisVentures/redis-arXiv-search) - Semantic search over arXiv scholarly papers
### More Resources
For more information on how to use Redis as a vector database, check out the following resources:

@ -0,0 +1,22 @@
# Docker Compose definition for the single Redis Stack container used by the
# notebook; start it with `docker compose up -d` from this directory.
version: '3.7'
services:
vector-db:
# NOTE(review): `latest` is a floating tag; consider pinning a version so the
# example stays reproducible.
image: redis/redis-stack:latest
# Host ports: 6379 is the Redis port (see redis.conf and the healthcheck),
# 8001 serves the RedisInsight GUI referenced in the notebook.
ports:
- 6379:6379
- 8001:8001
environment:
# Passes CONCURRENT_WRITE_MODE to the RediSearch module.
# NOTE(review): presumably enables concurrent index writes; confirm against
# the RediSearch configuration docs.
- REDISEARCH_ARGS=CONCURRENT_WRITE_MODE
volumes:
# Named volume (declared at the bottom of this file) plus the local redis.conf.
# NOTE(review): verify that the redis-stack image actually persists data under
# /var/lib/redis and reads its config from /usr/local/etc/redis/redis.conf;
# if it uses other paths these mounts have no effect.
- vector-db:/var/lib/redis
- ./redis.conf:/usr/local/etc/redis/redis.conf
# Container is considered healthy once `redis-cli ping` succeeds on localhost:6379.
healthcheck:
test: ["CMD", "redis-cli", "-h", "localhost", "-p", "6379", "ping"]
interval: 2s
# NOTE(review): the timeout (1m30s) is far longer than the 2s interval;
# confirm this is intentional.
timeout: 1m30s
retries: 5
start_period: 5s
# Declaration of the named volume mounted by the vector-db service.
volumes:
vector-db:

@ -12,13 +12,13 @@
"\n",
"### What is Redis?\n",
"\n",
"Most developers from a web services background are probably familiar with Redis. At it's core, Redis is an open-source key-value store that can be used as a cache, message broker, and database. Developers choice Redis because it is fast, has a large ecosystem of client libraries, and has been deployed by major enterprises for years. \n",
"Most developers from a web services background are probably familiar with Redis. At its core, Redis is an open-source key-value store that can be used as a cache, message broker, and database. Developers choose Redis because it is fast, has a large ecosystem of client libraries, and has been deployed by major enterprises for years.\n",
"\n",
"In addition to the traditional uses of Redis, Redis also provides [Redis Modules](https://redis.io/modules) which are a way to extend Redis with new data types and commands. Example modules include [RedisJSON](https://redis.io/docs/stack/json/), [RedisTimeSeries](https://redis.io/docs/stack/timeseries/), [RedisBloom](https://redis.io/docs/stack/bloom/) and [RediSearch](https://redis.io/docs/stack/search/).\n",
"\n",
"### What is RediSearch?\n",
"\n",
"RediSearch is a [Redis module](https://redis.io/modules) that provides querying, secondary indexing, full-text search and vector search for Redis. To use RediSearch, you first declare indexes on your Redis data. You can then use the RediSearch query language to query that data. For more information on the feature set of RediSearch, see the [README](./README.md) or the [RediSearch documentation](https://redis.io/docs/stack/search/).\n",
"RediSearch is a [Redis module](https://redis.io/modules) that provides querying, secondary indexing, full-text search and vector search for Redis. To use RediSearch, you first declare indexes on your Redis data. You can then use the RediSearch clients to query that data. For more information on the feature set of RediSearch, see the [README](./README.md) or the [RediSearch documentation](https://redis.io/docs/stack/search/).\n",
"\n",
"### Deployment options\n",
"\n",
@ -52,7 +52,7 @@
"To keep this example simple, we will use the Redis Stack docker container which we can start as follows\n",
"\n",
"```bash\n",
"$ docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest\n",
"$ docker compose up -d\n",
"```\n",
"\n",
"This also includes the [RedisInsight](https://redis.com/redis-enterprise/redis-insight/) GUI for managing your Redis database which you can view at [http://localhost:8001](http://localhost:8001) once you start the docker container.\n",
@ -73,45 +73,10 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "2b04113f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: redis in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (4.3.4)\n",
"Requirement already satisfied: wget in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (3.2)\n",
"Requirement already satisfied: pandas in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (1.5.1)\n",
"Requirement already satisfied: openai in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (0.26.4)\n",
"Requirement already satisfied: packaging>=20.4 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from redis) (21.3)\n",
"Requirement already satisfied: async-timeout>=4.0.2 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from redis) (4.0.2)\n",
"Requirement already satisfied: deprecated>=1.2.3 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from redis) (1.2.13)\n",
"Requirement already satisfied: pytz>=2020.1 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from pandas) (2022.6)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: numpy>=1.20.3 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from pandas) (1.22.4)\n",
"Requirement already satisfied: requests>=2.20 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from openai) (2.28.2)\n",
"Requirement already satisfied: aiohttp in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from openai) (3.8.3)\n",
"Requirement already satisfied: tqdm in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from openai) (4.64.1)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from deprecated>=1.2.3->redis) (1.14.1)\n",
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from packaging>=20.4->redis) (3.0.9)\n",
"Requirement already satisfied: six>=1.5 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from requests>=2.20->openai) (2022.12.7)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from requests>=2.20->openai) (3.4)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from requests>=2.20->openai) (2.1.1)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from requests>=2.20->openai) (1.26.14)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from aiohttp->openai) (1.8.2)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from aiohttp->openai) (6.0.4)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from aiohttp->openai) (1.3.3)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from aiohttp->openai) (1.3.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /Users/sam.partee/.virtualenvs/redisvl2/lib/python3.8/site-packages (from aiohttp->openai) (22.1.0)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.2.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"outputs": [],
"source": [
"!pip install redis wget pandas openai"
]
@ -134,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 2,
"id": "88be138c",
"metadata": {},
"outputs": [
@ -153,7 +118,7 @@
"import openai\n",
"\n",
"# Note. alternatively you can set a temporary env variable like this:\n",
"# os.environ[\"OPENAI_API_KEY\"] = ''\n",
"# os.environ[\"OPENAI_API_KEY\"] = 'sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'\n",
"\n",
"if os.getenv(\"OPENAI_API_KEY\") is not None:\n",
" openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
@ -175,7 +140,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 3,
"id": "9fbebe0d",
"metadata": {},
"outputs": [
@ -185,55 +150,143 @@
"text": [
"File Downloaded\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>url</th>\n",
" <th>title</th>\n",
" <th>text</th>\n",
" <th>title_vector</th>\n",
" <th>content_vector</th>\n",
" <th>vector_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>https://simple.wikipedia.org/wiki/April</td>\n",
" <td>April</td>\n",
" <td>April is the fourth month of the year in the J...</td>\n",
" <td>[0.001009464613161981, -0.020700545981526375, ...</td>\n",
" <td>[-0.011253940872848034, -0.013491976074874401,...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>https://simple.wikipedia.org/wiki/August</td>\n",
" <td>August</td>\n",
" <td>August (Aug.) is the eighth month of the year ...</td>\n",
" <td>[0.0009286514250561595, 0.000820168002974242, ...</td>\n",
" <td>[0.0003609954728744924, 0.007262262050062418, ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>https://simple.wikipedia.org/wiki/Art</td>\n",
" <td>Art</td>\n",
" <td>Art is a creative activity that expresses imag...</td>\n",
" <td>[0.003393713850528002, 0.0061537534929811954, ...</td>\n",
" <td>[-0.004959689453244209, 0.015772193670272827, ...</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8</td>\n",
" <td>https://simple.wikipedia.org/wiki/A</td>\n",
" <td>A</td>\n",
" <td>A or a is the first letter of the English alph...</td>\n",
" <td>[0.0153952119871974, -0.013759135268628597, 0....</td>\n",
" <td>[0.024894846603274345, -0.022186409682035446, ...</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9</td>\n",
" <td>https://simple.wikipedia.org/wiki/Air</td>\n",
" <td>Air</td>\n",
" <td>Air refers to the Earth's atmosphere. Air is a...</td>\n",
" <td>[0.02224554680287838, -0.02044147066771984, -0...</td>\n",
" <td>[0.021524671465158463, 0.018522677943110466, -...</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id url title \\\n",
"0 1 https://simple.wikipedia.org/wiki/April April \n",
"1 2 https://simple.wikipedia.org/wiki/August August \n",
"2 6 https://simple.wikipedia.org/wiki/Art Art \n",
"3 8 https://simple.wikipedia.org/wiki/A A \n",
"4 9 https://simple.wikipedia.org/wiki/Air Air \n",
"\n",
" text \\\n",
"0 April is the fourth month of the year in the J... \n",
"1 August (Aug.) is the eighth month of the year ... \n",
"2 Art is a creative activity that expresses imag... \n",
"3 A or a is the first letter of the English alph... \n",
"4 Air refers to the Earth's atmosphere. Air is a... \n",
"\n",
" title_vector \\\n",
"0 [0.001009464613161981, -0.020700545981526375, ... \n",
"1 [0.0009286514250561595, 0.000820168002974242, ... \n",
"2 [0.003393713850528002, 0.0061537534929811954, ... \n",
"3 [0.0153952119871974, -0.013759135268628597, 0.... \n",
"4 [0.02224554680287838, -0.02044147066771984, -0... \n",
"\n",
" content_vector vector_id \n",
"0 [-0.011253940872848034, -0.013491976074874401,... 0 \n",
"1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n",
"2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n",
"3 [0.024894846603274345, -0.022186409682035446, ... 3 \n",
"4 [0.021524671465158463, 0.018522677943110466, -... 4 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import wget\n",
"import zipfile\n",
"import sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from typing import List\n",
"from ast import literal_eval\n",
"\n",
"\n",
"def get_data():\n",
"\n",
" file_name = '../../data/vector_database_wikipedia_articles_embedded.csv'\n",
" # check for file in this directory and the parent directory\n",
" if os.path.isfile(file_name):\n",
" print(\"File Downloaded\")\n",
" else:\n",
" # Download the data\n",
" data_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n",
" wget.download(data_url)\n",
"# use helper function in nbutils.py to download and read the data\n",
"# this should take from 5-10 min to run\n",
"if os.getcwd() not in sys.path:\n",
" sys.path.append(os.getcwd())\n",
"import nbutils\n",
"\n",
" # Unzip the data\n",
" with zipfile.ZipFile('vector_database_wikipedia_articles.zip', 'r') as zip_ref:\n",
" zip_ref.extractall('../data')\n",
"nbutils.download_wikipedia_data()\n",
"data = nbutils.read_wikipedia_data()\n",
"\n",
" # Remove the zip file\n",
" os.remove('vector_database_wikipedia_articles.zip')\n",
" print(\"File Downloaded\")\n",
"\n",
"get_data()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "0396150e",
"metadata": {},
"outputs": [],
"source": [
"# read in data\n",
"data = pd.read_csv('../../data/vector_database_wikipedia_articles_embedded.csv')\n",
"# Read vectors from strings back into a list\n",
"data['title_vector'] = data.title_vector.apply(literal_eval)\n",
"data['content_vector'] = data.content_vector.apply(literal_eval)\n",
"# Set vector_id to be a string\n",
"data['vector_id'] = data['vector_id'].apply(str)"
"data.head()"
]
},
{
@ -250,7 +303,7 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 4,
"id": "cc662c1b",
"metadata": {},
"outputs": [
@ -260,7 +313,7 @@
"True"
]
},
"execution_count": 57,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -307,7 +360,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 5,
"id": "f894b911",
"metadata": {
"scrolled": true
@ -324,7 +377,7 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": 6,
"id": "15db8380",
"metadata": {},
"outputs": [],
@ -354,7 +407,7 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": 7,
"id": "3658693c",
"metadata": {},
"outputs": [],
@ -384,7 +437,7 @@
},
{
"cell_type": "code",
"execution_count": 87,
"execution_count": 8,
"id": "0d791186",
"metadata": {},
"outputs": [],
@ -407,7 +460,7 @@
},
{
"cell_type": "code",
"execution_count": 88,
"execution_count": 9,
"id": "5bfaeafa",
"metadata": {},
"outputs": [
@ -437,7 +490,7 @@
},
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 10,
"id": "b044aa93",
"metadata": {},
"outputs": [],
@ -480,7 +533,7 @@
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 11,
"id": "7e2025f6",
"metadata": {},
"outputs": [
@ -489,15 +542,15 @@
"output_type": "stream",
"text": [
"0. Museum of Modern Art (Score: 0.875)\n",
"1. Western Europe (Score: 0.867)\n",
"1. Western Europe (Score: 0.868)\n",
"2. Renaissance art (Score: 0.864)\n",
"3. Pop art (Score: 0.861)\n",
"3. Pop art (Score: 0.86)\n",
"4. Northern Europe (Score: 0.855)\n",
"5. Hellenistic art (Score: 0.853)\n",
"6. Modernist literature (Score: 0.847)\n",
"7. Art film (Score: 0.843)\n",
"8. Central Europe (Score: 0.843)\n",
"9. Art (Score: 0.842)\n"
"9. European (Score: 0.841)\n"
]
}
],
@ -508,7 +561,7 @@
},
{
"cell_type": "code",
"execution_count": 92,
"execution_count": 12,
"id": "93c4a696",
"metadata": {},
"outputs": [
@ -546,7 +599,7 @@
},
{
"cell_type": "code",
"execution_count": 93,
"execution_count": 13,
"id": "6c25ee8d",
"metadata": {},
"outputs": [
@ -577,7 +630,7 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": 14,
"id": "2c0d11d8",
"metadata": {},
"outputs": [
@ -598,7 +651,7 @@
"'In Europe, after the Middle Ages, there was a \"Renaissance\" which means \"rebirth\". People rediscovered science and artists were allowed to paint subjects other than religious subjects. People like Michelangelo and Leonardo da Vinci still painted religious pictures, but they also now could paint mythological pictures too. These artists also invented perspective where things in the distance look smaller in the picture. This was new because in the Middle Ages people would paint all the figures close up and just overlapping each other. These artists used nudity regularly in their art.'"
]
},
"execution_count": 94,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -627,12 +680,14 @@
"\n",
"Up until now, we've been using the ``FLAT`` or \"brute-force\" index to run our queries. Redis also supports the ``HNSW`` index which is a fast, approximate index. The ``HNSW`` index is a graph-based index that uses a hierarchical navigable small world graph to store vectors. The ``HNSW`` index is a good choice for large datasets where you want to run approximate queries.\n",
"\n",
"``HNSW`` will take longer to build and consume more memory for most cases than ``FLAT`` but will be faster to run queries on, especially for large datasets.\n",
"\n",
"The following cells will show how to create an ``HNSW`` index and run queries with it using the same data as before."
]
},
{
"cell_type": "code",
"execution_count": 95,
"execution_count": 15,
"id": "865c30f3",
"metadata": {},
"outputs": [],
@ -659,7 +714,7 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": 16,
"id": "347e1e70",
"metadata": {},
"outputs": [],
@ -677,15 +732,18 @@
" fields = fields,\n",
" definition = IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH)\n",
" )\n",
" # since RediSearch creates the index in the background for existing documents, we will wait until\n",
" # indexing is complete before running our queries.\n",
" while redis_client.ft(HNSW_INDEX_NAME).info()[\"indexing\"] == \"1\":\n",
" time.sleep(5)"
"\n",
"# since RediSearch creates the index in the background for existing documents, we will wait until\n",
"# indexing is complete before running our queries. Although this is not necessary for the first query,\n",
"# some queries may take longer to run if the index is not fully built. In general, Redis will perform\n",
"# best when adding new documents to existing indices rather than new indices on existing documents.\n",
"while redis_client.ft(HNSW_INDEX_NAME).info()[\"indexing\"] == \"1\":\n",
" time.sleep(5)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 17,
"id": "8e474447",
"metadata": {},
"outputs": [
@ -693,16 +751,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"0. Museum of Modern Art (Score: 0.875)\n",
"1. Renaissance art (Score: 0.864)\n",
"2. Pop art (Score: 0.861)\n",
"3. Hellenistic art (Score: 0.853)\n",
"4. Modernist literature (Score: 0.847)\n",
"5. Art film (Score: 0.843)\n",
"6. Art (Score: 0.842)\n",
"7. Byzantine art (Score: 0.841)\n",
"8. Postmodernism (Score: 0.84)\n",
"9. Cubism (Score: 0.839)\n"
"0. Western Europe (Score: 0.868)\n",
"1. Northern Europe (Score: 0.855)\n",
"2. Central Europe (Score: 0.843)\n",
"3. European (Score: 0.841)\n",
"4. Eastern Europe (Score: 0.839)\n",
"5. Europe (Score: 0.839)\n",
"6. Western European Union (Score: 0.837)\n",
"7. Southern Europe (Score: 0.831)\n",
"8. Western civilization (Score: 0.83)\n",
"9. Council of Europe (Score: 0.827)\n"
]
}
],
@ -712,7 +770,7 @@
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": 18,
"id": "cb799e69",
"metadata": {},
"outputs": [
@ -722,29 +780,29 @@
"text": [
" ----- Flat Index ----- \n",
"0. Museum of Modern Art (Score: 0.875)\n",
"1. Western Europe (Score: 0.868)\n",
"1. Western Europe (Score: 0.867)\n",
"2. Renaissance art (Score: 0.864)\n",
"3. Pop art (Score: 0.86)\n",
"3. Pop art (Score: 0.861)\n",
"4. Northern Europe (Score: 0.855)\n",
"5. Hellenistic art (Score: 0.853)\n",
"6. Modernist literature (Score: 0.847)\n",
"7. Art film (Score: 0.843)\n",
"8. Central Europe (Score: 0.843)\n",
"9. European (Score: 0.841)\n",
"Flat index query time: 0.19 seconds\n",
"9. Art (Score: 0.842)\n",
"Flat index query time: 0.263 seconds\n",
"\n",
" ----- HNSW Index ------ \n",
"0. Museum of Modern Art (Score: 0.875)\n",
"1. Renaissance art (Score: 0.864)\n",
"2. Pop art (Score: 0.861)\n",
"3. Hellenistic art (Score: 0.853)\n",
"4. Modernist literature (Score: 0.847)\n",
"5. Art film (Score: 0.843)\n",
"6. Art (Score: 0.842)\n",
"7. Byzantine art (Score: 0.841)\n",
"8. Postmodernism (Score: 0.84)\n",
"9. Cubism (Score: 0.839)\n",
"HNSW index query time: 0.134 seconds\n",
"0. Western Europe (Score: 0.867)\n",
"1. Northern Europe (Score: 0.855)\n",
"2. Central Europe (Score: 0.843)\n",
"3. European (Score: 0.841)\n",
"4. Eastern Europe (Score: 0.839)\n",
"5. Europe (Score: 0.839)\n",
"6. Western European Union (Score: 0.837)\n",
"7. Southern Europe (Score: 0.831)\n",
"8. Western civilization (Score: 0.83)\n",
"9. Council of Europe (Score: 0.827)\n",
"HNSW index query time: 0.129 seconds\n",
" ------------------------ \n"
]
}
@ -774,7 +832,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "c07161ab",
"id": "69aa7a09",
"metadata": {},
"outputs": [],
"source": []

@ -0,0 +1,46 @@
import os
import wget
import zipfile
import numpy as np
import pandas as pd
from ast import literal_eval
def download_wikipedia_data(
    data_path: str = '../../data/',
    download_path: str = "./",
    file_name: str = "vector_database_wikipedia_articles_embedded") -> None:
    """Make sure the embedded Wikipedia articles CSV is present in ``data_path``.

    If ``<data_path>/<file_name>.csv`` already exists nothing is done. Otherwise
    the zip archive is downloaded into ``download_path`` (unless
    ``<download_path>/<file_name>.zip`` is already there), extracted into
    ``data_path`` and the archive is deleted.

    Args:
        data_path: Directory the CSV is extracted into / looked up in.
        download_path: Directory the zip archive is downloaded into.
        file_name: Base name (without extension) of both the zip and the CSV.
            The download URL is fixed, so a fresh download only lines up with
            the expected zip path when this matches the published archive name.

    Returns:
        None. Use ``read_wikipedia_data`` to load the CSV.

    Raises:
        FileNotFoundError: If the expected zip archive is missing after the
            download step.
        zipfile.BadZipFile: If the archive is corrupt.
    """
    data_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'

    csv_file_path = os.path.join(data_path, file_name + ".csv")
    zip_file_path = os.path.join(download_path, file_name + ".zip")

    # Nothing to do when the extracted CSV is already in place.
    if os.path.isfile(csv_file_path):
        print("File Downloaded")
        return

    if os.path.isfile(zip_file_path):
        print("Zip downloaded but not unzipped, unzipping now...")
    else:
        print("File not found, downloading now...")
        # Download the data. wget's ``bar`` argument is a progress *callback*,
        # not a flag, so we rely on its default progress bar instead of
        # passing ``True``.
        wget.download(data_url, out=download_path)

    # Unzip the data
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(data_path)

    # Remove the archive that was actually used, wherever it was downloaded to
    # (previously a hard-coded name relative to the working directory).
    os.remove(zip_file_path)
    print(f"File downloaded to {data_path}")
def read_wikipedia_data(data_path: str = '../../data/', file_name: str = "vector_database_wikipedia_articles_embedded") -> pd.DataFrame:
    """Load the embedded Wikipedia articles CSV into a DataFrame.

    Reads ``<data_path>/<file_name>.csv``, parses the ``title_vector`` and
    ``content_vector`` columns from their string form back into Python
    objects with ``literal_eval``, and converts ``vector_id`` to strings.

    Args:
        data_path: Directory containing the CSV file.
        file_name: Base name of the CSV file, without the ``.csv`` extension.

    Returns:
        The loaded DataFrame with the three columns converted as described.
    """
    frame = pd.read_csv(os.path.join(data_path, file_name + ".csv"))

    # The embeddings are stored as stringified lists; turn them back into lists.
    for vector_column in ('title_vector', 'content_vector'):
        frame[vector_column] = frame[vector_column].apply(literal_eval)

    # vector_id is handled as a string downstream.
    frame['vector_id'] = frame['vector_id'].map(str)
    return frame

@ -0,0 +1,5 @@
# Redis settings for the local demo container; docker-compose mounts this file
# into the vector-db service.
# Same port that docker-compose publishes and health-checks.
port 6379
# Persistence is switched off: no append-only file and an empty save schedule.
# NOTE(review): with both disabled the loaded data presumably does not survive
# a container restart; confirm this is intended.
appendonly no
save ""
# NOTE(review): protected mode is disabled and no password is set in this file;
# this looks suitable for local development only. Confirm before exposing the port.
protected-mode no
# Number of I/O threads.
io-threads 2
Loading…
Cancel
Save