Pushing update to remove data loading

pull/56/head
colin-openai 1 year ago
parent befe771b2c
commit 3ad0e718cb

@ -58,12 +58,15 @@
"# We'll need to install the clients for all vector databases\n",
"!pip install pinecone-client\n",
"!pip install weaviate-client\n",
"!pip install qdrant-client"
"!pip install qdrant-client\n",
"\n",
"# Install wget to download the zip file\n",
"!pip install wget"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "5be94df6",
"metadata": {},
"outputs": [],
@ -71,14 +74,12 @@
"import openai\n",
"\n",
"import tiktoken\n",
"from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
"from typing import List, Iterator\n",
"import concurrent\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"from datasets import load_dataset\n",
"import numpy as np\n",
"import os\n",
"import wget\n",
"from ast import literal_eval\n",
"\n",
"# Pinecone's client library for Python\n",
"import pinecone\n",
@ -106,285 +107,69 @@
"source": [
"## Load data\n",
"\n",
"In this section we'll source the data for this task, embed it and format it for insertion into a vector database"
"In this section we'll load embedded data that we prepared ahead of this session."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bd99e08e",
"execution_count": null,
"id": "5dff8b55",
"metadata": {},
"outputs": [],
"source": [
"# Simple function to take in a list of text objects and return them as a list of embeddings\n",
"def get_embeddings(input: List):\n",
" response = openai.Embedding.create(\n",
" input=input,\n",
" model=EMBEDDING_MODEL,\n",
" )[\"data\"]\n",
" return [data[\"embedding\"] for data in response]\n",
"\n",
"def batchify(iterable, n=1):\n",
" l = len(iterable)\n",
" for ndx in range(0, l, n):\n",
" yield iterable[ndx : min(ndx + n, l)]\n",
"\n",
"# Function for batching and parallel processing the embeddings\n",
"def embed_corpus(\n",
" corpus: List[str],\n",
" batch_size=64,\n",
" num_workers=8,\n",
" max_context_len=8191,\n",
"):\n",
"\n",
" # Encode the corpus, truncating to max_context_len\n",
" encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
" encoded_corpus = [\n",
" encoded_article[:max_context_len] for encoded_article in encoding.encode_batch(corpus)\n",
" ]\n",
"\n",
" # Calculate corpus statistics: the number of inputs, the total number of tokens, and the estimated cost to embed\n",
" num_tokens = sum(len(article) for article in encoded_corpus)\n",
" cost_to_embed_tokens = num_tokens / 1_000 * 0.0004\n",
" print(\n",
" f\"num_articles={len(encoded_corpus)}, num_tokens={num_tokens}, est_embedding_cost={cost_to_embed_tokens:.2f} USD\"\n",
" )\n",
"embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n",
"\n",
" # Embed the corpus\n",
" with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:\n",
" \n",
" futures = [\n",
" executor.submit(get_embeddings, text_batch)\n",
" for text_batch in batchify(encoded_corpus, batch_size)\n",
" ]\n",
"\n",
" with tqdm(total=len(encoded_corpus)) as pbar:\n",
" for _ in concurrent.futures.as_completed(futures):\n",
" pbar.update(batch_size)\n",
"\n",
" embeddings = []\n",
" for future in futures:\n",
" data = future.result()\n",
" embeddings.extend(data)\n",
"\n",
" return embeddings"
"# Warning: the file is large, so the download will take some time\n",
"wget.download(embeddings_url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c1c73cb",
"id": "21097972",
"metadata": {},
"outputs": [],
"source": [
"# We'll use the datasets library to pull the Simple Wikipedia dataset for embedding\n",
"dataset = list(load_dataset(\"wikipedia\", \"20220301.simple\")[\"train\"])\n",
"# Limited to 25k articles for demo purposes\n",
"dataset = dataset[:25_000] "
"import zipfile\n",
"with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\"r\") as zip_ref:\n",
" zip_ref.extractall(\"../data\")\n",
" \n",
"article_df = pd.read_csv('../data/vector_database_wikipedia_articles_embedded.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e6ee90ce",
"execution_count": null,
"id": "1721e45d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"num_articles=25000, num_tokens=12896881, est_embedding_cost=5.16 USD\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"25024it [01:06, 377.31it/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 16.3 s, sys: 2.24 s, total: 18.5 s\n",
"Wall time: 1min 8s\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"%%time\n",
"# Embed the article text\n",
"dataset_embeddings = embed_corpus([article[\"text\"] for article in dataset])"
"article_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "850c7215",
"execution_count": null,
"id": "960b82af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"num_articles=25000, num_tokens=88300, est_embedding_cost=0.04 USD\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"25024it [00:36, 683.22it/s] \n"
]
}
],
"outputs": [],
"source": [
"# Embed the article titles separately\n",
"title_embeddings = embed_corpus([article[\"title\"] for article in dataset])"
"# The CSV stores each embedding as a stringified list, so parse both vector\n",
"# columns back into real Python lists. title_vector must be parsed too: it is\n",
"# used for the vector-length check and the title-namespace upsert later on.\n",
"article_df['title_vector'] = article_df.title_vector.apply(literal_eval)\n",
"article_df['content_vector'] = article_df.content_vector.apply(literal_eval)\n",
"\n",
"# Set vector_id to be a string\n",
"article_df['vector_id'] = article_df['vector_id'].apply(str)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"id": "1410daaa",
"execution_count": null,
"id": "a334ab8b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>url</th>\n",
" <th>title</th>\n",
" <th>text</th>\n",
" <th>title_vector</th>\n",
" <th>content_vector</th>\n",
" <th>vector_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>https://simple.wikipedia.org/wiki/April</td>\n",
" <td>April</td>\n",
" <td>April is the fourth month of the year in the J...</td>\n",
" <td>[0.001009464613161981, -0.020700545981526375, ...</td>\n",
" <td>[-0.011253940872848034, -0.013491976074874401,...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>https://simple.wikipedia.org/wiki/August</td>\n",
" <td>August</td>\n",
" <td>August (Aug.) is the eighth month of the year ...</td>\n",
" <td>[0.0009286514250561595, 0.000820168002974242, ...</td>\n",
" <td>[0.0003609954728744924, 0.007262262050062418, ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>https://simple.wikipedia.org/wiki/Art</td>\n",
" <td>Art</td>\n",
" <td>Art is a creative activity that expresses imag...</td>\n",
" <td>[0.003393713850528002, 0.0061537534929811954, ...</td>\n",
" <td>[-0.004959689453244209, 0.015772193670272827, ...</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8</td>\n",
" <td>https://simple.wikipedia.org/wiki/A</td>\n",
" <td>A</td>\n",
" <td>A or a is the first letter of the English alph...</td>\n",
" <td>[0.0153952119871974, -0.013759135268628597, 0....</td>\n",
" <td>[0.024894846603274345, -0.022186409682035446, ...</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9</td>\n",
" <td>https://simple.wikipedia.org/wiki/Air</td>\n",
" <td>Air</td>\n",
" <td>Air refers to the Earth's atmosphere. Air is a...</td>\n",
" <td>[0.02224554680287838, -0.02044147066771984, -0...</td>\n",
" <td>[0.021524671465158463, 0.018522677943110466, -...</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id url title \\\n",
"0 1 https://simple.wikipedia.org/wiki/April April \n",
"1 2 https://simple.wikipedia.org/wiki/August August \n",
"2 6 https://simple.wikipedia.org/wiki/Art Art \n",
"3 8 https://simple.wikipedia.org/wiki/A A \n",
"4 9 https://simple.wikipedia.org/wiki/Air Air \n",
"\n",
" text \\\n",
"0 April is the fourth month of the year in the J... \n",
"1 August (Aug.) is the eighth month of the year ... \n",
"2 Art is a creative activity that expresses imag... \n",
"3 A or a is the first letter of the English alph... \n",
"4 Air refers to the Earth's atmosphere. Air is a... \n",
"\n",
" title_vector \\\n",
"0 [0.001009464613161981, -0.020700545981526375, ... \n",
"1 [0.0009286514250561595, 0.000820168002974242, ... \n",
"2 [0.003393713850528002, 0.0061537534929811954, ... \n",
"3 [0.0153952119871974, -0.013759135268628597, 0.... \n",
"4 [0.02224554680287838, -0.02044147066771984, -0... \n",
"\n",
" content_vector vector_id \n",
"0 [-0.011253940872848034, -0.013491976074874401,... 0 \n",
"1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n",
"2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n",
"3 [0.024894846603274345, -0.022186409682035446, ... 3 \n",
"4 [0.021524671465158463, 0.018522677943110466, -... 4 "
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# We will then store the result in another dataframe, and prep the data for insertion into a vector DB\n",
"article_df = pd.DataFrame(dataset)\n",
"article_df['title_vector'] = title_embeddings\n",
"article_df['content_vector'] = dataset_embeddings\n",
"article_df['vector_id'] = article_df.index\n",
"article_df['vector_id'] = article_df['vector_id'].apply(str)\n",
"article_df.head()"
"len(article_df['title_vector'][0])"
]
},
{
@ -406,7 +191,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "92e6152a",
"metadata": {},
"outputs": [],
@ -429,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": 108,
"execution_count": null,
"id": "0a71c575",
"metadata": {},
"outputs": [],
@ -461,21 +246,10 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": null,
"id": "7ea9ad46",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['wikipedia-articles']"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Pick a name for the new index\n",
"index_name = 'wikipedia-articles'\n",
@ -494,18 +268,10 @@
},
{
"cell_type": "code",
"execution_count": 126,
"execution_count": null,
"id": "5daeba00",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Uploading vectors to content namespace..\n"
]
}
],
"outputs": [],
"source": [
"# Upsert content vectors in content namespace - this can take a few minutes\n",
"print(\"Uploading vectors to content namespace..\")\n",
@ -515,18 +281,10 @@
},
{
"cell_type": "code",
"execution_count": 127,
"execution_count": null,
"id": "5fc1b083",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Uploading vectors to title namespace..\n"
]
}
],
"outputs": [],
"source": [
"# Upsert title vectors in title namespace - this can also take a few minutes\n",
"print(\"Uploading vectors to title namespace..\")\n",
@ -536,25 +294,10 @@
},
{
"cell_type": "code",
"execution_count": 128,
"execution_count": null,
"id": "f90c7fba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'dimension': 1536,\n",
" 'index_fullness': 0.1,\n",
" 'namespaces': {'content': {'vector_count': 25000},\n",
" 'title': {'vector_count': 25000}},\n",
" 'total_vector_count': 50000}"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Check index size for each namespace to confirm all of our docs have loaded\n",
"index.describe_index_stats()"
@ -584,7 +327,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": null,
"id": "3c8c2aa1",
"metadata": {},
"outputs": [],
@ -681,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": 113,
"execution_count": null,
"id": "b9ea472d",
"metadata": {},
"outputs": [],
@ -691,21 +434,10 @@
},
{
"cell_type": "code",
"execution_count": 114,
"execution_count": null,
"id": "13be220d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'classes': []}"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"client.schema.delete_all()\n",
"client.schema.get()"
@ -713,21 +445,10 @@
},
{
"cell_type": "code",
"execution_count": 115,
"execution_count": null,
"id": "73d33184",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"client.is_ready()"
]
@ -748,53 +469,10 @@
},
{
"cell_type": "code",
"execution_count": 116,
"execution_count": null,
"id": "e868d143",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'classes': [{'class': 'Article',\n",
" 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},\n",
" 'cleanupIntervalSeconds': 60,\n",
" 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},\n",
" 'properties': [{'dataType': ['text'],\n",
" 'description': 'Title of the article',\n",
" 'name': 'title',\n",
" 'tokenization': 'word'},\n",
" {'dataType': ['text'],\n",
" 'description': 'Contents of the article',\n",
" 'name': 'content',\n",
" 'tokenization': 'word'}],\n",
" 'shardingConfig': {'virtualPerPhysical': 128,\n",
" 'desiredCount': 1,\n",
" 'actualCount': 1,\n",
" 'desiredVirtualCount': 128,\n",
" 'actualVirtualCount': 128,\n",
" 'key': '_id',\n",
" 'strategy': 'hash',\n",
" 'function': 'murmur3'},\n",
" 'vectorIndexConfig': {'skip': False,\n",
" 'cleanupIntervalSeconds': 300,\n",
" 'maxConnections': 64,\n",
" 'efConstruction': 128,\n",
" 'ef': -1,\n",
" 'dynamicEfMin': 100,\n",
" 'dynamicEfMax': 500,\n",
" 'dynamicEfFactor': 8,\n",
" 'vectorCacheMaxObjects': 2000000,\n",
" 'flatSearchCutoff': 40000,\n",
" 'distance': 'cosine'},\n",
" 'vectorIndexType': 'hnsw',\n",
" 'vectorizer': 'none'}]}"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"class_obj = {\n",
" \"class\": \"Article\",\n",
@ -820,18 +498,10 @@
},
{
"cell_type": "code",
"execution_count": 117,
"execution_count": null,
"id": "786d437f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Uploading vectors to article schema..\n"
]
}
],
"outputs": [],
"source": [
"# Convert DF into a list of tuples\n",
"data_objects = []\n",
@ -861,49 +531,10 @@
},
{
"cell_type": "code",
"execution_count": 118,
"execution_count": null,
"id": "3658693c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Kim Jong-nam\n",
"Kim Jong-nam (May 10, 1971 - February 13, 2017) was the eldest son of Kim Jong-il, the former leader of North Korea.\n",
"\n",
"He tried to enter Japan using a fake passport in May 2001. This was to visit Disneyland. This caused his father to not approve of him. Kim Jong-nam's younger half-brother Kim Jong-un was made the heir in September 2010.\n",
"\n",
"In June 2010, Kim Jong-nam gave a brief interview to the Associated Press in Macau. He told the reporter that he had \"no plans\" to defect to Europe. The press had recently said this. Kim Jong-nam lived in an apartment on the southern tip of Macau's Coloane Island until 2007. An anonymous South Korean official reported in October 2010 that Jong-nam had not lived in Macau for \"months\", and now goes between China and \"another country.\"\n",
"\n",
"When his father died, Kim Jong-nam did not attend the funeral. This was to avoid rumours on the succession.\n",
"\n",
"He was assassinated in Malaysia on February 13, 2017, which is believed to be ordered by his half-brother Kim Jong-un.\n",
"\n",
"Personal life\n",
"The South Korean newspaper The Chosun Ilbo said that Kim Jong-nam has two wives, at least one mistress, and several children. His first wife Shin Jong-hui (born c. 1980) and their son Kum-sol (born c. 1996) live at a home called Dragon Villa on the northern outskirts of Beijing. His second wife Lee Hye-kyong (born c. 1970), their son Han-sol (born c. 1995) and their daughter Sol-hui (born c. 1998) live in an apartment building in Macau. Jong-nam's mistress, former Air Koryo flight attendant So Yong-la (born c. 1980), also lives in Macau. \n",
"\n",
"Jong-nam is often given attention by the media for his gambling and extravagant spending.\n",
"\n",
"References\n",
"\n",
"1971 births\n",
"2017 deaths\n",
"Assassinated people\n",
"North Korean politicians\n"
]
},
{
"data": {
"text/plain": [
"{'Aggregate': {'Article': [{'meta': {'count': 25000}}]}}"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Test our insert has worked by checking one object\n",
"print(client.data_object.get()['objects'][0]['properties']['title'])\n",
@ -928,7 +559,7 @@
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": null,
"id": "5acd5437",
"metadata": {},
"outputs": [],
@ -954,37 +585,10 @@
},
{
"cell_type": "code",
"execution_count": 120,
"execution_count": null,
"id": "15def653",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1. Museum of Modern Art (Score: 0.938)\n",
"2. Western Europe (Score: 0.934)\n",
"3. Renaissance art (Score: 0.932)\n",
"4. Pop art (Score: 0.93)\n",
"5. Northern Europe (Score: 0.927)\n",
"6. Hellenistic art (Score: 0.926)\n",
"7. Modernist literature (Score: 0.924)\n",
"8. Art film (Score: 0.922)\n",
"9. Central Europe (Score: 0.921)\n",
"10. Art (Score: 0.921)\n",
"11. European (Score: 0.921)\n",
"12. Byzantine art (Score: 0.92)\n",
"13. Postmodernism (Score: 0.92)\n",
"14. Eastern Europe (Score: 0.92)\n",
"15. Cubism (Score: 0.92)\n",
"16. Europe (Score: 0.919)\n",
"17. Impressionism (Score: 0.919)\n",
"18. Bauhaus (Score: 0.919)\n",
"19. Surrealism (Score: 0.919)\n",
"20. Expressionism (Score: 0.919)\n"
]
}
],
"outputs": [],
"source": [
"query_result = query_weaviate('modern art in Europe','Article')\n",
"counter = 0\n",
@ -995,37 +599,10 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": null,
"id": "93c4a696",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1. Historic Scotland (Score: 0.946)\n",
"2. First War of Scottish Independence (Score: 0.946)\n",
"3. Battle of Bannockburn (Score: 0.946)\n",
"4. Wars of Scottish Independence (Score: 0.944)\n",
"5. Second War of Scottish Independence (Score: 0.939)\n",
"6. List of Scottish monarchs (Score: 0.937)\n",
"7. Scottish Borders (Score: 0.932)\n",
"8. Braveheart (Score: 0.929)\n",
"9. John of Scotland (Score: 0.929)\n",
"10. Guardians of Scotland (Score: 0.926)\n",
"11. Holyrood Abbey (Score: 0.925)\n",
"12. Scottish (Score: 0.925)\n",
"13. Scots (Score: 0.925)\n",
"14. Robert I of Scotland (Score: 0.924)\n",
"15. Scottish people (Score: 0.924)\n",
"16. Alexander I of Scotland (Score: 0.924)\n",
"17. Edinburgh Castle (Score: 0.924)\n",
"18. Robert Burns (Score: 0.923)\n",
"19. Battle of Bosworth Field (Score: 0.922)\n",
"20. David II of Scotland (Score: 0.922)\n"
]
}
],
"outputs": [],
"source": [
"query_result = query_weaviate('Famous battles in Scottish history','Article')\n",
"counter = 0\n",
@ -1063,7 +640,7 @@
},
{
"cell_type": "code",
"execution_count": 99,
"execution_count": null,
"id": "76d697e9",
"metadata": {
"ExecuteTime": {
@ -1078,7 +655,7 @@
},
{
"cell_type": "code",
"execution_count": 100,
"execution_count": null,
"id": "1deeb539",
"metadata": {
"ExecuteTime": {
@ -1086,18 +663,7 @@
"start_time": "2023-01-18T09:29:19.727897Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"CollectionsResponse(collections=[])"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"qdrant.get_collections()"
]
@ -1116,7 +682,7 @@
},
{
"cell_type": "code",
"execution_count": 101,
"execution_count": null,
"id": "1a84ee1d",
"metadata": {
"ExecuteTime": {
@ -1131,7 +697,7 @@
},
{
"cell_type": "code",
"execution_count": 102,
"execution_count": null,
"id": "00876f92",
"metadata": {
"ExecuteTime": {

Loading…
Cancel
Save