mirror of
https://github.com/openai/openai-cookbook
synced 2024-11-04 06:00:33 +00:00
Pushing update to remove data loading
This commit is contained in:
parent
befe771b2c
commit
3ad0e718cb
@ -58,12 +58,15 @@
|
||||
"# We'll need to install the clients for all vector databases\n",
|
||||
"!pip install pinecone-client\n",
|
||||
"!pip install weaviate-client\n",
|
||||
"!pip install qdrant-client"
|
||||
"!pip install qdrant-client\n",
|
||||
"\n",
|
||||
"# Install wget to pull the zip file\n",
|
||||
"!pip install wget"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"id": "5be94df6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -71,14 +74,12 @@
|
||||
"import openai\n",
|
||||
"\n",
|
||||
"import tiktoken\n",
|
||||
"from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
|
||||
"from typing import List, Iterator\n",
|
||||
"import concurrent\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"import numpy as np\n",
|
||||
"import os\n",
|
||||
"import wget\n",
|
||||
"from ast import literal_eval\n",
|
||||
"\n",
|
||||
"# Pinecone's client library for Python\n",
|
||||
"import pinecone\n",
|
||||
@ -106,287 +107,71 @@
|
||||
"source": [
|
||||
"## Load data\n",
|
||||
"\n",
|
||||
"In this section we'll source the data for this task, embed it and format it for insertion into a vector database"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "bd99e08e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Simple function to take in a list of text objects and return them as a list of embeddings\n",
|
||||
"def get_embeddings(input: List):\n",
|
||||
" response = openai.Embedding.create(\n",
|
||||
" input=input,\n",
|
||||
" model=EMBEDDING_MODEL,\n",
|
||||
" )[\"data\"]\n",
|
||||
" return [data[\"embedding\"] for data in response]\n",
|
||||
"\n",
|
||||
"def batchify(iterable, n=1):\n",
|
||||
" l = len(iterable)\n",
|
||||
" for ndx in range(0, l, n):\n",
|
||||
" yield iterable[ndx : min(ndx + n, l)]\n",
|
||||
"\n",
|
||||
"# Function for batching and parallel processing the embeddings\n",
|
||||
"def embed_corpus(\n",
|
||||
" corpus: List[str],\n",
|
||||
" batch_size=64,\n",
|
||||
" num_workers=8,\n",
|
||||
" max_context_len=8191,\n",
|
||||
"):\n",
|
||||
"\n",
|
||||
" # Encode the corpus, truncating to max_context_len\n",
|
||||
" encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
|
||||
" encoded_corpus = [\n",
|
||||
" encoded_article[:max_context_len] for encoded_article in encoding.encode_batch(corpus)\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # Calculate corpus statistics: the number of inputs, the total number of tokens, and the estimated cost to embed\n",
|
||||
" num_tokens = sum(len(article) for article in encoded_corpus)\n",
|
||||
" cost_to_embed_tokens = num_tokens / 1_000 * 0.0004\n",
|
||||
" print(\n",
|
||||
" f\"num_articles={len(encoded_corpus)}, num_tokens={num_tokens}, est_embedding_cost={cost_to_embed_tokens:.2f} USD\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Embed the corpus\n",
|
||||
" with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:\n",
|
||||
" \n",
|
||||
" futures = [\n",
|
||||
" executor.submit(get_embeddings, text_batch)\n",
|
||||
" for text_batch in batchify(encoded_corpus, batch_size)\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" with tqdm(total=len(encoded_corpus)) as pbar:\n",
|
||||
" for _ in concurrent.futures.as_completed(futures):\n",
|
||||
" pbar.update(batch_size)\n",
|
||||
"\n",
|
||||
" embeddings = []\n",
|
||||
" for future in futures:\n",
|
||||
" data = future.result()\n",
|
||||
" embeddings.extend(data)\n",
|
||||
"\n",
|
||||
" return embeddings"
|
||||
"In this section we'll load embedded data that we prepared prior to this session."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0c1c73cb",
|
||||
"id": "5dff8b55",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We'll use the datasets library to pull the Simple Wikipedia dataset for embedding\n",
|
||||
"dataset = list(load_dataset(\"wikipedia\", \"20220301.simple\")[\"train\"])\n",
|
||||
"# Limited to 25k articles for demo purposes\n",
|
||||
"dataset = dataset[:25_000] "
|
||||
"embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'\n",
|
||||
"\n",
|
||||
"# Warning, the file is pretty big so this will take some time\n",
|
||||
"wget.download(embeddings_url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "e6ee90ce",
|
||||
"execution_count": null,
|
||||
"id": "21097972",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"num_articles=25000, num_tokens=12896881, est_embedding_cost=5.16 USD\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"25024it [01:06, 377.31it/s] "
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 16.3 s, sys: 2.24 s, total: 18.5 s\n",
|
||||
"Wall time: 1min 8s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"# Embed the article text\n",
|
||||
"dataset_embeddings = embed_corpus([article[\"text\"] for article in dataset])"
|
||||
"import zipfile\n",
|
||||
"with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\"r\") as zip_ref:\n",
|
||||
" zip_ref.extractall(\"../data\")\n",
|
||||
" \n",
|
||||
"article_df = pd.read_csv('../data/vector_database_wikipedia_articles_embedded.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "850c7215",
|
||||
"execution_count": null,
|
||||
"id": "1721e45d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"num_articles=25000, num_tokens=88300, est_embedding_cost=0.04 USD\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"25024it [00:36, 683.22it/s] \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Embed the article titles separately\n",
|
||||
"title_embeddings = embed_corpus([article[\"title\"] for article in dataset])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 122,
|
||||
"id": "1410daaa",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>url</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>text</th>\n",
|
||||
" <th>title_vector</th>\n",
|
||||
" <th>content_vector</th>\n",
|
||||
" <th>vector_id</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>https://simple.wikipedia.org/wiki/April</td>\n",
|
||||
" <td>April</td>\n",
|
||||
" <td>April is the fourth month of the year in the J...</td>\n",
|
||||
" <td>[0.001009464613161981, -0.020700545981526375, ...</td>\n",
|
||||
" <td>[-0.011253940872848034, -0.013491976074874401,...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>https://simple.wikipedia.org/wiki/August</td>\n",
|
||||
" <td>August</td>\n",
|
||||
" <td>August (Aug.) is the eighth month of the year ...</td>\n",
|
||||
" <td>[0.0009286514250561595, 0.000820168002974242, ...</td>\n",
|
||||
" <td>[0.0003609954728744924, 0.007262262050062418, ...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>https://simple.wikipedia.org/wiki/Art</td>\n",
|
||||
" <td>Art</td>\n",
|
||||
" <td>Art is a creative activity that expresses imag...</td>\n",
|
||||
" <td>[0.003393713850528002, 0.0061537534929811954, ...</td>\n",
|
||||
" <td>[-0.004959689453244209, 0.015772193670272827, ...</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>https://simple.wikipedia.org/wiki/A</td>\n",
|
||||
" <td>A</td>\n",
|
||||
" <td>A or a is the first letter of the English alph...</td>\n",
|
||||
" <td>[0.0153952119871974, -0.013759135268628597, 0....</td>\n",
|
||||
" <td>[0.024894846603274345, -0.022186409682035446, ...</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>9</td>\n",
|
||||
" <td>https://simple.wikipedia.org/wiki/Air</td>\n",
|
||||
" <td>Air</td>\n",
|
||||
" <td>Air refers to the Earth's atmosphere. Air is a...</td>\n",
|
||||
" <td>[0.02224554680287838, -0.02044147066771984, -0...</td>\n",
|
||||
" <td>[0.021524671465158463, 0.018522677943110466, -...</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" id url title \\\n",
|
||||
"0 1 https://simple.wikipedia.org/wiki/April April \n",
|
||||
"1 2 https://simple.wikipedia.org/wiki/August August \n",
|
||||
"2 6 https://simple.wikipedia.org/wiki/Art Art \n",
|
||||
"3 8 https://simple.wikipedia.org/wiki/A A \n",
|
||||
"4 9 https://simple.wikipedia.org/wiki/Air Air \n",
|
||||
"\n",
|
||||
" text \\\n",
|
||||
"0 April is the fourth month of the year in the J... \n",
|
||||
"1 August (Aug.) is the eighth month of the year ... \n",
|
||||
"2 Art is a creative activity that expresses imag... \n",
|
||||
"3 A or a is the first letter of the English alph... \n",
|
||||
"4 Air refers to the Earth's atmosphere. Air is a... \n",
|
||||
"\n",
|
||||
" title_vector \\\n",
|
||||
"0 [0.001009464613161981, -0.020700545981526375, ... \n",
|
||||
"1 [0.0009286514250561595, 0.000820168002974242, ... \n",
|
||||
"2 [0.003393713850528002, 0.0061537534929811954, ... \n",
|
||||
"3 [0.0153952119871974, -0.013759135268628597, 0.... \n",
|
||||
"4 [0.02224554680287838, -0.02044147066771984, -0... \n",
|
||||
"\n",
|
||||
" content_vector vector_id \n",
|
||||
"0 [-0.011253940872848034, -0.013491976074874401,... 0 \n",
|
||||
"1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n",
|
||||
"2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n",
|
||||
"3 [0.024894846603274345, -0.022186409682035446, ... 3 \n",
|
||||
"4 [0.021524671465158463, 0.018522677943110466, -... 4 "
|
||||
]
|
||||
},
|
||||
"execution_count": 122,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# We will then store the result in another dataframe, and prep the data for insertion into a vector DB\n",
|
||||
"article_df = pd.DataFrame(dataset)\n",
|
||||
"article_df['title_vector'] = title_embeddings\n",
|
||||
"article_df['content_vector'] = dataset_embeddings\n",
|
||||
"article_df['vector_id'] = article_df.index\n",
|
||||
"article_df['vector_id'] = article_df['vector_id'].apply(str)\n",
|
||||
"article_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "960b82af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Read vectors from strings back into a list\n",
|
||||
"#article_df['title_vector'] = article_df.title_vector.apply(literal_eval)\n",
|
||||
"article_df['content_vector'] = article_df.content_vector.apply(literal_eval)\n",
|
||||
"\n",
|
||||
"# Set vector_id to be a string\n",
|
||||
"article_df['vector_id'] = article_df['vector_id'].apply(str)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a334ab8b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(article_df['title_vector'][0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed32fc87",
|
||||
@ -406,7 +191,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": null,
|
||||
"id": "92e6152a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -429,7 +214,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 108,
|
||||
"execution_count": null,
|
||||
"id": "0a71c575",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -461,21 +246,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 124,
|
||||
"execution_count": null,
|
||||
"id": "7ea9ad46",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['wikipedia-articles']"
|
||||
]
|
||||
},
|
||||
"execution_count": 124,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Pick a name for the new index\n",
|
||||
"index_name = 'wikipedia-articles'\n",
|
||||
@ -494,18 +268,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 126,
|
||||
"execution_count": null,
|
||||
"id": "5daeba00",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Uploading vectors to content namespace..\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upsert content vectors in content namespace - this can take a few minutes\n",
|
||||
"print(\"Uploading vectors to content namespace..\")\n",
|
||||
@ -515,18 +281,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 127,
|
||||
"execution_count": null,
|
||||
"id": "5fc1b083",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Uploading vectors to title namespace..\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upsert title vectors in title namespace - this can also take a few minutes\n",
|
||||
"print(\"Uploading vectors to title namespace..\")\n",
|
||||
@ -536,25 +294,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 128,
|
||||
"execution_count": null,
|
||||
"id": "f90c7fba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'dimension': 1536,\n",
|
||||
" 'index_fullness': 0.1,\n",
|
||||
" 'namespaces': {'content': {'vector_count': 25000},\n",
|
||||
" 'title': {'vector_count': 25000}},\n",
|
||||
" 'total_vector_count': 50000}"
|
||||
]
|
||||
},
|
||||
"execution_count": 128,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check index size for each namespace to confirm all of our docs have loaded\n",
|
||||
"index.describe_index_stats()"
|
||||
@ -584,7 +327,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"execution_count": null,
|
||||
"id": "3c8c2aa1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -681,7 +424,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 113,
|
||||
"execution_count": null,
|
||||
"id": "b9ea472d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -691,21 +434,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 114,
|
||||
"execution_count": null,
|
||||
"id": "13be220d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'classes': []}"
|
||||
]
|
||||
},
|
||||
"execution_count": 114,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client.schema.delete_all()\n",
|
||||
"client.schema.get()"
|
||||
@ -713,21 +445,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 115,
|
||||
"execution_count": null,
|
||||
"id": "73d33184",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 115,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"client.is_ready()"
|
||||
]
|
||||
@ -748,53 +469,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 116,
|
||||
"execution_count": null,
|
||||
"id": "e868d143",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'classes': [{'class': 'Article',\n",
|
||||
" 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},\n",
|
||||
" 'cleanupIntervalSeconds': 60,\n",
|
||||
" 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},\n",
|
||||
" 'properties': [{'dataType': ['text'],\n",
|
||||
" 'description': 'Title of the article',\n",
|
||||
" 'name': 'title',\n",
|
||||
" 'tokenization': 'word'},\n",
|
||||
" {'dataType': ['text'],\n",
|
||||
" 'description': 'Contents of the article',\n",
|
||||
" 'name': 'content',\n",
|
||||
" 'tokenization': 'word'}],\n",
|
||||
" 'shardingConfig': {'virtualPerPhysical': 128,\n",
|
||||
" 'desiredCount': 1,\n",
|
||||
" 'actualCount': 1,\n",
|
||||
" 'desiredVirtualCount': 128,\n",
|
||||
" 'actualVirtualCount': 128,\n",
|
||||
" 'key': '_id',\n",
|
||||
" 'strategy': 'hash',\n",
|
||||
" 'function': 'murmur3'},\n",
|
||||
" 'vectorIndexConfig': {'skip': False,\n",
|
||||
" 'cleanupIntervalSeconds': 300,\n",
|
||||
" 'maxConnections': 64,\n",
|
||||
" 'efConstruction': 128,\n",
|
||||
" 'ef': -1,\n",
|
||||
" 'dynamicEfMin': 100,\n",
|
||||
" 'dynamicEfMax': 500,\n",
|
||||
" 'dynamicEfFactor': 8,\n",
|
||||
" 'vectorCacheMaxObjects': 2000000,\n",
|
||||
" 'flatSearchCutoff': 40000,\n",
|
||||
" 'distance': 'cosine'},\n",
|
||||
" 'vectorIndexType': 'hnsw',\n",
|
||||
" 'vectorizer': 'none'}]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 116,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class_obj = {\n",
|
||||
" \"class\": \"Article\",\n",
|
||||
@ -820,18 +498,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 117,
|
||||
"execution_count": null,
|
||||
"id": "786d437f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Uploading vectors to article schema..\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Convert DF into a list of tuples\n",
|
||||
"data_objects = []\n",
|
||||
@ -861,49 +531,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 118,
|
||||
"execution_count": null,
|
||||
"id": "3658693c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Kim Jong-nam\n",
|
||||
"Kim Jong-nam (May 10, 1971 - February 13, 2017) was the eldest son of Kim Jong-il, the former leader of North Korea.\n",
|
||||
"\n",
|
||||
"He tried to enter Japan using a fake passport in May 2001. This was to visit Disneyland. This caused his father to not approve of him. Kim Jong-nam's younger half-brother Kim Jong-un was made the heir in September 2010.\n",
|
||||
"\n",
|
||||
"In June 2010, Kim Jong-nam gave a brief interview to the Associated Press in Macau. He told the reporter that he had \"no plans\" to defect to Europe. The press had recently said this. Kim Jong-nam lived in an apartment on the southern tip of Macau's Coloane Island until 2007. An anonymous South Korean official reported in October 2010 that Jong-nam had not lived in Macau for \"months\", and now goes between China and \"another country.\"\n",
|
||||
"\n",
|
||||
"When his father died, Kim Jong-nam did not attend the funeral. This was to avoid rumours on the succession.\n",
|
||||
"\n",
|
||||
"He was assassinated in Malaysia on February 13, 2017, which is believed to be ordered by his half-brother Kim Jong-un.\n",
|
||||
"\n",
|
||||
"Personal life\n",
|
||||
"The South Korean newspaper The Chosun Ilbo said that Kim Jong-nam has two wives, at least one mistress, and several children. His first wife Shin Jong-hui (born c. 1980) and their son Kum-sol (born c. 1996) live at a home called Dragon Villa on the northern outskirts of Beijing. His second wife Lee Hye-kyong (born c. 1970), their son Han-sol (born c. 1995) and their daughter Sol-hui (born c. 1998) live in an apartment building in Macau. Jong-nam's mistress, former Air Koryo flight attendant So Yong-la (born c. 1980), also lives in Macau. \n",
|
||||
"\n",
|
||||
"Jong-nam is often given attention by the media for his gambling and extravagant spending.\n",
|
||||
"\n",
|
||||
"References\n",
|
||||
"\n",
|
||||
"1971 births\n",
|
||||
"2017 deaths\n",
|
||||
"Assassinated people\n",
|
||||
"North Korean politicians\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'Aggregate': {'Article': [{'meta': {'count': 25000}}]}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 118,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test our insert has worked by checking one object\n",
|
||||
"print(client.data_object.get()['objects'][0]['properties']['title'])\n",
|
||||
@ -928,7 +559,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 119,
|
||||
"execution_count": null,
|
||||
"id": "5acd5437",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -954,37 +585,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 120,
|
||||
"execution_count": null,
|
||||
"id": "15def653",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1. Museum of Modern Art (Score: 0.938)\n",
|
||||
"2. Western Europe (Score: 0.934)\n",
|
||||
"3. Renaissance art (Score: 0.932)\n",
|
||||
"4. Pop art (Score: 0.93)\n",
|
||||
"5. Northern Europe (Score: 0.927)\n",
|
||||
"6. Hellenistic art (Score: 0.926)\n",
|
||||
"7. Modernist literature (Score: 0.924)\n",
|
||||
"8. Art film (Score: 0.922)\n",
|
||||
"9. Central Europe (Score: 0.921)\n",
|
||||
"10. Art (Score: 0.921)\n",
|
||||
"11. European (Score: 0.921)\n",
|
||||
"12. Byzantine art (Score: 0.92)\n",
|
||||
"13. Postmodernism (Score: 0.92)\n",
|
||||
"14. Eastern Europe (Score: 0.92)\n",
|
||||
"15. Cubism (Score: 0.92)\n",
|
||||
"16. Europe (Score: 0.919)\n",
|
||||
"17. Impressionism (Score: 0.919)\n",
|
||||
"18. Bauhaus (Score: 0.919)\n",
|
||||
"19. Surrealism (Score: 0.919)\n",
|
||||
"20. Expressionism (Score: 0.919)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_result = query_weaviate('modern art in Europe','Article')\n",
|
||||
"counter = 0\n",
|
||||
@ -995,37 +599,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 85,
|
||||
"execution_count": null,
|
||||
"id": "93c4a696",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1. Historic Scotland (Score: 0.946)\n",
|
||||
"2. First War of Scottish Independence (Score: 0.946)\n",
|
||||
"3. Battle of Bannockburn (Score: 0.946)\n",
|
||||
"4. Wars of Scottish Independence (Score: 0.944)\n",
|
||||
"5. Second War of Scottish Independence (Score: 0.939)\n",
|
||||
"6. List of Scottish monarchs (Score: 0.937)\n",
|
||||
"7. Scottish Borders (Score: 0.932)\n",
|
||||
"8. Braveheart (Score: 0.929)\n",
|
||||
"9. John of Scotland (Score: 0.929)\n",
|
||||
"10. Guardians of Scotland (Score: 0.926)\n",
|
||||
"11. Holyrood Abbey (Score: 0.925)\n",
|
||||
"12. Scottish (Score: 0.925)\n",
|
||||
"13. Scots (Score: 0.925)\n",
|
||||
"14. Robert I of Scotland (Score: 0.924)\n",
|
||||
"15. Scottish people (Score: 0.924)\n",
|
||||
"16. Alexander I of Scotland (Score: 0.924)\n",
|
||||
"17. Edinburgh Castle (Score: 0.924)\n",
|
||||
"18. Robert Burns (Score: 0.923)\n",
|
||||
"19. Battle of Bosworth Field (Score: 0.922)\n",
|
||||
"20. David II of Scotland (Score: 0.922)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_result = query_weaviate('Famous battles in Scottish history','Article')\n",
|
||||
"counter = 0\n",
|
||||
@ -1063,7 +640,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 99,
|
||||
"execution_count": null,
|
||||
"id": "76d697e9",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
@ -1078,7 +655,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 100,
|
||||
"execution_count": null,
|
||||
"id": "1deeb539",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
@ -1086,18 +663,7 @@
|
||||
"start_time": "2023-01-18T09:29:19.727897Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"CollectionsResponse(collections=[])"
|
||||
]
|
||||
},
|
||||
"execution_count": 100,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"qdrant.get_collections()"
|
||||
]
|
||||
@ -1116,7 +682,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 101,
|
||||
"execution_count": null,
|
||||
"id": "1a84ee1d",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
@ -1131,7 +697,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 102,
|
||||
"execution_count": null,
|
||||
"id": "00876f92",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
|
Loading…
Reference in New Issue
Block a user