Merge pull request #121 from openai/colin

Fixed commented out vector
pull/125/head
colin-openai 1 year ago committed by GitHub
commit c31fe72f1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -66,10 +66,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "5be94df6",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<frozen importlib._bootstrap>:914: ImportWarning: _SixMetaPathImporter.find_spec() not found; falling back to find_module()\n"
]
}
],
"source": [
"import openai\n",
"\n",
@ -132,30 +140,158 @@
"source": [
"import zipfile\n",
"with zipfile.ZipFile(\"vector_database_wikipedia_articles_embedded.zip\",\"r\") as zip_ref:\n",
" zip_ref.extractall(\"../data\")\n",
" \n",
" zip_ref.extractall(\"../data\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "70bbd8ba",
"metadata": {},
"outputs": [],
"source": [
"article_df = pd.read_csv('../data/vector_database_wikipedia_articles_embedded.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "1721e45d",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>url</th>\n",
" <th>title</th>\n",
" <th>text</th>\n",
" <th>title_vector</th>\n",
" <th>content_vector</th>\n",
" <th>vector_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>https://simple.wikipedia.org/wiki/April</td>\n",
" <td>April</td>\n",
" <td>April is the fourth month of the year in the J...</td>\n",
" <td>[0.001009464613161981, -0.020700545981526375, ...</td>\n",
" <td>[-0.011253940872848034, -0.013491976074874401,...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>https://simple.wikipedia.org/wiki/August</td>\n",
" <td>August</td>\n",
" <td>August (Aug.) is the eighth month of the year ...</td>\n",
" <td>[0.0009286514250561595, 0.000820168002974242, ...</td>\n",
" <td>[0.0003609954728744924, 0.007262262050062418, ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>https://simple.wikipedia.org/wiki/Art</td>\n",
" <td>Art</td>\n",
" <td>Art is a creative activity that expresses imag...</td>\n",
" <td>[0.003393713850528002, 0.0061537534929811954, ...</td>\n",
" <td>[-0.004959689453244209, 0.015772193670272827, ...</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8</td>\n",
" <td>https://simple.wikipedia.org/wiki/A</td>\n",
" <td>A</td>\n",
" <td>A or a is the first letter of the English alph...</td>\n",
" <td>[0.0153952119871974, -0.013759135268628597, 0....</td>\n",
" <td>[0.024894846603274345, -0.022186409682035446, ...</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9</td>\n",
" <td>https://simple.wikipedia.org/wiki/Air</td>\n",
" <td>Air</td>\n",
" <td>Air refers to the Earth's atmosphere. Air is a...</td>\n",
" <td>[0.02224554680287838, -0.02044147066771984, -0...</td>\n",
" <td>[0.021524671465158463, 0.018522677943110466, -...</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id url title \\\n",
"0 1 https://simple.wikipedia.org/wiki/April April \n",
"1 2 https://simple.wikipedia.org/wiki/August August \n",
"2 6 https://simple.wikipedia.org/wiki/Art Art \n",
"3 8 https://simple.wikipedia.org/wiki/A A \n",
"4 9 https://simple.wikipedia.org/wiki/Air Air \n",
"\n",
" text \\\n",
"0 April is the fourth month of the year in the J... \n",
"1 August (Aug.) is the eighth month of the year ... \n",
"2 Art is a creative activity that expresses imag... \n",
"3 A or a is the first letter of the English alph... \n",
"4 Air refers to the Earth's atmosphere. Air is a... \n",
"\n",
" title_vector \\\n",
"0 [0.001009464613161981, -0.020700545981526375, ... \n",
"1 [0.0009286514250561595, 0.000820168002974242, ... \n",
"2 [0.003393713850528002, 0.0061537534929811954, ... \n",
"3 [0.0153952119871974, -0.013759135268628597, 0.... \n",
"4 [0.02224554680287838, -0.02044147066771984, -0... \n",
"\n",
" content_vector vector_id \n",
"0 [-0.011253940872848034, -0.013491976074874401,... 0 \n",
"1 [0.0003609954728744924, 0.007262262050062418, ... 1 \n",
"2 [-0.004959689453244209, 0.015772193670272827, ... 2 \n",
"3 [0.024894846603274345, -0.022186409682035446, ... 3 \n",
"4 [0.021524671465158463, 0.018522677943110466, -... 4 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"article_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "960b82af",
"metadata": {},
"outputs": [],
"source": [
"# Read vectors from strings back into a list\n",
"#article_df['title_vector'] = article_df.title_vector.apply(literal_eval)\n",
"article_df['title_vector'] = article_df.title_vector.apply(literal_eval)\n",
"article_df['content_vector'] = article_df.content_vector.apply(literal_eval)\n",
"\n",
"# Set vector_id to be a string\n",
@ -164,10 +300,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "a334ab8b",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"34471"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(article_df['title_vector'][0])"
]

Loading…
Cancel
Save