Pushing updated version with comments fixed

pull/1077/head
Colin Jarvis 1 year ago
parent da802af763
commit dfbe09f106

@ -633,14 +633,7 @@
" 87% [................................................................ ] 3915776 / 4470649\r", " 87% [................................................................ ] 3915776 / 4470649\r",
" 87% [................................................................ ] 3923968 / 4470649\r", " 87% [................................................................ ] 3923968 / 4470649\r",
" 87% [................................................................. ] 3932160 / 4470649\r", " 87% [................................................................. ] 3932160 / 4470649\r",
" 88% [................................................................. ] 3940352 / 4470649" " 88% [................................................................. ] 3940352 / 4470649\r",
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
" 88% [................................................................. ] 3948544 / 4470649\r", " 88% [................................................................. ] 3948544 / 4470649\r",
" 88% [................................................................. ] 3956736 / 4470649\r", " 88% [................................................................. ] 3956736 / 4470649\r",
" 88% [................................................................. ] 3964928 / 4470649\r", " 88% [................................................................. ] 3964928 / 4470649\r",
@ -660,7 +653,14 @@
" 91% [................................................................... ] 4079616 / 4470649\r", " 91% [................................................................... ] 4079616 / 4470649\r",
" 91% [................................................................... ] 4087808 / 4470649\r", " 91% [................................................................... ] 4087808 / 4470649\r",
" 91% [................................................................... ] 4096000 / 4470649\r", " 91% [................................................................... ] 4096000 / 4470649\r",
" 91% [................................................................... ] 4104192 / 4470649\r", " 91% [................................................................... ] 4104192 / 4470649"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
" 91% [.................................................................... ] 4112384 / 4470649\r", " 91% [.................................................................... ] 4112384 / 4470649\r",
" 92% [.................................................................... ] 4120576 / 4470649\r", " 92% [.................................................................... ] 4120576 / 4470649\r",
" 92% [.................................................................... ] 4128768 / 4470649\r", " 92% [.................................................................... ] 4128768 / 4470649\r",
@ -711,7 +711,7 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'wikipedia_articles_2000 (1).csv'" "'wikipedia_articles_2000 (2).csv'"
] ]
}, },
"execution_count": 5, "execution_count": 5,
@ -1084,7 +1084,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 12,
"id": "eb5848f3", "id": "4f4cf064",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1155,36 +1155,13 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"CPU times: user 1.08 s, sys: 135 ms, total: 1.21 s\n", "CPU times: user 1.04 s, sys: 131 ms, total: 1.17 s\n",
"Wall time: 1.3 s\n" "Wall time: 1.2 s\n"
] ]
},
{
"data": {
"text/plain": [
"0 None\n",
"1 None\n",
"2 None\n",
"3 None\n",
"4 None\n",
" ... \n",
"1995 None\n",
"1996 None\n",
"1997 None\n",
"1998 None\n",
"1999 None\n",
"Length: 2000, dtype: object"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
"%%time\n", "%%time\n",
"# This step takes about 13 minutes\n",
"\n",
"# Initialise tokenizer\n", "# Initialise tokenizer\n",
"tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n",
"\n", "\n",
@ -1192,13 +1169,13 @@
"text_list = []\n", "text_list = []\n",
"\n", "\n",
"# Process each PDF file and prepare for embedding\n", "# Process each PDF file and prepare for embedding\n",
"article_df.apply(lambda x: chunk_text(x, text_list),axis = 1)" "x = article_df.apply(lambda x: chunk_text(x, text_list),axis = 1)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 14,
"id": "e3a6fb12", "id": "6f259e2d",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1223,7 +1200,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 15,
"id": "83f0231c", "id": "4fd503a0",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -1237,19 +1214,19 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2752it [00:11, 244.58it/s] \n" "2752it [00:10, 271.48it/s] \n"
] ]
} }
], ],
"source": [ "source": [
"# Batch embed our chunked text\n", "# Batch embed our chunked text - this will cost you about $0.50\n",
"embeddings = embed_corpus([text[\"metadata\"]['content'] for text in text_list])" "embeddings = embed_corpus([text[\"metadata\"]['content'] for text in text_list])"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 16,
"id": "61c204c8", "id": "49f75d78",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2279,17 +2256,9 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 17,
"id": "b218a207", "id": "932818e9",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████| 27/27 [00:07<00:00, 3.49it/s]\n"
]
}
],
"source": [ "source": [
"# Create a Redis pipeline to load all the vectors and their metadata\n", "# Create a Redis pipeline to load all the vectors and their metadata\n",
"def load_vectors(client:r, input_list, vector_field_name):\n", "def load_vectors(client:r, input_list, vector_field_name):\n",
@ -2307,8 +2276,24 @@
" # HSET\n", " # HSET\n",
" p.hset(key,mapping=item_metadata)\n", " p.hset(key,mapping=item_metadata)\n",
" \n", " \n",
" p.execute()\n", " p.execute()"
"\n", ]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b218a207",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████| 27/27 [00:07<00:00, 3.40it/s]\n"
]
}
],
"source": [
"batch_size = 100 # how many vectors we insert at once\n", "batch_size = 100 # how many vectors we insert at once\n",
"\n", "\n",
"for i in tqdm(range(0, len(text_list), batch_size)):\n", "for i in tqdm(range(0, len(text_list), batch_size)):\n",
@ -2321,7 +2306,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 19,
"id": "d3466f7d", "id": "d3466f7d",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2331,7 +2316,7 @@
"'2693'" "'2693'"
] ]
}, },
"execution_count": 18, "execution_count": 19,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2403,7 +2388,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 20,
"id": "89da0c45", "id": "89da0c45",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2453,7 +2438,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 21,
"id": "f0161a54", "id": "f0161a54",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2461,8 +2446,8 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"CPU times: user 7.6 ms, sys: 2.34 ms, total: 9.95 ms\n", "CPU times: user 7.1 ms, sys: 2.35 ms, total: 9.45 ms\n",
"Wall time: 288 ms\n" "Wall time: 495 ms\n"
] ]
}, },
{ {
@ -2500,7 +2485,7 @@
" <td>https://simple.wikipedia.org/wiki/Thomas%20Dolby</td>\n", " <td>https://simple.wikipedia.org/wiki/Thomas%20Dolby</td>\n",
" <td>Thomas Dolby</td>\n", " <td>Thomas Dolby</td>\n",
" <td>Title: Thomas Dolby;\\nThomas Dolby (born Thomas Morgan Robertson; 14 October 1958) is a British musican and computer designer. He is probably most famous for his 1982 hit, \"She Blinded me with Science\".\\n\\nHe married actress Kathleen Beller in 1988. The couple have three children together.\\n\\nDiscography\\n\\nSingles\\n\\nA Track did not chart in North America until 1983, after the success of \"She Blinded Me With Science\".\\n\\nAlbums\\n\\nStudio albums\\n\\nEPs\\n\\nReferences\\n\\nEnglish musicians\\nLiving people\\n1958 births\\nNew wave musicians\\nWarner Bros. Records artists</td>\n", " <td>Title: Thomas Dolby;\\nThomas Dolby (born Thomas Morgan Robertson; 14 October 1958) is a British musican and computer designer. He is probably most famous for his 1982 hit, \"She Blinded me with Science\".\\n\\nHe married actress Kathleen Beller in 1988. The couple have three children together.\\n\\nDiscography\\n\\nSingles\\n\\nA Track did not chart in North America until 1983, after the success of \"She Blinded Me With Science\".\\n\\nAlbums\\n\\nStudio albums\\n\\nEPs\\n\\nReferences\\n\\nEnglish musicians\\nLiving people\\n1958 births\\nNew wave musicians\\nWarner Bros. Records artists</td>\n",
" <td>0.132752358913</td>\n", " <td>0.132723689079</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
@ -2508,7 +2493,7 @@
" <td>https://simple.wikipedia.org/wiki/Synthesizer</td>\n", " <td>https://simple.wikipedia.org/wiki/Synthesizer</td>\n",
" <td>Synthesizer</td>\n", " <td>Synthesizer</td>\n",
" <td>Title: Synthesizer;\\nAudio technology</td>\n", " <td>Title: Synthesizer;\\nAudio technology</td>\n",
" <td>0.223150134087</td>\n", " <td>0.223129153252</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
@ -2524,11 +2509,11 @@
"1 Title: Synthesizer;\\nAudio technology \n", "1 Title: Synthesizer;\\nAudio technology \n",
"\n", "\n",
" certainty \n", " certainty \n",
"0 0.132752358913 \n", "0 0.132723689079 \n",
"1 0.223150134087 " "1 0.223129153252 "
] ]
}, },
"execution_count": 20, "execution_count": 21,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2544,7 +2529,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 22,
"id": "48d136b0", "id": "48d136b0",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2577,7 +2562,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 23,
"id": "06f6e6ed", "id": "06f6e6ed",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2585,7 +2570,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Thomas Dolby is known for being a British musician and computer designer. He is most famous for his 1982 hit, \"She Blinded me with Science\".\n" "Thomas Dolby is known for his music, particularly his 1982 hit \"She Blinded Me With Science\". He is also a computer designer.\n"
] ]
} }
], ],
@ -2632,7 +2617,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 24,
"id": "0ccca3da", "id": "0ccca3da",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2649,8 +2634,8 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 25,
"id": "ba73ffab", "id": "63455a8e",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -2661,7 +2646,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 26,
"id": "68a0b8dd", "id": "68a0b8dd",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2683,7 +2668,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 27,
"id": "39e101ee", "id": "39e101ee",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2716,7 +2701,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 28,
"id": "a2b9f271", "id": "a2b9f271",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2770,7 +2755,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 29,
"id": "454c3ca9", "id": "454c3ca9",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2790,7 +2775,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 30,
"id": "34de07d2", "id": "34de07d2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2813,7 +2798,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 30, "execution_count": 31,
"id": "d3603d58", "id": "d3603d58",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2823,7 +2808,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 31, "execution_count": 32,
"id": "6bfb594b", "id": "6bfb594b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2838,8 +2823,8 @@
"Action: Search\n", "Action: Search\n",
"Action Input: \"What is Thomas Dolby known for?\"\u001b[0m\n", "Action Input: \"What is Thomas Dolby known for?\"\u001b[0m\n",
"\n", "\n",
"Observation:\u001b[36;1m\u001b[1;3mThomas Dolby is known for his music career as a British musician and his hit song \"She Blinded me with Science\". He is also a computer designer and has released several albums and singles over the years.\u001b[0m\u001b[32;1m\u001b[1;3mNow I have a better understanding of who Thomas Dolby is and what he is known for.\n", "Observation:\u001b[36;1m\u001b[1;3mThomas Dolby is known for being a British musician and computer designer, and his 1982 hit \"She Blinded Me With Science\".\u001b[0m\u001b[32;1m\u001b[1;3mNow that I know who Thomas Dolby is, I can answer the question.\n",
"Final Answer: Thomas Dolby is known for his music career as a British musician and his hit song \"She Blinded me with Science\", as well as being a computer designer and releasing several albums and singles over the years.\u001b[0m\n", "Final Answer: Thomas Dolby is known for being a British musician and computer designer, and his 1982 hit \"She Blinded Me With Science\".\u001b[0m\n",
"\n", "\n",
"\u001b[1m> Finished chain.\u001b[0m\n" "\u001b[1m> Finished chain.\u001b[0m\n"
] ]
@ -2847,10 +2832,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'Thomas Dolby is known for his music career as a British musician and his hit song \"She Blinded me with Science\", as well as being a computer designer and releasing several albums and singles over the years.'" "'Thomas Dolby is known for being a British musician and computer designer, and his 1982 hit \"She Blinded Me With Science\".'"
] ]
}, },
"execution_count": 31, "execution_count": 32,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2861,7 +2846,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 33,
"id": "ba65b7e3", "id": "ba65b7e3",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2888,7 +2873,7 @@
"'The sum of 5 and 5 is 10.'" "'The sum of 5 and 5 is 10.'"
] ]
}, },
"execution_count": 32, "execution_count": 33,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -2923,7 +2908,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 34,
"id": "b8314f7b", "id": "b8314f7b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -2953,15 +2938,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 35,
"id": "4ee414f1", "id": "3e0ee344",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"['1. What is the difference between a virus and a bacteria?', '2. Who composed the musical Cats?', '3. How does the digestive system work?', '4. What are some major constellations visible in the night sky?', '5. Who discovered penicillin?', '6. What is the difference between climate and weather?', '7. Who invented the World Wide Web?', '8. What is the significance of the Mona Lisa painting?', '9. What is the tallest mountain in the world?', '10. Who won the Nobel Peace Prize in 2020?']\n" "['1. What is the difference between weather and climate?', '2. Who designed the Eiffel Tower?', '3. What is the capital of Australia?', '4. What is the chemical symbol for gold?', '5. Who invented the telephone?', '6. What is the largest organ in the human body?', '7. Which famous artist painted the Mona Lisa?', '8. What is the highest mountain in Africa?', '9. What famous building was destroyed during the September 11th attacks?', '10. Who wrote the novel \"To Kill a Mockingbird\"?']\n"
] ]
} }
], ],
@ -2972,7 +2957,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 36,
"id": "4446041c", "id": "4446041c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -3000,7 +2985,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 37,
"id": "b8ea3f6a", "id": "b8ea3f6a",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -3008,19 +2993,19 @@
"data": { "data": {
"text/plain": [ "text/plain": [
"(10,\n", "(10,\n",
" [('1. What is the difference between a virus and a bacteria?',\n", " [('1. What is the difference between weather and climate?',\n",
" 'A virus is a small infectious agent that can only replicate inside the living cells of other organisms, while bacteria are single-celled microorganisms that can reproduce on their own and can survive in a variety of environments.'),\n", " 'Weather refers to short-term atmospheric conditions in a specific area, while climate refers to long-term patterns and trends of weather in a particular region over a period of time.'),\n",
" ('2. Who composed the musical Cats?',\n", " ('2. Who designed the Eiffel Tower?',\n",
" 'Andrew Lloyd Webber composed the musical Cats.'),\n", " 'Gustave Eiffel designed the Eiffel Tower.'),\n",
" ('3. How does the digestive system work?',\n", " ('3. What is the capital of Australia?',\n",
" 'The digestive system works by breaking down food into smaller molecules that can be absorbed and used by the body, starting in the mouth and ending with elimination of waste through the rectum and anus.'),\n", " 'The capital of Australia is Canberra.'),\n",
" ('4. What are some major constellations visible in the night sky?',\n", " ('4. What is the chemical symbol for gold?',\n",
" 'Some major constellations that can be seen in the night sky include Orion, Ursa Major (also known as the Big Dipper), Cassiopeia, Leo, Scorpius, and Taurus.'),\n", " 'The chemical symbol for gold is Au.'),\n",
" ('5. Who discovered penicillin?',\n", " ('5. Who invented the telephone?',\n",
" 'Alexander Fleming discovered penicillin in 1928.')])" " 'Alexander Graham Bell invented the telephone.')])"
] ]
}, },
"execution_count": 36, "execution_count": 37,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -3031,7 +3016,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 38,
"id": "1cf3c9e1", "id": "1cf3c9e1",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -3079,7 +3064,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 38, "execution_count": 39,
"id": "8a351793", "id": "8a351793",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -3089,7 +3074,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 39, "execution_count": 40,
"id": "3b286e5f", "id": "3b286e5f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -3110,7 +3095,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 40, "execution_count": 41,
"id": "f18678e0", "id": "f18678e0",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -3130,19 +3115,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 41, "execution_count": 42,
"id": "cd29fc90", "id": "cd29fc90",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"correct 9\n", "correct 10\n",
"unable to answer 1\n",
"Name: evaluation, dtype: int64" "Name: evaluation, dtype: int64"
] ]
}, },
"execution_count": 41, "execution_count": 42,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }

Loading…
Cancel
Save