From 11bc21d87881dfcb245de067bba41bc677d321ad Mon Sep 17 00:00:00 2001 From: Colin Jarvis Date: Fri, 12 May 2023 14:37:22 +0100 Subject: [PATCH] Pushing updated version with comments fixed --- .../enterprise_knowledge_retrieval.ipynb | 202 ++++++++---------- 1 file changed, 93 insertions(+), 109 deletions(-) diff --git a/apps/enterprise-knowledge-retrieval/enterprise_knowledge_retrieval.ipynb b/apps/enterprise-knowledge-retrieval/enterprise_knowledge_retrieval.ipynb index 3626b065..a419411b 100644 --- a/apps/enterprise-knowledge-retrieval/enterprise_knowledge_retrieval.ipynb +++ b/apps/enterprise-knowledge-retrieval/enterprise_knowledge_retrieval.ipynb @@ -633,14 +633,7 @@ " 87% [................................................................ ] 3915776 / 4470649\r", " 87% [................................................................ ] 3923968 / 4470649\r", " 87% [................................................................. ] 3932160 / 4470649\r", - " 88% [................................................................. ] 3940352 / 4470649" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r", + " 88% [................................................................. ] 3940352 / 4470649\r", " 88% [................................................................. ] 3948544 / 4470649\r", " 88% [................................................................. ] 3956736 / 4470649\r", " 88% [................................................................. ] 3964928 / 4470649\r", @@ -660,7 +653,14 @@ " 91% [................................................................... ] 4079616 / 4470649\r", " 91% [................................................................... ] 4087808 / 4470649\r", " 91% [................................................................... ] 4096000 / 4470649\r", - " 91% [................................................................... ] 4104192 / 4470649\r", + " 91% [................................................................... ] 4104192 / 4470649" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", " 91% [.................................................................... ] 4112384 / 4470649\r", " 92% [.................................................................... ] 4120576 / 4470649\r", " 92% [.................................................................... ] 4128768 / 4470649\r", @@ -711,7 +711,7 @@ { "data": { "text/plain": [ - "'wikipedia_articles_2000 (1).csv'" + "'wikipedia_articles_2000 (2).csv'" ] }, "execution_count": 5, @@ -1084,7 +1084,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "eb5848f3", + "id": "4f4cf064", "metadata": {}, "outputs": [], "source": [ @@ -1155,36 +1155,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.08 s, sys: 135 ms, total: 1.21 s\n", - "Wall time: 1.3 s\n" + "CPU times: user 1.04 s, sys: 131 ms, total: 1.17 s\n", + "Wall time: 1.2 s\n" ] - }, - { - "data": { - "text/plain": [ - "0 None\n", - "1 None\n", - "2 None\n", - "3 None\n", - "4 None\n", - " ... \n", - "1995 None\n", - "1996 None\n", - "1997 None\n", - "1998 None\n", - "1999 None\n", - "Length: 2000, dtype: object" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "%%time\n", - "# This step takes about 13 minutes\n", - "\n", "# Initialise tokenizer\n", "tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n", "\n", @@ -1192,13 +1169,13 @@ "text_list = []\n", "\n", "# Process each PDF file and prepare for embedding\n", - "article_df.apply(lambda x: chunk_text(x, text_list),axis = 1)" + "x = article_df.apply(lambda x: chunk_text(x, text_list),axis = 1)" ] }, { "cell_type": "code", "execution_count": 14, - "id": "e3a6fb12", + "id": "6f259e2d", "metadata": {}, "outputs": [ { @@ -1223,7 +1200,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "83f0231c", + "id": "4fd503a0", "metadata": {}, "outputs": [ { @@ -1237,19 +1214,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "2752it [00:11, 244.58it/s] \n" + "2752it [00:10, 271.48it/s] \n" ] } ], "source": [ - "# Batch embed our chunked text\n", + "# Batch embed our chunked text - this will cost you about $0.50\n", "embeddings = embed_corpus([text[\"metadata\"]['content'] for text in text_list])" ] }, { "cell_type": "code", "execution_count": 16, - "id": "61c204c8", + "id": "49f75d78", "metadata": {}, "outputs": [ { @@ -2279,17 +2256,9 @@ { "cell_type": "code", "execution_count": 17, - "id": "b218a207", + "id": "932818e9", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████████████████████████████████████████████| 27/27 [00:07<00:00, 3.49it/s]\n" - ] - } - ], + "outputs": [], "source": [ "# Create a Redis pipeline to load all the vectors and their metadata\n", "def load_vectors(client:r, input_list, vector_field_name):\n", @@ -2307,8 +2276,24 @@ " # HSET\n", " p.hset(key,mapping=item_metadata)\n", " \n", - " p.execute()\n", - "\n", + " p.execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b218a207", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████| 27/27 [00:07<00:00, 3.40it/s]\n" + ] + } + ], + "source": [ "batch_size = 100 # how many vectors we insert at once\n", "\n", "for i in tqdm(range(0, len(text_list), batch_size)):\n", @@ -2321,7 +2306,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "d3466f7d", "metadata": {}, "outputs": [ @@ -2331,7 +2316,7 @@ "'2693'" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2403,7 +2388,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "89da0c45", "metadata": {}, "outputs": [], @@ -2453,7 +2438,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "f0161a54", "metadata": {}, "outputs": [ @@ -2461,8 +2446,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 7.6 ms, sys: 2.34 ms, total: 9.95 ms\n", - "Wall time: 288 ms\n" + "CPU times: user 7.1 ms, sys: 2.35 ms, total: 9.45 ms\n", + "Wall time: 495 ms\n" ] }, { @@ -2500,7 +2485,7 @@ " https://simple.wikipedia.org/wiki/Thomas%20Dolby\n", " Thomas Dolby\n", " Title: Thomas Dolby;\\nThomas Dolby (born Thomas Morgan Robertson; 14 October 1958) is a British musican and computer designer. He is probably most famous for his 1982 hit, \"She Blinded me with Science\".\\n\\nHe married actress Kathleen Beller in 1988. The couple have three children together.\\n\\nDiscography\\n\\nSingles\\n\\nA Track did not chart in North America until 1983, after the success of \"She Blinded Me With Science\".\\n\\nAlbums\\n\\nStudio albums\\n\\nEPs\\n\\nReferences\\n\\nEnglish musicians\\nLiving people\\n1958 births\\nNew wave musicians\\nWarner Bros. Records artists\n", - " 0.132752358913\n", + " 0.132723689079\n", " \n", " \n", " 1\n", @@ -2508,7 +2493,7 @@ " https://simple.wikipedia.org/wiki/Synthesizer\n", " Synthesizer\n", " Title: Synthesizer;\\nAudio technology\n", - " 0.223150134087\n", + " 0.223129153252\n", " \n", " \n", "\n", @@ -2524,11 +2509,11 @@ "1 Title: Synthesizer;\\nAudio technology \n", "\n", " certainty \n", - "0 0.132752358913 \n", - "1 0.223150134087 " + "0 0.132723689079 \n", + "1 0.223129153252 " ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2544,7 +2529,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "48d136b0", "metadata": {}, "outputs": [], @@ -2577,7 +2562,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "06f6e6ed", "metadata": {}, "outputs": [ @@ -2585,7 +2570,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Thomas Dolby is known for being a British musician and computer designer. He is most famous for his 1982 hit, \"She Blinded me with Science\".\n" + "Thomas Dolby is known for his music, particularly his 1982 hit \"She Blinded Me With Science\". He is also a computer designer.\n" ] } ], @@ -2632,7 +2617,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "0ccca3da", "metadata": {}, "outputs": [], @@ -2649,8 +2634,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "ba73ffab", + "execution_count": 25, + "id": "63455a8e", "metadata": {}, "outputs": [], "source": [ @@ -2661,7 +2646,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "68a0b8dd", "metadata": {}, "outputs": [], @@ -2683,7 +2668,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "id": "39e101ee", "metadata": {}, "outputs": [], @@ -2716,7 +2701,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "a2b9f271", "metadata": {}, "outputs": [], @@ -2770,7 +2755,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "id": "454c3ca9", "metadata": {}, "outputs": [], @@ -2790,7 +2775,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "id": "34de07d2", "metadata": {}, "outputs": [], @@ -2813,7 +2798,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "d3603d58", "metadata": {}, "outputs": [], @@ -2823,7 +2808,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "id": "6bfb594b", "metadata": {}, "outputs": [ @@ -2838,8 +2823,8 @@ "Action: Search\n", "Action Input: \"What is Thomas Dolby known for?\"\u001b[0m\n", "\n", - "Observation:\u001b[36;1m\u001b[1;3mThomas Dolby is known for his music career as a British musician and his hit song \"She Blinded me with Science\". He is also a computer designer and has released several albums and singles over the years.\u001b[0m\u001b[32;1m\u001b[1;3mNow I have a better understanding of who Thomas Dolby is and what he is known for.\n", - "Final Answer: Thomas Dolby is known for his music career as a British musician and his hit song \"She Blinded me with Science\", as well as being a computer designer and releasing several albums and singles over the years.\u001b[0m\n", + "Observation:\u001b[36;1m\u001b[1;3mThomas Dolby is known for being a British musician and computer designer, and his 1982 hit \"She Blinded Me With Science\".\u001b[0m\u001b[32;1m\u001b[1;3mNow that I know who Thomas Dolby is, I can answer the question.\n", + "Final Answer: Thomas Dolby is known for being a British musician and computer designer, and his 1982 hit \"She Blinded Me With Science\".\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] @@ -2847,10 +2832,10 @@ { "data": { "text/plain": [ - "'Thomas Dolby is known for his music career as a British musician and his hit song \"She Blinded me with Science\", as well as being a computer designer and releasing several albums and singles over the years.'" + "'Thomas Dolby is known for being a British musician and computer designer, and his 1982 hit \"She Blinded Me With Science\".'" ] }, - "execution_count": 31, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2861,7 +2846,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "id": "ba65b7e3", "metadata": {}, "outputs": [ @@ -2888,7 +2873,7 @@ "'The sum of 5 and 5 is 10.'" ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2923,7 +2908,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "id": "b8314f7b", "metadata": {}, "outputs": [], @@ -2953,15 +2938,15 @@ }, { "cell_type": "code", - "execution_count": 34, - "id": "4ee414f1", + "execution_count": 35, + "id": "3e0ee344", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['1. What is the difference between a virus and a bacteria?', '2. Who composed the musical Cats?', '3. How does the digestive system work?', '4. What are some major constellations visible in the night sky?', '5. Who discovered penicillin?', '6. What is the difference between climate and weather?', '7. Who invented the World Wide Web?', '8. What is the significance of the Mona Lisa painting?', '9. What is the tallest mountain in the world?', '10. Who won the Nobel Peace Prize in 2020?']\n" + "['1. What is the difference between weather and climate?', '2. Who designed the Eiffel Tower?', '3. What is the capital of Australia?', '4. What is the chemical symbol for gold?', '5. Who invented the telephone?', '6. What is the largest organ in the human body?', '7. Which famous artist painted the Mona Lisa?', '8. What is the highest mountain in Africa?', '9. What famous building was destroyed during the September 11th attacks?', '10. Who wrote the novel \"To Kill a Mockingbird\"?']\n" ] } ], @@ -2972,7 +2957,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "id": "4446041c", "metadata": {}, "outputs": [], @@ -3000,7 +2985,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "id": "b8ea3f6a", "metadata": {}, "outputs": [ @@ -3008,19 +2993,19 @@ "data": { "text/plain": [ "(10,\n", - " [('1. What is the difference between a virus and a bacteria?',\n", - " 'A virus is a small infectious agent that can only replicate inside the living cells of other organisms, while bacteria are single-celled microorganisms that can reproduce on their own and can survive in a variety of environments.'),\n", - " ('2. Who composed the musical Cats?',\n", - " 'Andrew Lloyd Webber composed the musical Cats.'),\n", - " ('3. How does the digestive system work?',\n", - " 'The digestive system works by breaking down food into smaller molecules that can be absorbed and used by the body, starting in the mouth and ending with elimination of waste through the rectum and anus.'),\n", - " ('4. What are some major constellations visible in the night sky?',\n", - " 'Some major constellations that can be seen in the night sky include Orion, Ursa Major (also known as the Big Dipper), Cassiopeia, Leo, Scorpius, and Taurus.'),\n", - " ('5. Who discovered penicillin?',\n", - " 'Alexander Fleming discovered penicillin in 1928.')])" + " [('1. What is the difference between weather and climate?',\n", + " 'Weather refers to short-term atmospheric conditions in a specific area, while climate refers to long-term patterns and trends of weather in a particular region over a period of time.'),\n", + " ('2. Who designed the Eiffel Tower?',\n", + " 'Gustave Eiffel designed the Eiffel Tower.'),\n", + " ('3. What is the capital of Australia?',\n", + " 'The capital of Australia is Canberra.'),\n", + " ('4. What is the chemical symbol for gold?',\n", + " 'The chemical symbol for gold is Au.'),\n", + " ('5. Who invented the telephone?',\n", + " 'Alexander Graham Bell invented the telephone.')])" ] }, - "execution_count": 36, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -3031,7 +3016,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "id": "1cf3c9e1", "metadata": {}, "outputs": [], @@ -3079,7 +3064,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "id": "8a351793", "metadata": {}, "outputs": [], @@ -3089,7 +3074,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "id": "3b286e5f", "metadata": {}, "outputs": [], @@ -3110,7 +3095,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "id": "f18678e0", "metadata": {}, "outputs": [], @@ -3130,19 +3115,18 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "id": "cd29fc90", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "correct 9\n", - "unable to answer 1\n", + "correct 10\n", "Name: evaluation, dtype: int64" ] }, - "execution_count": 41, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" }