diff --git a/examples/fine-tuned_qa/answers_with_ft.py b/examples/fine-tuned_qa/answers_with_ft.py new file mode 100644 index 00000000..32507e82 --- /dev/null +++ b/examples/fine-tuned_qa/answers_with_ft.py @@ -0,0 +1,150 @@ +import argparse + +import openai + + +def create_context( + question, search_file_id, max_len=1800, search_model="ada", max_rerank=10 +): + """ + Create a context for a question by finding the most similar context from the search file. + :param question: The question + :param search_file_id: The file id of the search file + :param max_len: The maximum length of the returned context (in tokens) + :param search_model: The search model to use + :param max_rerank: The maximum number of reranking + :return: The context + """ + results = openai.Engine(search_model).search( + search_model=search_model, + query=question, + max_rerank=max_rerank, + file=search_file_id, + return_metadata=True, + ) + returns = [] + cur_len = 0 + for result in results["data"]: + cur_len += int(result["metadata"]) + 4 + if cur_len > max_len: + break + returns.append(result["text"]) + return "\n\n###\n\n".join(returns) + + +def answer_question( + search_file_id="", + fine_tuned_qa_model="", + question="Which country won the European Football championship in 2021?", + max_len=1800, + search_model="ada", + max_rerank=10, + debug=False, + stop_sequence=["\n", "."], + max_tokens=100, +): + """ + Answer a question based on the most similar context from the search file, using your fine-tuned model. + :param question: The question + :param fine_tuned_qa_model: The fine tuned QA model + :param search_file_id: The file id of the search file + :param max_len: The maximum length of the returned context (in tokens) + :param search_model: The search model to use + :param max_rerank: The maximum number of reranking + :param debug: Whether to output debug information + :param stop_sequence: The stop sequence for Q&A model + :param max_tokens: The maximum number of tokens to return + :return: The answer + """ + context = create_context( + question, + search_file_id, + max_len=max_len, + search_model=search_model, + max_rerank=max_rerank, + ) + if debug: + print("Context:\n" + context) + print("\n\n") + try: + # fine-tuned models requires model parameter, whereas other models require engine parameter + model_param = ( + {"model": fine_tuned_qa_model} + if ":" in fine_tuned_qa_model + and fine_tuned_qa_model.split(":")[1].startswith("ft") + else {"engine": fine_tuned_qa_model} + ) + response = openai.Completion.create( + prompt=f"Answer the question based on the context below\n\nText: {context}\n\n---\n\nQuestion: {question}\nAnswer:", + temperature=0, + max_tokens=max_tokens, + top_p=1, + frequency_penalty=0, + presence_penalty=0, + stop=stop_sequence, + **model_param, + ) + return response["choices"][0]["text"] + except Exception as e: + print(e) + return "" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Rudimentary functionality of the answers endpoint with a fine-tuned Q&A model.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--search_file_id", help="Search file id", required=True, type=str + ) + parser.add_argument( + "--fine_tuned_qa_model", help="Fine-tuned QA model id", required=True, type=str + ) + parser.add_argument( + "--question", help="Question to answer", required=True, type=str + ) + parser.add_argument( + "--max_len", + help="Maximum length of the returned context (in tokens)", + default=1800, + type=int, + ) + parser.add_argument( + "--search_model", help="Search model to use", default="ada", type=str + ) + parser.add_argument( + "--max_rerank", + help="Maximum number of reranking for the search", + default=10, + type=int, + ) + parser.add_argument( + "--debug", help="Print debug information (context used)", action="store_true" + ) + parser.add_argument( + "--stop_sequence", + help="Stop sequences for the Q&A model", + default=["\n", "."], + nargs="+", + type=str, + ) + parser.add_argument( + "--max_tokens", + help="Maximum number of tokens to return", + default=100, + type=int, + ) + args = parser.parse_args() + response = answer_question( + search_file_id=args.search_file_id, + fine_tuned_qa_model=args.fine_tuned_qa_model, + question=args.question, + max_len=args.max_len, + search_model=args.search_model, + max_rerank=args.max_rerank, + debug=args.debug, + stop_sequence=args.stop_sequence, + max_tokens=args.max_tokens, + ) + print(f"Answer:{response}") diff --git a/examples/fine-tuned_qa/olympics-1-collect-data.ipynb b/examples/fine-tuned_qa/olympics-1-collect-data.ipynb new file mode 100644 index 00000000..7a88051b --- /dev/null +++ b/examples/fine-tuned_qa/olympics-1-collect-data.ipynb @@ -0,0 +1,513 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Collect Wikipedia data about Olympic Games 2020\n", + "\n", + "The idea of this project is to create a question answering model, based on a few paragraphs of provided text. Base GPT-3 models do a good job at answering questions when the answer is contained within the paragraph, however if the answer isn't contained, the base models tend to try their best to answer anyway, often leading to confabulated answers. \n", + "\n", + "To create a model which answers questions only if there is sufficient context for doing so, we first create a dataset of questions and answers based on paragraphs of text. In order to train the model to answer only when the answer is present, we also add adversarial examples, where the question doesn't match the context. In those cases, we ask the model to output \"No sufficient context for answering the question\". \n", + "\n", + "We will perform this task in three notebooks:\n", + "1. The first (this) notebook focuses on collecting recent data, which GPT-3 didn't see during it's pre-training. We picked the topic of Olympic Games 2020 (which actually took place in the summer of 2021), and downloaded 713 unique pages. We organized the dataset by individual sections, which will serve as context for asking and answering the questions.\n", + "2. The [second notebook](olympics-2-create-qa.ipynb) will utilize Davinci-instruct to ask a few questions based on a Wikipedia section, as well as answer those questions, based on that section.\n", + "3. The [third notebook](olympics-3-train-qa.ipynb) will utilize the dataset of context, question and answer pairs to additionally create adversarial questions and context pairs, where the question was not generated on that context. In those cases the model will be prompted to answer \"No sufficient context for answering the question\". We will also train a discriminator model, which predicts whether the question can be answered based on the context or not." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.1 Data extraction using the wikipedia API\n", + "Extracting the data will take about half an hour, and processing will likely take about as much." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "909" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import wikipedia\n", + "\n", + "\n", + "def filter_olympic_2020_titles(titles):\n", + " \"\"\"\n", + " Get the titles which are related to Olympic games hosted in 2020, given a list of titles\n", + " \"\"\"\n", + " titles = [title for title in titles if '2020' in title and 'olympi' in title.lower()]\n", + " \n", + " return titles\n", + "\n", + "def get_wiki_page(title):\n", + " \"\"\"\n", + " Get the wikipedia page given a title\n", + " \"\"\"\n", + " try:\n", + " return wikipedia.page(title)\n", + " except wikipedia.exceptions.DisambiguationError as e:\n", + " return wikipedia.page(e.options[0])\n", + " except wikipedia.exceptions.PageError as e:\n", + " return None\n", + "\n", + "def recursively_find_all_pages(titles, titles_so_far=set()):\n", + " \"\"\"\n", + " Recursively find all the pages that are linked to the Wikipedia titles in the list\n", + " \"\"\"\n", + " all_pages = []\n", + " \n", + " titles = list(set(titles) - titles_so_far)\n", + " titles = filter_olympic_2020_titles(titles)\n", + " titles_so_far.update(titles)\n", + " for title in titles:\n", + " page = get_wiki_page(title)\n", + " if page is None:\n", + " continue\n", + " all_pages.append(page)\n", + "\n", + " new_pages = recursively_find_all_pages(page.links, titles_so_far)\n", + " for pg in new_pages:\n", + " if pg.title not in [p.title for p in all_pages]:\n", + " all_pages.append(pg)\n", + " titles_so_far.update(page.links)\n", + " return all_pages\n", + "\n", + "\n", + "pages = recursively_find_all_pages([\"2020 Summer Olympics\"])\n", + "len(pages)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Filtering the Wikipedia pages and splitting them into sections by headings\n", + "We remove sections unlikely to contain textual information, and ensure that each section is not longer than the token limit" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('Bermuda at the 2020 Summer Olympics',\n", + " 'Equestrian',\n", + " \"Bermuda entered one dressage rider into the Olympic competition by finishing in the top four, outside the group selection, of the individual FEI Olympic Rankings for Groups D and E (North, Central, and South America), marking the country's recurrence to the sport after an eight-year absence. The quota was later withdrawn, following an injury of Annabelle Collins' main horse Joyero and a failure to obtain minimum eligibility requirements (MER) aboard a new horse Chuppy Checker.\",\n", + " 104)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "import re\n", + "from typing import Set\n", + "from transformers import GPT2TokenizerFast\n", + "\n", + "import numpy as np\n", + "from nltk.tokenize import sent_tokenize\n", + "\n", + "tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n", + "\n", + "def count_tokens(text: str) -> int:\n", + " \"\"\"count the number of tokens in a string\"\"\"\n", + " return len(tokenizer.encode(text))\n", + "\n", + "def reduce_long(\n", + " long_text: str, long_text_tokens: bool = False, max_len: int = 590\n", + ") -> str:\n", + " \"\"\"\n", + " Reduce a long text to a maximum of `max_len` tokens by potentially cutting at a sentence end\n", + " \"\"\"\n", + " if not long_text_tokens:\n", + " long_text_tokens = count_tokens(long_text)\n", + " if long_text_tokens > max_len:\n", + " sentences = sent_tokenize(long_text.replace(\"\\n\", \" \"))\n", + " ntokens = 0\n", + " for i, sentence in enumerate(sentences):\n", + " ntokens += 1 + count_tokens(sentence)\n", + " if ntokens > max_len:\n", + " return \". \".join(sentences[:i][:-1]) + \".\"\n", + "\n", + " return long_text\n", + "\n", + "discard_categories = ['See also', 'References', 'External links', 'Further reading', \"Footnotes\",\n", + " \"Bibliography\", \"Sources\", \"Citations\", \"Literature\", \"Footnotes\", \"Notes and references\",\n", + " \"Photo gallery\", \"Works cited\", \"Photos\", \"Gallery\", \"Notes\", \"References and sources\",\n", + " \"References and notes\",]\n", + "\n", + "\n", + "def extract_sections(\n", + " wiki_text: str,\n", + " title: str,\n", + " max_len: int = 1500,\n", + " discard_categories: Set[str] = discard_categories,\n", + ") -> str:\n", + " \"\"\"\n", + " Extract the sections of a Wikipedia page, discarding the the references and other low information sections\n", + " \"\"\"\n", + " if len(wiki_text) == 0:\n", + " return []\n", + "\n", + " # find all headings and the coresponding contents\n", + " headings = re.findall(\"==+ .* ==+\", wiki_text)\n", + " for heading in headings:\n", + " wiki_text = wiki_text.replace(heading, \"==+ !! ==+\")\n", + " contents = wiki_text.split(\"==+ !! ==+\")\n", + " contents = [c.strip() for c in contents]\n", + " assert len(headings) == len(contents) - 1\n", + "\n", + " cont = contents.pop(0).strip()\n", + " outputs = [(title, \"Summary\", cont, count_tokens(cont)+4)]\n", + " \n", + " # discard the discard categories, accounting for a tree structure\n", + " max_level = 100\n", + " keep_group_level = max_level\n", + " remove_group_level = max_level\n", + " nheadings, ncontents = [], []\n", + " for heading, content in zip(headings, contents):\n", + " plain_heading = \" \".join(heading.split(\" \")[1:-1])\n", + " num_equals = len(heading.split(\" \")[0])\n", + " if num_equals <= keep_group_level:\n", + " keep_group_level = max_level\n", + "\n", + " if num_equals > remove_group_level:\n", + " if (\n", + " num_equals <= keep_group_level\n", + " ):\n", + " continue\n", + " keep_group_level = max_level\n", + " if plain_heading in discard_categories:\n", + " remove_group_level = num_equals\n", + " keep_group_level = max_level\n", + " continue\n", + " nheadings.append(heading.replace(\"=\", \"\").strip())\n", + " ncontents.append(content)\n", + " remove_group_level = max_level\n", + "\n", + " # count the tokens of each section\n", + " ncontent_ntokens = [\n", + " count_tokens(c)\n", + " + 3\n", + " + count_tokens(\" \".join(h.split(\" \")[1:-1]))\n", + " - (1 if len(c) == 0 else 0)\n", + " for h, c in zip(nheadings, ncontents)\n", + " ]\n", + "\n", + " # Create a tuple of (title, section_name, content, number of tokens)\n", + " outputs += [(title, h, c, t) if t 1024). Running this sequence through the model will result in indexing errors\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleheadingcontenttokens
02020 Summer OlympicsSummaryThe 2020 Summer Olympics (Japanese: 2020年夏季オリン...713
12020 Summer OlympicsHost city selectionThe International Olympic Committee (IOC) vote...126
22020 Summer OlympicsImpact of the COVID-19 pandemicIn January 2020, concerns were raised about th...369
32020 Summer OlympicsQualifying event cancellation and postponementConcerns about the pandemic began to affect qu...298
42020 Summer OlympicsEffect on doping testsMandatory doping tests were being severely res...163
\n", + "
" + ], + "text/plain": [ + " title heading \\\n", + "0 2020 Summer Olympics Summary \n", + "1 2020 Summer Olympics Host city selection \n", + "2 2020 Summer Olympics Impact of the COVID-19 pandemic \n", + "3 2020 Summer Olympics Qualifying event cancellation and postponement \n", + "4 2020 Summer Olympics Effect on doping tests \n", + "\n", + " content tokens \n", + "0 The 2020 Summer Olympics (Japanese: 2020年夏季オリン... 713 \n", + "1 The International Olympic Committee (IOC) vote... 126 \n", + "2 In January 2020, concerns were raised about th... 369 \n", + "3 Concerns about the pandemic began to affect qu... 298 \n", + "4 Mandatory doping tests were being severely res... 163 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res = []\n", + "for page in pages:\n", + " res += extract_sections(page.content, page.title)\n", + "df = pd.DataFrame(res, columns=[\"title\", \"heading\", \"content\", \"tokens\"])\n", + "df = df[df.tokens>40]\n", + "df = df.drop_duplicates(['title','heading'])\n", + "df = df.reset_index().drop('index',axis=1) # reset index\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save the section dataset\n", + "We will save the section dataset, for the [next notebook](olympics-2-create-qa.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('olympics-data/olympics_sections.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.3 (Optional) Exploring the data " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Concerns and controversies at the 2020 Summer Olympics 51\n", + "United States at the 2020 Summer Olympics 46\n", + "Great Britain at the 2020 Summer Olympics 42\n", + "Canada at the 2020 Summer Olympics 39\n", + "Olympic Games 39\n", + "Name: title, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.title.value_counts().head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There appear to be winter and summer Olympics 2020. We chose to leave a little ambiguity and noise in the dataset, even though we were interested in only Summer Olympics 2020." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True 3567\n", + "False 305\n", + "Name: title, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.title.str.contains('Summer').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 3774\n", + "True 98\n", + "Name: title, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.title.str.contains('Winter').value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAr20lEQVR4nO3deZwcVbn/8c+XsCdI2MwNEAibelEuCBHxojgBZRfUHyhcwABRREFB8UrABRQRXABFEERZXQiIC7sIXCIqsgWBsEqAsIRNIIQEBAl5fn+c06Sm6emumUzNFJnv+/Xq11SdU8vTNdX1dNU5XaWIwMzMrJ3FBjsAMzOrPycLMzPryMnCzMw6crIwM7OOnCzMzKwjJwszM+vIyaJA0qmSvtZPy1pD0lxJw/L4FEmf7I9l5+VdLmlCfy2vF+v9lqSnJT0x0OtuiqNL0qODuP6PSHok/4/f2Q/LC0nr9kdsfVh3t3214nW99hlr9z+U9D5J91YUw1mSvlX1evqDpMMl/Wyw44AhlCwkzZD0L0lzJD0n6TpJ+0t6bRtExP4RcVTJZX2g3TQR8XBEjIiIV/sh9iMl/aJp+dtFxNkLu+xexrEGcAiwfkT8x0Cuu4a+DxyY/8d/b64czIN/by3MvirpXkkfL4xvnt97c9kcSYuX/YxFxJ8j4q29jae3Bmo9ZbRKnhHx7Yjoty+ZC2PIJIvsQxGxHLAmcCxwKHB6f69E0uL9vcyaWAN4JiKeGuxA+lMf/19rAnf2dyxvQNcCWxTGtwDuaVH2t4iYN5CBWT+LiCHxAmYAH2gq2xSYD7wjj58FfCsPrwxcAjwHPAv8mZRcf57n+RcwF/gyMBYIYCLwMOkD1ChbPC9vCnAMcCPwPHAhsGKu6wIebRUvsC3wb+CVvL7bCsv7ZB5eDPgq8BDwFHAOsHyua8QxIcf2NPCVNttp+Tz/P/PyvpqX/4H8nufnOM5qMW8X8Cjp7OMp4HFgn0L9azHn8b2BvxTGA/gscB8wBzgKWAe4Lm+z84Elm9Z1eH5PM4A9CstaivTt/2HgSeBUYJmmeQ8FngB+3uK9tNymeblzc6wvAPe3mPfaQv1c4OO5/FPAdNL+dBGwatN7XzcPvxd4BOjK4/sCdwOzgCuANZvm2z9vs+eAkwHlunWBPwGz8zY6r4f/eWMfKe6rRwF/zf+HPwIr9zDvXsC0wvhl+f/aXPbVFp+xLgr7PfB54C5g9RZ1M4DDcv0s4Exg6UL9jsCteRtcB/xXoe6dwC35vZwHTG4TwyTg/jztXcBH2nxWNgVuJu2bTwLHF+o2y3E8B9zW+F/muhVz/I/l9/J7YDjdP19zgVWBI4FfFObdifQl5bn8f/rPpm30JeD2/D8/r7GN6OF41qtjaJUH6Dq9aJEscvnDwGda7MjHkA4wS+TX+1jwIey2LBZ82M7J//RlaP0BnAm8I0/zm8ZO0LzDNq+jeYcpLK+RLPYlHYTWBkYAvyUfAAtx/DTHtSHwcnEna1ruOaREtlye9x/AxJ7ibJq3C5gHfDNvs+2BF4EVmmPO43vz+mRxIfAm4O05zqvz+1qe9OGd0LSu40kH8PeTDs5vzfUnkA7IK+b3cjFwTNO838nzLtPivfS4TQuxrttmW3SrB7YkHbA3zuv8EXBt8/SkLwePAJvm8p1zHP8JLE5KYNc1zXcJMJJ05vdPYNtcdy7wFVLiWxp4bw+xNvaR4r56P/CWvM9MAY7tYd41SQe4FfN6nsrzPFIomw1s0eIz1kXen4Cvkw7oq7Ta10ifhzuAMXm5fy0s5515ve8GhpG+GM3I23lJUsL/Ammf3IX0xaunZLEr6SC9GPBx0j41uof3/jdgrzw8AtgsD68GPEPa/xcDPpjHG+/tUtKBfIUc0/vbHAeOZMFx4i05ng/m+b5M2jeWLGyjG3P8K5K+YOzf6XhW9jXULkO18hhpwzZ7BRhN+hb3SqRrm9FhWUdGxAsR8a8e6n8eEXdExAvA14CP9VOj4h6kbzUPRMRc0jew3Zour3wjIv4VEbeRvuls2LyQHMtuwGERMSciZgDHkb49lvUK8M28zS4jfUPqzTXh70bE8xFxJ+ng8Mf8vmYDl5MODEVfi4iXI+JPpA/hxyQJ2A/4QkQ8GxFzgG/n99YwHzgiz9vq/1Vmm/bGHsAZEXFLRLycl/ceSWML0+wK/ATYLiJuzGX7k5Lc3ZEu43wb2EjSmoX5jo2I5yLiYeAaYKNc/grpYL5qRLwUEX/pRbxnRsQ/8rY5v7DMbiLiIdIXrveR9qn78jx/LZQtCdzQw3ok6Xhga2B8RPyzTUwnRcQjEfEscDSwey7fD/hJRNwQEa9Gast7mfTtfjPSwfEHeZ+8ALippxVExK8j4rGImB8R55HO2DbtYfJXgHUlrRwRcyPi+ly+J3BZRFyWl3Ml6Qxke0mjge1IB/FZOaY/tXnPRR8HLo2IKyPiFdKZ8zLAfxemOTHH/yzpC9JGhVh7ezzrxskifQt4tkX590hZ+4+SHpA0qcSyHulF/UOknXjlUlG2t2peXnHZiwOjCmXF3ksvkr4JNVs5x9S8rNV6Ecsz0f3adE/r6smTheF/tRgvLmtWTrwND5G2xSrAssDU3JnhOeAPubzhnxHxUps4ymzT3ui2vJyAnqH7tj0YOD8i7iiUrQn8sPA+ngXUNF9P/9sv52lvlHSnpH17EW+Z/aWh0W6xBenyBsBfCmU35gTZykjSwf6Y/IWgnebPz6p5eE3gkMY2yttpTK5fFZjZdGAs/l+7kfQJSbcWlvMOev6MTiR9279H0k2SdizEs2tTPO8lHazHAM9GxKwO77WV5n1oPmmblNkX+nI862ZIJwtJ7yJt6Nd948rfrA+JiLVJ1wm/KGmrRnUPi+yUqccUhtcgZfunSaeWyxbiGkb3A1un5T5G2kGLy55H9wNtGU+z4NtocVkze7mcnnR7n8DC9qhaQdLwwvgapG3xNCmxvD0iRubX8hFRPOAN1DZtubwc90p037a7Ah+WdFCh7BHg04X3MTIilomI6zqtMCKeiIhPRcSqwKeBH1fUQ6uRLN7HgmTx50LZtW3mnUVqbzhT0uYd1tP8+XksDz8CHN20jZaNiHNJ7War5bPN4ryvk8/WfgocCKwUESNJZ7dqNX1E3BcRuwNvJl3SvCD/Xx8hXUUoxjM8Io7NdStKGtlqkR3ef/M+JNI26fj57HA8K2VIJgtJb8rfAiaTrgdOazHNjpLWzf+Q2cCrpEsXkA4Ya/dh1XtKWl/SsqTr+hdE6q74D2BpSTtIWoJ0XXqpwnxPAmOL3XybnAt8QdJakkaQLlWcF73sfZJjOR84WtJy+cPzReAX7ecs7Vbgo5KWzQetif2wzG9IWlLS+0gHnV/nb1w/BU6Q9GYASatJ2qYXy13Ybdq8j5wL7CNpI0lL5eXdkC/1NTwGbAUcJOkzuexU4DBJb8/vY3lJu5YJQNKuklbPo7NIB6P5bWbpq2tJlwe3IF1+ApgGrAWMp32yICKmkC7T/VZST5d8AA6QtLqkFUltMefl8p8C+0t6t5Lh+bO0HKldYR7weUlLSPooPV9WGk7aRv8EkLQP6cyiJUl7Slol72/P5eL5pM/LhyRtI2mYpKVzt9jVI+Jx0uXUH0taIcfU6Dn2JLCSpOV7WOX5wA6StsrHiUNIl9s6fnHocDwrZagli4slzSFl96+QGkf36WHa9YCrSNfc/wb8OCKuyXXHAF/Np5hf6sX6f05q4HuC1OD4eYB8+v1Z4GekbwkvkHrrNPw6/31G0i0tlntGXva1wIPAS8DnehFX0efy+h8gnXH9Ki+/P5xA6tn1JHA28MuFXN4TpIPgY3lZ+0fEPbnuUNJp9/WSnif9L3vTdrKw2/RI4Oy8j3wsIq4itVP9hvRtdx26t6EA6TcPpIQxSdInI+J3pG+tk/P7uIN0zbuMdwE3SJpLauw/KCIe6MV7KCUi/kE6wD4REc/lsvmkxtY3UeJglq/r70v6jG7cw2S/IvXMeoDUAP+tPO/NpJ5mJ5H2h+mkzhNExL+Bj+bxZ0nX/X/bQwx3kdro/kbaRzdgQfJrZVvgzrx9fwjsFqld8BFSx4TDSdvlEeB/WXC83Yt0Bn8PqWH+4Lz+e0hfKh7I+82qhXUREfeS2kN+RDp7/hDp5wD/bhNjQ7vjWSmN3j1mZrUlaQapJ91Vgx3LUDXUzizMzKwPnCzMzKwjX4YyM7OOfGZhZmYdLZI3vFt55ZVj7NixLeteeOEFhg8f3rKubhxrNRxrNRxrNQYy1qlTpz4dEau0rIyFvOdSHV+bbLJJ9OSaa67psa5uHGs1HGs1HGs1BjJW4ObwvaHMzKyvnCzMzKwjJwszM+vIycLMzDpysjAzs46cLMzMrCMnCzMz68jJwszMOnKyMDOzjhbJ230srLGTLh2U9c44dodBWa+ZWSc+szAzs46cLMzMrCMnCzMz68jJwszMOnKyMDOzjpwszMysIycLMzPrqLJkIWlpSTdKuk3SnZK+kcvXknSDpOmSzpO0ZC5fKo9Pz/VjC8s6LJffK2mbqmI2M7PWqjyzeBnYMiI2BDYCtpW0GfAd4ISIWBeYBUzM008EZuXyE/J0SFof2A14O7At8GNJwyqM28zMmlSWLPIjXefm0SXyK4AtgQty+dnAh/PwznmcXL+VJOXyyRHxckQ8CEwHNq0qbjMze71K2ywkDZN0K/AUcCVwP/BcRMzLkzwKrJaHVwMeAcj1s4GViuUt5jEzswFQ6b2hIuJVYCNJI4HfAW+ral2S9gP2Axg1ahRTpkxpOd3cuXN7rGs4ZIN5beur0hxXmVjrwrFWw7FWw7H23oDcSDAinpN0DfAeYKSkxfPZw+rAzDzZTGAM8KikxYHlgWcK5Q3FeYrrOA04DWDcuHHR1dXVMpYpU6bQU13D3oN1I8E9urqNl4m1LhxrNRxrNRxr71XZG2qVfEaBpGWADwJ3A9cAu+TJJgAX5uGL8ji5/v8iInL5brm31FrAesCNVcVtZmavV+WZxWjg7NxzaTHg/Ii4RNJdwGRJ3wL+Dpyepz8d+Lmk6cCzpB5QRMSdks4H7gLmAQfky1tmZjZAKksWEXE78M4W5Q/QojdTRLwE7NrDso4Gju7vGM3MrBz/gtvMzDpysjAzs46cLMzMrCMnCzMz68jJwszMOnKyMDOzjpwszMysIycLMzPrqGOykLS5pOF5eE9Jx0tas/rQzMysLsqcWZwCvChpQ+AQ0m3Gz6k0KjMzq5UyyWJevqHfzsBJEXEysFy1YZmZWZ2UuTfUHEmHAXsCW0hajPTUOzMzGyLKnFl8nPQ87YkR8QTpeRLfqzQqMzOrlY5nFjlBHF8Yfxi3WZiZDSllekN9VNJ9kmZLel7SHEnPD0RwZmZWD2XaLL4LfCgi7q46GDMzq6cybRZPOlGYmQ1tZc4sbpZ0HvB7UkM3ABHx26qCMjOzeimTLN4EvAhsXSgLwMnCzGyIKNMbap+BCMTMzOqrTG+o1SX9TtJT+fUbSasPRHBmZlYPZRq4zwQuAlbNr4tzmZmZDRFlksUqEXFmRMzLr7OAVSqOy8zMaqRMsngm35p8WH7tCTxTdWBmZlYfZZLFvsDHgCeAx4FdgI6N3pLGSLpG0l2S7pR0UC4/UtJMSbfm1/aFeQ6TNF3SvZK2KZRvm8umS5rU2zdpZmYLp0xvqIeAnfqw7HnAIRFxi6TlgKmSrsx1J0TE94sTS1of2A14O6lt5CpJb8nVJwMfBB4FbpJ0UUTc1YeYzMysD3pMFpK+HBHflfQj0u8quomIz7dbcEQ8TjoTISLmSLobWK3NLDsDkyPiZeBBSdOBTXPd9Ih4IMc1OU/rZGFmNkCUnmvUokL6UERcLGlCq/qIOLv0SqSxwLXAO4AvAnsDzwM3k84+Zkk6Cbg+In6R5zkduDwvYtuI+GQu3wt4d0Qc2LSO/YD9AEaNGrXJ5MmTW8Yyd+5cRowY0TbeaTNnl31r/WqD1ZbvNl4m1rpwrNVwrNVwrK2NHz9+akSMa1XX45lFRFycB1+MiF8X6yTtWnblkkYAvwEOjojnJZ0CHEU6WzkKOI7ULrJQIuI04DSAcePGRVdXV8vppkyZQk91DXtPunRhw+mTGXt0dRsvE2tdONZqONZqONbeK9PAfVjJsteRtAQpUfyycS+piHgyIl6NiPnAT1lwqWkmMKYw++q5rKdyMzMbIO3aLLYDtgdWk3RioepNpMbrtiQJOB24OyKOL5SPzu0ZAB8B7sjDFwG/knQ8qYF7PeBGQMB6ktYiJYndgP8p9/bMzKw/tOsN9RipTWEnYGqhfA7whRLL3hzYC5gm6dZcdjiwu6SNSJehZgCfBoiIOyWdT2q4ngccEBGvAkg6ELgCGAacERF3lli/mZn1k3ZtFrcBt0n6HfBC4cA9DFiq04Ij4i+ks4Jml7WZ52jg6Bbll7Wbz8zMqlWmzeKPwDKF8WWAq6oJx8zM6qhMslg6IuY2RvLwstWFZGZmdVMmWbwgaePGiKRNgH9VF5KZmdVNmSflHQz8WtJjpDaI/wA+XmVQZmZWL2XuDXWTpLcBb81F90bEK9WGZWZmdVLmSXnLAocCB0XEHcBYSTtWHpmZmdVG2Sfl/Rt4Tx6fCXyrsojMzKx2yiSLdSLiu8ArABHxIq1/P2FmZouoMsni35KWId+mXNI6wMuVRmVmZrVSpjfUEcAfgDGSfkm6jcfeVQZlZmb1UqY31JWSbgE2I11+Oiginq48MjMzq40yvaE2B16KiEuBkcDhktasOjAzM6uPMm0WpwAvStqQ9JS7+4FzKo3KzMxqpUyymBfp2as7AydHxMnActWGZWZmdVKmgXuOpMOAPYEtJC0GLFFtWGZmVidlziw+TuoqOzEiniA91vR7lUZlZma1UqY31BPA8YXxh3GbhZnZkFLmzMLMzIY4JwszM+vIycLMzDrq2GYhaT3gGGB9YOlGeUSsXWFcZmZWI2VvUX4KMA8YT2rc/kWVQZmZWb2USRbLRMTVgCLioYg4Etih2rDMzKxOyiSLl/MP8e6TdKCkjwAjOs0kaYykayTdJelOSQfl8hUlXSnpvvx3hVwuSSdKmi7pdkkbF5Y1IU9/n6QJfXyvZmbWR2WSxUHAssDngU2AvYAyB+x5wCERsT7pjrUHSFofmARcHRHrAVfncYDtgPXyaz/SpS8krUi6Tfq7gU2BIxoJxszMBkaZH+XdlAfnAvuUXXBEPA48nofnSLobWI10j6muPNnZwBTSM753Bs7J96G6XtJISaPztFdGxLMAkq4EtgXOLRuLmZktnB6ThaQfRMTBki4mPyWvKCJ2KrsSSWOBdwI3AKNyIgF4AhiVh1cDHinM9mgu66nczMwGSLszi5/nv99fmBVIGgH8Bjg4Ip6XFjy+OyJC0usSUR/Xsx/p8hWjRo1iypQpLaebO3duj3UNh2wwrz9C6rXmuMrEWheOtRqOtRqOtfd6TBYRMTX//VNfFy5pCVKi+GVE/DYXPylpdEQ8ni8zPZXLZwJjCrOvnstmsuCyVaN8Sot4TwNOAxg3blx0dXU1TwKkA3JPdQ17T7q0bX1VZuzR1W28TKx14Vir4Vir4Vh7r8cGbknTcq+klq9OC1Y6hTgduDsiji9UXcSCBvIJwIWF8k/kXlGbAbPz5aorgK0lrZAbtrfOZWZmNkDaXYbaMf89IP9tXJbakxZtGC1sTuo5NU3SrbnscOBY4HxJE4GHgI/lusuA7YHpwIvkxvSIeFbSUUCjof2bjcZuMzMbGO0uQz0EIOmDEfHOQtWhkm5hQZfXnub/C6AeqrdqMX2wIDE1150BnNFufWZmVp0yv7OQpM0LI/9dcj4zM1tElHms6kTgDEnL5/HngH0ri8jMzGqnzI/ypgIbNpJFRMyuPCozM6uVjpeTJI2SdDowOSJmS1o/N06bmdkQUabt4SxSV9VV8/g/gIMrisfMzGqoTLJYOSLOB+YDRMQ84NVKozIzs1opkyxekLQS+bcVjR/MVRqVmZnVSpneUF8k/bp6HUl/BVYBdqk0KjMzq5UyvaFukfR+4K2kH9ndGxGvVB6ZmZnVRsdkIWlp4LPAe0mXov4s6dSIeKnq4MzMrB7KXIY6B5gD/CiP/w/pPlG7VhWUmZnVS5lk8Y78aNSGayTdVVVAZmZWP2V6Q92Se0ABIOndwM3VhWRmZnVT5sxiE+A6SQ/n8TWAeyVNI90s9r8qi87MzGqhTLLYtvIozMys1npMFpLeFBHPkxq3X8cPIDIzGzranVn8ivS0vKmkLrPFBxkFsHaFcZmZWY20e1LejvnvWs11+fnaZmY2RJS5Rfk3m8YXA35RWURmZlY7ZbrOjpF0GICkpYDfAfdVGpWZmdVKmWSxL7BBThgXA9dExJGVRmVmZrXSrjfUxoXRHwI/Af4KXCtp44i4pergzMysHtr1hjquaXwWsH4uD2DLqoIyM7N6adcbavxABmJmZvXVY5uFpD3z3y+2enVasKQzJD0l6Y5C2ZGSZkq6Nb+2L9QdJmm6pHslbVMo3zaXTZc0qe9v1czM+qrdZajh+e9yfVz2WcBJpFucF50QEd8vFkhaH9gNeDuwKnCVpLfk6pOBDwKPAjdJuigifNdbM7MB1C5ZPCnpzRHxjb4sOCKulTS25OQ7A5Mj4mXgQUnTgU1z3fSIeABA0uQ8rZOFmdkAUkS0rpAuAN4DvAhcR+oJdV1E3NFyhtbLGAtcEhHvyONHAnsDz5Nuc35IRMySdBJwfUT8Ik93OnB5Xsy2EfHJXL4X8O6IOLDFuvYD9gMYNWrUJpMnT24Z09y5cxkxYkTbuKfNnF32LfarDVZbvtt4mVjrwrFWw7FWw7G2Nn78+KkRMa5VXbsG7l0AJK1FShr/DXxa0hrATRGxfU/ztnEKcBSpN9VRpJ5V+/ZhOa3iPQ04DWDcuHHR1dXVcropU6bQU13D3pMu7Y+Qem3GHl3dxsvEWheOtRqOtRqOtfc63qI8Ih7Mv9xeJr+Wzn97LSKebAxL+ilwSR6dCYwpTLp6LqNNuZmZDZB2vaEOl3SxpOuBw4AlSQ3W/9XXbrWSRhdGPwI0LmldBOwmaal8JrMecCNwE7CepLUkLUlqBL+oL+s2M7O+a3dm8QngBdItPq4DboiI0hfzJZ0LdAErS3oUOALokrQR6TLUDODTABFxp6TzSQ3X84ADIuLVvJwDgSuAYcAZEXFnL96fmZn1g3ZtFm+TtCKpraILmCRpBHAbqaH7zHYLjojdWxSf3mb6o4GjW5RfBlzWbl1mZlattm0W+Wl4l0j6A+lZ3FuQzgb2BdomCzMzW3S0u5HgTqSzis1JP5a7k9R99hDSZSkzMxsi2p1Z7E1KDl8GpkbEvwckIjMzq512bRYfHchAzMysvso8/MjMzIY4JwszM+uo3Y/yrs5/vzNw4ZiZWR21a+AeLem/gZ3y3V5VrPRjVc3Mho52yeLrwNdI92M6vqnOj1U1MxtC2vWGugC4QNLXIuKoAYzJzMxqpsxdZ4/KP9DbIhdNiYhL2s1jZmaLlo69oSQdAxxEusnfXcBBkr5ddWBmZlYfHc8sgB2AjSJiPoCks4G/A4dXGZiZmdVH2d9ZjCwML9/TRGZmtmgqc2ZxDPB3SdeQus9uAUyqNCozM6uVMg3c50qaArwrFx0aEU9UGpWZmdVKmTMLIuJx/DhTM7Mhy/eGMjOzjpwszMyso7bJQtIwSfcMVDBmZlZPbZNFRLwK3CtpjQGKx8zMaqhMA/cKwJ2SbgReaBRGxE6VRWVmZrVSJll8rfIozMys1sr8zuJPktYE1ouIqyQtCwyrPjQzM6uLMjcS/BRwAfCTXLQa8PsS850h6SlJdxTKVpR0paT78t8VcrkknShpuqTbJW1cmGdCnv4+SRN6+f7MzKwflOk6ewCwOfA8QETcB7y5xHxnAds2lU0Cro6I9YCrWXDbkO2A9fJrP+AUSMkFOAJ4N7ApcEQjwZiZ2cApkyxejoh/N0YkLU56Ul5bEXEt8GxT8c7A2Xn4bODDhfJzIrkeGClpNLANcGVEPBsRs4AreX0CMjOziimi/XFf0neB54BPAJ8DPgvcFRFf6bhwaSxwSUS8I48/FxEj87CAWRExUtIlwLER8ZdcdzVwKNAFLB0R38rlXwP+FRHfb7Gu/UhnJYwaNWqTyZMnt4xp7ty5jBgxom3c02bO7vTWKrHBat1v6Fsm1rpwrNVwrNVwrK2NHz9+akSMa1VXpjfUJGAiMA34NHAZ8LOFDSoiQlLHM5ReLO804DSAcePGRVdXV8vppkyZQk91DXtPurS/wuqVGXt0dRsvE2tdONZqONZqONbeK9Mban5+4NENpMtP90an05GePSlpdEQ8ni8zPZXLZwJjCtOtnstmks4uiuVT+rhuMzProzK9oXYA7gdOBE4Cpkvaro/ruwho9GiaAFxYKP9E7hW1GTA73+n2CmBrSSvkhu2tc5mZmQ2gMpehjgPGR8R0AEnrAJcCl7ebSdK5pLOClSU9SurVdCxwvqSJwEPAx/LklwHbA9OBF4F9ACLiWUlHATfl6b4ZEc2N5mZmVrEyyWJOI1FkDwBzOs0UEbv3ULVVi2mD1EW31XLOAM4oEaeZmVWkx2Qh6aN58GZJlwHnk9osdmXBN30zMxsC2p1ZfKgw/CTw/jz8T2CZyiIyM7Pa6TFZRMQ+AxmImZnVV8c2C0lrkX6MN7Y4vW9RbmY2dJRp4P49cDpwMTC/0mjMzKyWyiSLlyLixMojMTOz2iqTLH4o6Qjgj8DLjcKIuKWyqMzMrFbKJIsNgL2ALVlwGSryuJmZDQFlksWuwNrF25SbmdnQUuZ5FncAIyuOw8zMaqzMmcVI4B5JN9G9zcJdZ83MhogyyeKIyqMwM7NaK/M8iz8NRCBmZlZfZX7BPYcFz9xeElgCeCEi3lRlYGZmVh9lziyWawzn52bvDGxWZVBmZlYvZXpDvSaS3wPbVBOOmZnVUZnLUB8tjC4GjANeqiwiMzOrnTK9oYrPtZgHzCBdijIzsyGiTJuFn2thZjbEtXus6tfbzBcRcVQF8ZiZWQ21O7N4oUXZcGAisBLgZGFmNkS0e6zqcY1hScsBBwH7AJOB43qaz8zMFj1t2ywkrQh8EdgDOBvYOCJmDURgZmZWHz3+zkLS94CbgDnABhFxZH8lCkkzJE2TdKukm3PZipKulHRf/rtCLpekEyVNl3S7pI37IwYzMyuv3Y/yDgFWBb4KPCbp+fyaI+n5flj3+IjYKCLG5fFJwNURsR5wdR4H2A5YL7/2A07ph3WbmVkvtGuz6NWvu/vBzkBXHj4bmAIcmsvPiYgArpc0UtLoiHh8gOMzMxuylI7BA7xS6UFgFukGhT+JiNMkPRcRI3O9gFkRMVLSJcCxEfGXXHc1cGhE3Ny0zP1IZx6MGjVqk8mTJ7dc99y5cxkxYkTb+KbNnL0wb6/PNlht+W7jZWKtC8daDcdaDcfa2vjx46cWrvZ0U+YX3FV4b0TMlPRm4EpJ9xQrIyIk9SqLRcRpwGkA48aNi66urpbTTZkyhZ7qGvaedGlvVt1vZuzR1W28TKx14Vir4Vir4Vh7b6AvNQEQETPz36eA3wGbAk9KGg2Q/z6VJ58JjCnMvnouMzOzATLgyULS8Py7DSQNB7YmPef7ImBCnmwCcGEevgj4RO4VtRkw2+0VZmYDazAuQ40CfpeaJVgc+FVE/CE/4/t8SROBh4CP5ekvA7YHpgMvkn4YaGZmA2jAk0VEPABs2KL8GWCrFuUBHDAAoQ26sU1tJYdsMG/A2k9mHLvDgKzHzN6YBqXNwszM3licLMzMrCMnCzMz68jJwszMOnKyMDOzjpwszMysIycLMzPryMnCzMw6crIwM7OOnCzMzKwjJwszM+vIycLMzDpysjAzs46cLMzMrCMnCzMz68jJwszMOnKyMDOzjgbjsapWQ81P6eutvj7Vz0/oM3tj8JmFmZl15GRhZmYdOVmYmVlHThZmZtaRk4WZmXXkZGFmZh29YbrOStoW+CEwDPhZRBw7yCFZP1jYLrt90dduvv3F3YXtjegNcWYhaRhwMrAdsD6wu6T1BzcqM7Oh441yZrEpMD0iHgCQNBnYGbhrUKMy64PenE0N9llQb7SL1WdTb3yKiMGOoSNJuwDbRsQn8/hewLsj4sDCNPsB++XRtwL39rC4lYGnKwy3PznWajjWajjWagxkrGtGxCqtKt4oZxYdRcRpwGmdppN0c0SMG4CQFppjrYZjrYZjrUZdYn1DtFkAM4ExhfHVc5mZmQ2AN0qyuAlYT9JakpYEdgMuGuSYzMyGjDfEZaiImCfpQOAKUtfZMyLizj4uruOlqhpxrNVwrNVwrNWoRaxviAZuMzMbXG+Uy1BmZjaInCzMzKyjIZMsJG0r6V5J0yVNqkE8YyRdI+kuSXdKOiiXryjpSkn35b8r5HJJOjHHf7ukjQch5mGS/i7pkjy+lqQbckzn5c4HSFoqj0/P9WMHOM6Rki6QdI+kuyW9p67bVdIX8v//DknnSlq6LttV0hmSnpJ0R6Gs19tR0oQ8/X2SJgxgrN/L+8Dtkn4naWSh7rAc672StimUV36caBVroe4QSSFp5Tw+qNu1m4hY5F+kRvH7gbWBJYHbgPUHOabRwMZ5eDngH6RbmXwXmJTLJwHfycPbA5cDAjYDbhiEmL8I/Aq4JI+fD+yWh08FPpOHPwucmod3A84b4DjPBj6Zh5cERtZxuwKrAQ8CyxS259512a7AFsDGwB2Fsl5tR2BF4IH8d4U8vMIAxbo1sHge/k4h1vXzMWApYK18bBg2UMeJVrHm8jGkTjwPASvXYbt2i6/qD0QdXsB7gCsK44cBhw12XE0xXgh8kPTL89G5bDRwbx7+CbB7YfrXphug+FYHrga2BC7JO+/ThQ/ja9s47/DvycOL5+k0QHEunw/Aaiqv3XYlJYtH8gd+8bxdt6nTdgXGNh2Ae7Udgd2BnxTKu01XZaxNdR8BfpmHu33+G9t1II8TrWIFLgA2BGawIFkM+nZtvIbKZajGh7Lh0VxWC/lywjuBG4BREfF4rnoCGJWHB/s9/AD4MjA/j68EPBcR81rE81qsuX52nn4grAX8EzgzXzL7maTh1HC7RsRM4PvAw8DjpO00lXpu14bebsfB3m8b9iV9Q4caxippZ2BmRNzWVFWbWIdKsqgtSSOA3wAHR8TzxbpIXxkGvW+zpB2BpyJi6mDHUsLipFP8UyLincALpMslr6nRdl2BdEPMtYBVgeHAtoMaVC/UZTt2IukrwDzgl4MdSyuSlgUOB74+2LG0M1SSRS1vFyJpCVKi+GVE/DYXPylpdK4fDTyVywfzPWwO7CRpBjCZdCnqh8BISY0fdhbjeS3WXL888MwAxfoo8GhE3JDHLyAljzpu1w8AD0bEPyPiFeC3pG1dx+3a0NvtOKifPUl7AzsCe+TkRpuYBivWdUhfGG7Ln7HVgVsk/UedYh0qyaJ2twuRJOB04O6IOL5QdRHQ6NkwgdSW0Sj/RO4dsRkwu3A5oFIRcVhErB4RY0nb7v8iYg/gGmCXHmJtvIdd8vQD8g00Ip4AHpH01ly0FelW9rXbrqTLT5tJWjbvD41Ya7ddC3q7Ha8Atpa0Qj6T2jqXVU7pgWlfBnaKiBeb3sNuuXfZWsB6wI0M0nEiIqZFxJsjYmz+jD1K6vzyBHXarlU2iNTpRepV8A9Sb4ev1CCe95JO4W8Hbs2v7UnXoK8G7gOuAlbM04v0AKj7gWnAuEGKu4sFvaHWJn3IpgO/BpbK5Uvn8em5fu0BjnEj4Oa8bX9P6i1Sy+0KfAO4B7gD+Dmph04ttitwLqkt5RXSAWxiX7Yjqb1gen7tM4CxTidd1298vk4tTP+VHOu9wHaF8sqPE61ibaqfwYIG7kHdrsWXb/dhZmYdDZXLUGZmthCcLMzMrCMnCzMz68jJwszMOnKyMDOzjpws7A0v36XzuML4lyQd2U/LPkvSLp2nXOj17Kp0h9xrmsrHSvqfEvPvLemk6iK0oc7JwhYFLwMfbdzWuS4Kv8IuYyLwqYgY31Q+FuiYLMyq5mRhi4J5pOcUf6G5ovnMQNLc/LdL0p8kXSjpAUnHStpD0o2Spklap7CYD0i6WdI/8n2yGs/2+J6km/JzBj5dWO6fJV1E+jV2czy75+XfIek7uezrpB9pni7pe02zHAu8T9KtSs++WFrSmXkZf5fUnFyQtIOkv0laWdLWefgWSb/O9yJD0gxJ38jl0yS9LZe/P6/r1rz85cr/G2xR5mRhi4qTgT0kLd+LeTYE9gf+E9gLeEtEbAr8DPhcYbqxwKbADsCpkpYmnQnMjoh3Ae8CPpVvHQHpXlQHRcRbiiuTtCrpuQpbkn5l/i5JH46Ib5J+cb5HRPxvU4yTgD9HxEYRcQJwAOkefhuQblN9do6nsY6P5Hm2z0VfBT4QERvndXyxsOync/kpwJdy2ZeAAyJiI+B9wL/ab0IbKpwsbJEQ6Y695wCf78VsN0XE4xHxMul2Cn/M5dNICaLh/IiYHxH3kR4y8zbSvXg+IelW0q3lVyLdYwjgxoh4sMX63gVMiXTjwMZdULfoRbyQzkB+ARAR95AelNNISlsChwI7RMQs0sNy1gf+muOcAKxZWFbj5pVTC+/3r8Dxkj4PjIwFt0q3Ic7JwhYlPyB94x9eKJtH3s8lLUZ6AlrDy4Xh+YXx+aRbnTc03xMnSPfs+Vz+xr9RRKwVEY1k88LCvImFcD/pqYuN5CHgykKM60fExML0jff7Kvn9RsSxwCeBZUhJ5m0DE7rVnZOFLTIi4lnSI0mLB8QZwCZ5eCdgiT4seldJi+V2jLVJN5+7AviM0m3mkfQWpYcstXMj8P7cljCMdBnpTx3mmUNKAA1/BvZorBNYI8cD6Szj/wHnSHo7cD2wuaR18/TD8zw9krROpLugfod0F1YnCwOcLGzRcxxQ7BX1U9IB+jbSYzP78q3/YdKB/nJg/4h4idSucRfpuQN3kB5r2bb3U6RbS08i3YL8NmBqRFzYbh7SnXNflXSbpC8APwYWkzQNOA/YO19Ga6zjHlIy+TXwJtIzvc+VdDvwNzof/A/Oje+3k+6KenmH6W2I8F1nzcysI59ZmJlZR04WZmbWkZOFmZl15GRhZmYdOVmYmVlHThZmZtaRk4WZmXX0/wFZfduL32Si2AAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "from matplotlib import pyplot as plt\n", + "\n", + "df = pd.read_csv('olympics-data/olympics_sections.csv')\n", + "df[['tokens']].hist()\n", + "# add axis descriptions and title\n", + "plt.xlabel('Number of tokens')\n", + "plt.ylabel('Number of Wikipedia sections')\n", + "plt.title('Distribution of number of tokens in Wikipedia sections')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the majority of section are fairly short (less than 500 tokens)." + ] + } + ], + "metadata": { + "interpreter": { + "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8" + }, + "kernelspec": { + "display_name": "Python 3.7.3 64-bit ('base': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/fine-tuned_qa/olympics-2-create-qa.ipynb b/examples/fine-tuned_qa/olympics-2-create-qa.ipynb new file mode 100644 index 00000000..7b279f05 --- /dev/null +++ b/examples/fine-tuned_qa/olympics-2-create-qa.ipynb @@ -0,0 +1,756 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Creating a synthetic Q&A dataset\n", + "We use [`davinci-instruct-beta-v2`](https://beta.openai.com/docs/engines/instruct-series-beta), a model specialized in following instructions, to create questions based on the given context. Then we also use [`davinci-instruct-beta-v2`](https://beta.openai.com/docs/engines/instruct-series-beta) to answer those questions, given the same context. \n", + "\n", + "This is expensive, and will also take a long time, as we call the davinci engine for each section. You can simply download the final dataset instead.\n", + "\n", + "We're using the dataset created using the [previous notebook](olympics-1-collect-data.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.1 Read in the data, and create a context\n", + "Create a context by concatenating the title, the heading and the content of that section" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleheadingcontenttokenscontext
02020 Summer OlympicsSummaryThe 2020 Summer Olympics (Japanese: 2020年夏季オリン...7132020 Summer Olympics\\nSummary\\n\\nThe 2020 Summ...
12020 Summer OlympicsHost city selectionThe International Olympic Committee (IOC) vote...1262020 Summer Olympics\\nHost city selection\\n\\nT...
22020 Summer OlympicsImpact of the COVID-19 pandemicIn January 2020, concerns were raised about th...3692020 Summer Olympics\\nImpact of the COVID-19 p...
32020 Summer OlympicsQualifying event cancellation and postponementConcerns about the pandemic began to affect qu...2982020 Summer Olympics\\nQualifying event cancell...
42020 Summer OlympicsEffect on doping testsMandatory doping tests were being severely res...1632020 Summer Olympics\\nEffect on doping tests\\n...
\n", + "
" + ], + "text/plain": [ + " title heading \\\n", + "0 2020 Summer Olympics Summary \n", + "1 2020 Summer Olympics Host city selection \n", + "2 2020 Summer Olympics Impact of the COVID-19 pandemic \n", + "3 2020 Summer Olympics Qualifying event cancellation and postponement \n", + "4 2020 Summer Olympics Effect on doping tests \n", + "\n", + " content tokens \\\n", + "0 The 2020 Summer Olympics (Japanese: 2020年夏季オリン... 713 \n", + "1 The International Olympic Committee (IOC) vote... 126 \n", + "2 In January 2020, concerns were raised about th... 369 \n", + "3 Concerns about the pandemic began to affect qu... 298 \n", + "4 Mandatory doping tests were being severely res... 163 \n", + "\n", + " context \n", + "0 2020 Summer Olympics\\nSummary\\n\\nThe 2020 Summ... \n", + "1 2020 Summer Olympics\\nHost city selection\\n\\nT... \n", + "2 2020 Summer Olympics\\nImpact of the COVID-19 p... \n", + "3 2020 Summer Olympics\\nQualifying event cancell... \n", + "4 2020 Summer Olympics\\nEffect on doping tests\\n... " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df = pd.read_csv('olympics-data/olympics_sections.csv')\n", + "df['context'] = df.title + \"\\n\" + df.heading + \"\\n\\n\" + df.content\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.2 Create questions based on the context\n", + "Use davinci-instruct to generate a number of plausible questions relating to the Wikipedia section contents.\n", + "\n", + "Note: We have used temperature=0, but it may be beneficial to experiment with a higher temperature to get a higher diversity of questions.\n", + "\n", + "**WARNING: This step will last a long time, and consume a lot of tokens, as it calls davinci-instruct for every section to generate a number of questions.**" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. What is the 2020 Summer Olympics?\n", + "2. When did the 2020 Summer Olympics take place?\n", + "3. Who won the most medals at the 2020 Summer Olympics?\n", + "4. Who won the most gold medals at the 2020 Summer Olympics?\n", + "5. Who won the most medals at the 2020 Summer Olympics?\n" + ] + } + ], + "source": [ + "import openai\n", + "\n", + "def get_questions(context):\n", + " try:\n", + " response = openai.Completion.create(\n", + " engine=\"davinci-instruct-beta-v2\",\n", + " prompt=f\"Write questions based on the text below\\n\\nText: {context}\\n\\nQuestions:\\n1.\",\n", + " temperature=0,\n", + " max_tokens=257,\n", + " top_p=1,\n", + " frequency_penalty=0,\n", + " presence_penalty=0,\n", + " stop=[\"\\n\\n\"]\n", + " )\n", + " return response['choices'][0]['text']\n", + " except:\n", + " return \"\"\n", + "\n", + "\n", + "df['questions']= df.context.apply(get_questions)\n", + "df['questions'] = \"1.\" + df.questions\n", + "print(df[['questions']].values[0][0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The prompt is designed to generate a number of questions. Example questions above were generated based on the summary section of the 2020 Summer Olympics page.\n", + "\n", + "We can observe that the questions 3 and 5 above repeat. Sometimes the generated questions could be ambiguous without the context. We will show that even despite these limitations we can create a successful model." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The 2020 Summer Olympics (Japanese: 2020年夏季オリンピック, Hepburn: Nisen Nijū-nen Kaki Orinpikku), officially the Games of the XXXII Olympiad (第三十二回オリンピック競技大会, Dai Sanjūni-kai Orinpikku Kyōgi Taikai) and branded as Tokyo 2020 (東京2020, Tōkyō Nii Zero Nii Zero), was an international multi-sport event held from 23 July to 8 August 2021 in Tokyo, Japan, with some preliminary events that began on 21 July.\n", + "Tokyo was selected as the host city during the 125th IOC Session in Buenos Aires, Argentina, on 7 September 2013. Originally scheduled to take place from 24 July to 9 August 2020, the event was postponed to 2021 in March 2020 as a result of the COVID-19 pandemic, the first such instance in the history of the Olympic Games (previous games had been cancelled but not rescheduled). However, the event retained the Tokyo 2020 name for marketing and branding purposes. It was largely held behind closed doors with no public spectators permitted due to the declaration of a state of emergency in the Greater Tokyo Area in response to the pandemic. The Summer Paralympics were held between 24 August and 5 September 2021, 16 days after the completion of the Olympics.The 2020 Games were the fourth Olympic Games to be held in Japan, following the Tokyo 1964 (Summer), Sapporo 1972 (Winter) and Nagano 1998 (Winter) games. Tokyo is the first city in Asia to hold the Summer Games twice. The 2020 Games were the second of three consecutive Olympics to be held in East Asia, following the 2018 Winter Olympics in Pyeongchang, South Korea and preceding the 2022 Winter Olympics in Beijing, China.\n", + "New events were introduced in existing sports for 2020, including 3x3 basketball, freestyle BMX and mixed gender team events in a number of existing sports, as well as the return of madison cycling for men and an introduction of the same event for women. New IOC policies also allowed the host organizing committee to add new sports to the Olympic program for just one Games. The disciplines added by the Japanese Olympic Committee were baseball and softball, karate, sport climbing, surfing and skateboarding, the last four of which made their Olympic debuts, and the last three of which will remain on the Olympic program.The United States topped the medal count by both total golds (39) and total medals (113), with China finishing second by both respects (38 and 88). Host nation Japan finished third, setting a record for the most gold medals and total medals ever won by their delegation at an Olympic Games with 27 and 58. Great Britain finished fourth, with a total of 22 gold and 65 medals, becoming the first nation at the Summer Olympics to increase or equal their total medals won in the two Games subsequent to hosting them. The Russian delegation competing as the ROC (not to be confused with the Republic of China (Taiwan) which competed as Chinese Taipei, not ROC) finished fifth with 20 gold medals and third in the overall medal count, with 71 medals. Bermuda, the Philippines and Qatar won their first-ever Olympic gold medals. Burkina Faso, San Marino and Turkmenistan won their first-ever Olympic medals.\n" + ] + } + ], + "source": [ + "print(df.content.values[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.3 Create answers based on the context\n", + "Use davinci-instruct to answer the questions given the relevant Wikipedia section contents\n", + "\n", + "Note: We have used temperature=0, but it may be beneficial to experiment with a higher temperature to get a higher diversity of questions.\n", + "\n", + "**WARNING: This step will last a long time, and consume a lot of tokens, as it calls davinci-instruct for every section to answer all the questions.**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. The 2020 Summer Olympics is an international multi-sport event held from 23 July to 8 August 2021 in Tokyo, Japan.\n", + "2. The 2020 Summer Olympics took place from 23 July to 8 August 2021.\n", + "3. The United States topped the medal count by both total golds (39) and total medals (113), with China finishing second by both respects (38 and 88).\n", + "4. The United States topped the medal count by both total golds (39) and total medals (113), with China finishing second by both respects (38 and 88).\n", + "5. The United States topped the medal count by both total golds (39) and total medals (113), with China finishing second by both respects (38 and 88).\n" + ] + } + ], + "source": [ + "def get_answers(row):\n", + " try:\n", + " response = openai.Completion.create(\n", + " engine=\"davinci-instruct-beta-v2\",\n", + " prompt=f\"Write questions based on the text below\\n\\nText: {row.context}\\n\\nQuestions:\\n{row.questions}\\n\\nAnswers:\\n1.\",\n", + " temperature=0,\n", + " max_tokens=257,\n", + " top_p=1,\n", + " frequency_penalty=0,\n", + " presence_penalty=0\n", + " )\n", + " return response['choices'][0]['text']\n", + " except Exception as e:\n", + " print (e)\n", + " return \"\"\n", + "\n", + "\n", + "df['answers']= df.apply(get_answers, axis=1)\n", + "df['answers'] = \"1.\" + df.answers\n", + "df = df.dropna().reset_index().drop('index',axis=1)\n", + "print(df[['answers']].values[0][0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These are the answers to the questions above based on the context around the host city selection. \n", + "\n", + "We can see that answers 3-5 contain the correct answer, but instead of answering the question directly, the answer is a verbatim extraction. Despite these occasional lower quality answers, we will show that the model can learn the task reasonably well, given a high number of examples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.4 Save the Olympics Q&A dataset based on Wikipedia sections\n", + "We save the file for use in the [next notebook](olympics-3-train-qa.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('olympics-data/olympics_qa.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.5 Search file (DEPRECATED)\n", + "We create a search file ([API reference](https://beta.openai.com/docs/api-reference/files/list)), which can be used to retrieve the relevant context when a question is asked.\n", + "\n", + "**DEPRECATED: The /search endpoint is deprecated in favour of using embeddings. Embeddings are cheaper, faster and can support a better search experience. See [Question Answering Guide](https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb) for a search implementation using the embeddings**\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df.tokens<2000]\n", + "df[['context', 'tokens']].rename(columns={'context':'text','tokens':'metadata'}).to_json('olympics-data/olympics_search.jsonl', orient='records', lines=True)\n", + "\n", + "search_file = openai.File.create(\n", + " file=open(\"olympics-data/olympics_search.jsonl\"),\n", + " purpose='search'\n", + ")\n", + "olympics_search_fileid = search_file['id']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.6 Answer questions based on the context provided\n", + "\n", + "We will use a simple implementation of the answers endpoint. This works by simply using the [/search endpoint](https://beta.openai.com/docs/api-reference/searches), which searches over an indexed file to obtain the relevant sections which can be included in the context, following by a question and answering prompt given a specified model." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Athletics at the 2020 Summer Olympics – Women's 4 × 100 metres relay\n", + "Summary\n", + "\n", + "The women's 4 × 100 metres relay event at the 2020 Summer Olympics took place on 5 and 6 August 2021 at the Japan National Stadium. There were 16 competing relay teams, with each team having 5 members from which 4 were selected in each round.\n", + "\n", + "###\n", + "\n", + "Athletics at the 2020 Summer Olympics – Men's 4 × 100 metres relay\n", + "Qualification\n", + "\n", + "National Olympic Committees (NOCs) could qualify one relay team in one of three following ways:\n", + "The top 8 NOCs at the 2019 World Athletics Championships qualified a relay team.\n", + "The top 8 NOCs at the 2021 World Athletics Relays qualified a relay team.\n", + "Where an NOC placed in the top 8 at both the 2019 World Championships and the 2021 World Relays, the quota place was allocated to the world top list as of 29 June 2021. In this case, 4 teams did so, so there are 4 places available through the world rankings.A total of five athletes may be entered for a relay team. Should a NOC have also entered individual athletes in the corresponding individual event (100 m), the entered individual athletes must be included in the total of five (5) athletes entered for the relay event. In addition of five, NOCs can nominate a maximum of one alternate athlete for each team.\n", + "The qualifying period was originally from 1 May 2019 to 29 June 2020. Due to the COVID-19 pandemic, the period was suspended from 6 April 2020 to 30 November 2020, with the end date extended to 29 June 2021. The qualifying time standards could be obtained in various meets during the given period that have the approval of the IAAF. Both indoor and outdoor meets are eligible. The most recent Area Championships may be counted in the ranking, even if not during the qualifying period.\n" + ] + } + ], + "source": [ + "from answers_with_ft import create_context, answer_question\n", + "print(create_context(\"Where did women's 4 x 100 metres relay event take place during the 2020 Summer Olympics?\", olympics_search_fileid, max_len=400))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' Japan National Stadium'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer_question(olympics_search_fileid, \"davinci-instruct-beta-v2\", \n", + " \"Where did women's 4 x 100 metres relay event take place during the 2020 Summer Olympics?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we fine-tune the model for Q&A we'll be able to use it instead of [`davinci-instruct-beta-v2`](https://beta.openai.com/docs/engines/instruct-series-beta), to obtain better answers when the question can't be answered based on the context. We see a downside of [`davinci-instruct-beta-v2`](https://beta.openai.com/docs/engines/instruct-series-beta), which always attempts to answer the question, regardless of the relevant context being present or not. (Note the second question is asking about a future event, set in 2024.)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' Japan National Stadium'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer_question(olympics_search_fileid, \"davinci-instruct-beta-v2\", \n", + " \"Where did women's 4 x 100 metres relay event take place during the 2048 Summer Olympics?\", max_len=1000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that davinci has a tendency to answer the question, even if the question can't be answered given the context provided. Note the question asked regarding 2048 Summer Olympics, which didn't happen yet, and the retrieved content has only returned results for 2020." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.7 (Optional) Investigation into how likely the search endpoint is to return the relevant context" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 58)\n" + ] + } + ], + "source": [ + "def check_context(title, heading, question, max_len=1800, search_model='ada', max_rerank=10):\n", + " \"\"\"\n", + " Evaluate the performance of the search model in retrieving the correct context\n", + "\n", + " Parameters\n", + " ----------\n", + " title: str\n", + " The title of the Wikipedia page\n", + " heading: str\n", + " The heading of the Wikipedia section\n", + " qusetion: str\n", + " The question\n", + " max_len: int\n", + " The maximum length of the context\n", + " search_model: str\n", + " The search model to use - `ada` is most cost effective\n", + " max_rerank: int\n", + " The maximum number of reranking documents to use the search model on\n", + "\n", + " Returns\n", + " -------\n", + " rank: int\n", + " The rank of the correct context\n", + " token_length: int\n", + " The number of tokens needed to obtain the correct context\n", + " \"\"\"\n", + " \n", + " try:\n", + " results = openai.Engine(search_model).search(\n", + " search_model=search_model, \n", + " query=question, \n", + " max_rerank=max_rerank,\n", + " file=olympics_search_fileid,\n", + " return_metadata=True\n", + " )\n", + " index=-1\n", + " returns = []\n", + " cur_len = 0\n", + " for result in results['data']:\n", + " cur_len += int(result['metadata']) + 4 # we add 4 tokens for the separator `\\n\\n###\\n\\n`\n", + " if cur_len > max_len:\n", + " break\n", + " returns.append(result['text'])\n", + " res = result['text'].split('\\n')\n", + " if res[0] == title and res[1] == heading:\n", + " index = len(returns) - 1\n", + " break\n", + " return index, cur_len\n", + " except Exception as e:\n", + " #print (e)\n", + " return []\n", + "print(check_context(\"Athletics at the 2020 Summer Olympics – Women's 4 × 100 metres relay\", \"Summary\", \"Where did women's 4 x 100 metres relay event take place during the 2020 Summer Olympics?\", max_len=10000))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We utilize the generated questions based on context to estimate how often we can retrieve the original context. These questions are noisy, so this is not a perfect estimate.\n", + "\n", + "Our questions and answers are prefixed with numbered bullet points, however due to the way they were generated, they are missing the first number, hence we add \"1.\" to the list of questions (and answers).\n", + "\n", + "We calculate the rank of the section retrieved using ada search, and the number of tokens in the context needed to retrieve the relevant section in full." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [(132, 27104), (-1, 22939), (8, 2151), (2, 121...\n", + "1 [(4, 1737), (0, 130), (8, 744), (96, 17208), (...\n", + "2 [(0, 373), (0, 373), (-1, 40610), (1, 570)]\n", + "3 [(0, 302), (0, 302), (5, 968), (8, 1425)]\n", + "4 [(0, 167), (0, 167), (2, 1442)]\n", + "Name: ada, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ada_results = df.apply(lambda x: [\n", + " check_context( x.title, \n", + " x.heading, \n", + " q[3:], # remove the number prefix\n", + " max_len=1000000, # set a large number to get the full context \n", + " search_model='ada', \n", + " max_rerank=200,\n", + " ) \n", + " for q in (x.questions).split('\\n') # split the questions\n", + " if len(q) >10 # remove the empty questions\n", + " ], axis=1)\n", + "ada_results.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "out = pd.concat([ada_results], axis=1)\n", + "out.columns = ['ada']\n", + "out.to_csv('olympics-data/search_engine_results.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def expand_lists(out):\n", + " \"\"\"\n", + " Expand a pandas series containing lists into a series, where each list element becomes a value on its own\n", + "\n", + " Input is a row per paragraph, which has multiple questions\n", + " Output is a row per question\n", + " \"\"\"\n", + " cols = [pd.DataFrame(out[name].tolist()).stack().reset_index(level=1, drop=True).rename(name) for name in out.columns] \n", + " return pd.concat(cols, axis=1)\n", + "\n", + "out_expanded = expand_lists(out)\n", + "out_expanded['rank'] = out_expanded.ada.apply(lambda x: x[0] if x != [] else -2)\n", + "out_expanded['tokens'] = out_expanded.ada.apply(lambda x: x[1] if x != [] else -2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "74.3% of relevant paragraphs are retrieved within the first 2k tokens\n" + ] + } + ], + "source": [ + "within_2k = (out_expanded.tokens < 2000).mean()\n", + "print(f\"{within_2k*100:.1f}% of relevant paragraphs are retrieved within the first 2k tokens\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The relevant context can be obtained 74% of the time on this dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.4% of relevant paragraphs are not retrieved within the first 200 results\n" + ] + } + ], + "source": [ + "outside_200 = (out_expanded['rank'] == -1).mean()\n", + "print(f\"{outside_200*100:.1f}% of relevant paragraphs are not retrieved within the first 200 results\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "7.4% of the time, this is due to the keyword search part of the search algorithm not retrieving the relevant context within the first 200 results.\n", + "18.3% of the time this is due to the semantic search not placing the relevant context within the first 2000 tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEWCAYAAAB1xKBvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAhl0lEQVR4nO3df5wddX3v8dcbAoIJkEToFpJAUCIUiyLu5UeldgENAbWJXuRCQQLFG63RB97CLWDV8NOLfYCItKKpRBIbiCmKyUWvmEZWijUIASSESBMgMYn5IWwSWH7Z4Of+Md81w2Z/fPfsnrN7Tt7Px+M8duY735n5fmZmz+fMd+bMUURgZmbWm90GuwFmZlYfnDDMzCyLE4aZmWVxwjAzsyxOGGZmlsUJw8zMsjhhDCGSlktqGex2DCZJH5K0VlK7pHfWYH2rJb23yuu4RtKzkjZWcz2d1nmOpB/Xan2l9VZ9ew5FklokrRvsdlSbE0aNdPWPJOl8Sfd3jEfE2yKitZfljJcUkoZVqamD7XrgUxExIiIeGezG9Jekg4GLgSMj4o8HYHlZ+z8i5kbExP6uz6zMCcNeZwgkokOA5TkVh0BbcxwMPBcRm3MqD0RMdbJd+mWgY9wVttlAcMIYQspnIZKOlfSQpOclbZL05VTtvvR3a+q2OUHSbpI+J2mNpM2S5kjar7Tc89K05yR9vtN6rpB0p6R/kfQ8cH5a988lbZW0QdI/StqztLyQ9ElJKyW9IOlqSW+R9B+pvfPL9TvF2GVbJb1BUjuwO/BLSU91M39Imi5pJbAyld2UurGel7RU0p+X6l+R2jMntXW5pOZulv0nkp6RdHYav1TS+jTfk5JO6Wa+/dLyf5vi+lyK873AIuCgtK9u62LeFknr0ro2At9K814m6am0z+ZLGt3D/j9f0s8k3SjpOeCKzmevko6QtEhSW4rlzFR+nKSNknYv1f2QpMdK+6u7tiDpo6Vj6++72j6lurdJ+npqxwuSfirpkNL03vZjX4/TiSnWbZK+ltb3sTStq232Fkk/SbE8K2mupJGl5a2WdLmkJyRtkfQtSXt1ivHidFxvkHRBqfz0NN8L6Zi6pKdtNWRFhF81eAGrgfd2KjsfuL+rOsDPgY+m4RHA8Wl4PBDAsNJ8fw2sAt6c6n4P+HaadiTQDpwI7EnR5fNfpfVckcanUHyA2Bt4F3A8MCytbwXwmdL6AlgA7Au8DXgVWJzWvx/wBDC1m+3QbVtLyz6sh+0YFG/Co4G9U9m5wJtSey8GNgJ7leJ7BTidIhn9H2BJ520OHAP8GvhAKj8cWAscVNrub+mmTXPS9tgn1ftP4MI0rQVY10M8LcB24EvAG9L2vwhYAoxNZd8A7uhh/5+flvHptA32pnRsAcNTLBek6e8EnqXoJgN4CnhfaXn/ClyWhntqS8ex9Z407cupHe/tJtbbgBdK9W/i9cd/b/sx+zgF9geeBz6cpl+U5v9YD9vsMOB9qW0HUCTnr3Q6Vh4HxlEcfz8Drum0H68C9qA43l4CRqXpG4A/T8OjgGMG+z2povexwW7ArvJKB1s7sLX0eonuE8Z9wJXA/p2WM56d3zAWA58sjR+e/jmGAV/o+AdP094I/I7XJ4z7emn7Z4C7SuMBvLs0vhS4tDR+Q/kfrdOyum1radm9JYyTe2nvFuAdpfj+rTTtSODlTtv8SmAd0FIqPwzYTJFM9uhhXbun7XlkqezjQGsabqH3hPE70htjKlsBnFIaP7C0P7va/+cDv+603PPZkTD+B/DvnaZ/A5iRhq8BZqXhfYAXgUMy2vIFYF5p2vDysdVFrLd1qj8CeA0Yl7kfs49T4Dzg56Vpokia5YTx616WNwV4pNOx8onS+OnAU6X9+HKn/bKZHR/0fp2Oi317WudQf7lLqramRMTIjhfwyR7qXgi8FfiVpAclfaCHugcBa0rjayj+oZvStLUdEyLiJeC5TvOvLY9Iequku1NXxfPAFyk+sZVtKg2/3MX4iAramqtzey+RtCJ1PWylOMspt7d8d9JLwF56fZ/1J4D/iNINBxGxiuIN6Apgs6R5kg7qoi37U3yi7BzTmD7E89uIeKU0fghwV+pq2Urxpv0aPW+jtT1MOwQ4rmN5aZnnAB0X4W8HPizpDRSfyB+OiDWlebtrS+dj60V2Pra6bWdEtANtaTk5+7Evx2nntgXFh4Iu25KW15T28/q0vH9h5+O+PM+ajrYnz0XE9tL4S+z4P/jvFAlmTeoaO4E65IQxREXEyog4G/gjiu6KOyUNp/h02dlvKP6xOxxMcXq8ieJUeGzHBEl7U5z2v251ncZvAX4FTIiIfYHPUnxCGwg9tTXXH9qb+rn/DjiT4vR/JLCNvrX3E8DBkm583Uoibo+IE1N7g2I/dPYsxSfuzjGt78P6O2//tcBp5Q8XEbFXRKzvom53y+i8vJ92Wt6IiPgbgIh4guLN7zTgrygSSE5bNlB0zwAg6Y3sfGx1Vq4/gqJr5zeZ+7Evx2nn417l8W6W98VUdlRa3rnsfByNKw0fTHE89yoiHoyIyRT/z98H5ufMN9Q4YQxRks6VdEBE/J6i+wrg98Bv0983l6rfAfwvSYemf8IvAt9Jn3buBD4o6c/SBcEr6P3NdB+K/t92SUcAfzNAYfXW1krsQ5FwfgsMk/QFimsrffECMAl4j6TrACQdLunk9Kn7FYqzpt93njEiXqP4579W0j7pIu7fUnw6rdTX0/IOSW05QNLkNK2r/d+bu4G3qrhAvUd6/TdJf1KqcztFP/97KK5h5LTlTuADkk5Mx9ZV9P6ecnqp/tUU15PWUtl+7Ok4/QFwlKQp6WxyOjvOqHpaXjuwTdIY4H93UWe6pLEqLvz/PfCdXpaJpD1VfC9mv4j4r9TmnY6leuCEMXRNAparuHPoJuCsiHg5dSldC/wsdRMcD8wCvk1x3eMZije4TwNExPI0PI/iU1c7Rd/qqz2s+xKKT5ovAP9Mxj9FH3Tb1grdA/yI4kLzmrS8nrpnuhQRWykueJ4m6WqKC5/XUZxBbKT4ZHh5N7N/mqLf/2ngfoo331l9bUPJTcBC4MeSXqC46HxcamdX+79HEfECMBE4i+IT8UZ2XGTvcAfwF8BPIuLZzLYsp3gjvp3i2NrCzt0+nd0OzKDoinoXxad4qGw/dnucphg+AvwDRTfZkcBD9HzcX0lx88M2ioTzvW7a/2OKff0UxfWfHB8FVqeurk9QdAnWHaULMraLSJ/qt1Kcxj8zyM2xXYiK24rXRcTnBmHdu1Eks3Mi4t4Kl7Ga4qL5vw1k2+qJzzB2AZI+KOmN6RrI9cAyijs+zBqWpFMljUzdih3XN5YMcrPqmhPGrmEyRVfEb4AJFN1bPrW0RncCRbfRs8AHKe5SfHlwm1Tf3CVlZmZZfIZhZmZZGvKBW/vvv3+MHz++4vlffPFFhg8fPnANGmSOZ+hrtJgaLR5ovJi6imfp0qXPRsQB3c3TkAlj/PjxPPTQQxXP39raSktLy8A1aJA5nqGv0WJqtHig8WLqKh5Ja7quXXCXlJmZZXHCMDOzLE4YZmaWxQnDzMyyOGGYmVkWJwwzM8vihGFmZlmcMMzMLIsThpmZZWnIb3r317L12zj/sh/0Wm/1de+vQWvMzIYGn2GYmVkWJwwzM8tStYQh6XBJj5Zez0v6jKTRkhZJWpn+jkr1JemrklZJekzSMaVlTU31V0qaWq02m5lZ96qWMCLiyYg4OiKOpvix95eAu4DLgMURMQFYnMYBTqP4NbgJwDTgFgBJoyl+NP444FhgRkeSMTOz2qlVl9QpwFMRsYbi50Jnp/LZwJQ0PBmYE4UlwEhJBwKnAosioi0itgCLgEk1areZmSW1ukvqLOCONNwUERvS8EagKQ2PAdaW5lmXyrorfx1J0yjOTGhqaqK1tbXixjbtDRcftb3Xev1ZRy21t7fXTVtzNFo80HgxNVo80HgxVRJP1ROGpD2BvwQu7zwtIkLSgPyoeETMBGYCNDc3R39+6OTmuQu4YVnvm2b1OZWvo5Z2hR9+qXeNFlOjxQONF1Ml8dSiS+o04OGI2JTGN6WuJtLfzal8PTCuNN/YVNZduZmZ1VAtEsbZ7OiOAlgIdNzpNBVYUCo/L90tdTywLXVd3QNMlDQqXeyemMrMzKyGqtolJWk48D7g46Xi64D5ki4E1gBnpvIfAqcDqyjuqLoAICLaJF0NPJjqXRURbdVst5mZ7ayqCSMiXgTe1KnsOYq7pjrXDWB6N8uZBcyqRhvNzCyPv+ltZmZZnDDMzCyLE4aZmWVxwjAzsyxOGGZmlsUJw8zMsjhhmJlZFicMMzPL4oRhZmZZnDDMzCyLE4aZmWVxwjAzsyxOGGZmlsUJw8zMsjhhmJlZFicMMzPL4oRhZmZZnDDMzCyLE4aZmWWpasKQNFLSnZJ+JWmFpBMkjZa0SNLK9HdUqitJX5W0StJjko4pLWdqqr9S0tRqttnMzLpW7TOMm4AfRcQRwDuAFcBlwOKImAAsTuMApwET0msacAuApNHADOA44FhgRkeSMTOz2qlawpC0H/Ae4FaAiPhdRGwFJgOzU7XZwJQ0PBmYE4UlwEhJBwKnAosioi0itgCLgEnVareZmXWtmmcYhwK/Bb4l6RFJ35Q0HGiKiA2pzkagKQ2PAdaW5l+XyrorNzOzGhpW5WUfA3w6Ih6QdBM7up8AiIiQFAOxMknTKLqyaGpqorW1teJlNe0NFx+1vdd6/VlHLbW3t9dNW3M0WjzQeDE1WjzQeDFVEk81E8Y6YF1EPJDG76RIGJskHRgRG1KX0+Y0fT0wrjT/2FS2HmjpVN7aeWURMROYCdDc3BwtLS2dq2S7ee4CbljW+6ZZfU7l66il1tZW+rM9hppGiwcaL6ZGiwcaL6ZK4qlal1REbATWSjo8FZ0CPAEsBDrudJoKLEjDC4Hz0t1SxwPbUtfVPcBESaPSxe6JqczMzGqommcYAJ8G5kraE3gauIAiSc2XdCGwBjgz1f0hcDqwCngp1SUi2iRdDTyY6l0VEW1VbreZmXVS1YQREY8CzV1MOqWLugFM72Y5s4BZA9o4MzPrE3/T28zMsjhhmJlZFicMMzPL4oRhZmZZnDDMzCyLE4aZmWVxwjAzsyxOGGZmlsUJw8zMsjhhmJlZFicMMzPL4oRhZmZZnDDMzCyLE4aZmWVxwjAzsyxOGGZmlsUJw8zMsjhhmJlZFicMMzPL4oRhZmZZqpowJK2WtEzSo5IeSmWjJS2StDL9HZXKJemrklZJekzSMaXlTE31V0qaWs02m5lZ12pxhnFSRBwdEc1p/DJgcURMABancYDTgAnpNQ24BYoEA8wAjgOOBWZ0JBkzM6udweiSmgzMTsOzgSml8jlRWAKMlHQgcCqwKCLaImILsAiYVOM2m5nt8hQR1Vu49AywBQjgGxExU9LWiBiZpgvYEhEjJd0NXBcR96dpi4FLgRZgr4i4JpV/Hng5Iq7vtK5pFGcmNDU1vWvevHkVt3tz2zY2vdx7vaPG7FfxOmqpvb2dESNGDHYzBkyjxQONF1OjxQONF1NX8Zx00klLS71BOxlW5TadGBHrJf0RsEjSr8oTIyIkDUjGioiZwEyA5ubmaGlpqXhZN89dwA3Let80q8+pfB211NraSn+2x1DTaPFA48XUaPFA48VUSTxV7ZKKiPXp72bgLoprEJtSVxPp7+ZUfT0wrjT72FTWXbmZmdVQ1RKGpOGS9ukYBiYCjwMLgY47naYCC9LwQuC8dLfU8cC2iNgA3ANMlDQqXeyemMrMzKyGqtkl1QTcVVymYBhwe0T8SNKDwHxJFwJrgDNT/R8CpwOrgJeACwAiok3S1cCDqd5VEdFWxXabmVkXqpYwIuJp4B1dlD8HnNJFeQDTu1nWLGDWQLfRzMzy+ZveZmaWxQnDzMyyOGGYmVkWJwwzM8vihGFmZlmcMMzMLIsThpmZZXHCMDOzLE4YZmaWxQnDzMyyOGGYmVkWJwwzM8vihGFmZlmcMMzMLIsThpmZZXHCMDOzLE4YZmaWxQnDzMyyZCUMSYtzyszMrHH1+JvekvYC3gjsL2kUoDRpX2BMldtmZmZDSG9nGB8HlgJHpL8drwXAP+asQNLukh6RdHcaP1TSA5JWSfqOpD1T+RvS+Ko0fXxpGZen8iclndrnKM3MrN96TBgRcVNEHApcEhFvjohD0+sdEZGVMICLgBWl8S8BN0bEYcAW4MJUfiGwJZXfmOoh6UjgLOBtwCTga5J2z1y3mZkNkKxrGBFxs6Q/k/RXks7rePU2n6SxwPuBb6ZxAScDd6Yqs4EpaXhyGidNPyXVnwzMi4hXI+IZYBVwbFZ0ZmY2YHq8htFB0reBtwCPAq+l4gDm9DLrV4C/A/ZJ428CtkbE9jS+jh3XQsYAawEiYrukban+GGBJaZnlecptnAZMA2hqaqK1tTUntC417Q0XH7W913r9WUcttbe3101bczRaPNB4MTVaPNB4MVUST1bCAJqBIyMichcs6QPA5ohYKqmlT62qQETMBGYCNDc3R0tL5au8ee4CbljW+6ZZfU7l66il1tZW+rM9hppGiwcaL6ZGiwcaL6ZK4slNGI8Dfwxs6MOy3w38paTTgb0o7qy6CRgpaVg6yxgLrE/11wPjgHWShgH7Ac+VyjuU5zEzsxrJ/eLe/sATku6RtLDj1dMMEXF5RIyNiPEUF61/EhHnAPcCZ6RqUynuuAJYmMZJ03+SzmgWAmelu6gOBSYAv8hst5mZDZDcM4wrBnCdlwLzJF0DPALcmspvBb4taRXQRpFkiIjlkuYDTwDbgekR8drOizUzs2rKShgR8dP+rCQiWoHWNPw0XdzlFBGvAB/pZv5rgWv70wYzM+uf3LukXqC4KwpgT2AP4MWI2LdaDTMzs6El9wyj47ZYSt+NOL5ajTIzs6Gnz0+rjcL3AT+iw8xsF5LbJfXh0uhuFN/LeKUqLTIzsyEp9y6pD5aGtwOrKbqlzMxsF5F7DeOCajfEzMyGttwfUBor6S5Jm9Pru+nBgmZmtovIvej9LYpvXB+UXv83lZmZ2S4iN2EcEBHfiojt6XUbcEAV22VmZkNMbsJ4TtK56dfzdpd0LsWDAc3MbBeRmzD+GjgT2EjxxNozgPOr1CYzMxuCcm+rvQqYGhFbACSNBq6nSCRmZrYLyD3DeHtHsgCIiDbgndVpkpmZDUW5CWM3SaM6RtIZRu7ZiZmZNYDcN/0bgJ9L+tc0/hH8uHEzs11K7je950h6CDg5FX04Ip6oXrPMzGyoye5WSgnCScLMbBfV58ebm5nZrskJw8zMsjhhmJlZlqolDEl7SfqFpF9KWi7pylR+qKQHJK2S9B1Je6byN6TxVWn6+NKyLk/lT0ryL/2ZmQ2Cap5hvAqcHBHvAI4GJkk6HvgScGNEHAZsAS5M9S8EtqTyG1M9JB0JnAW8DZgEfE3S7lVst5mZdaFqCSP99nd7Gt0jvYLi1tw7U/lsYEoanpzGSdNPkaRUPi8iXo2IZ4BVwLHVareZmXWtqt/WTmcCS4HDgH8CngK2RsT2VGUdMCYNjwHWAkTEdknbgDel8iWlxZbnKa9rGjANoKmpidbW1orb3bQ3XHzU9l7r9WcdtdTe3l43bc3RaPFA48XUaPFA48VUSTxVTRgR8RpwtKSRwF3AEVVc10xgJkBzc3O0tLRUvKyb5y7ghmW9b5rV51S+jlpqbW2lP9tjqGm0eKDxYmq0eKDxYqoknprcJRURW4F7gROAkZI63o3HAuvT8HpgHECavh/Fb278obyLeczMrEaqeZfUAenMAkl7A+8DVlAkjjNStanAgjS8MI2Tpv8kIiKVn5XuojoUmAD8olrtNjOzrlWzS+pAYHa6jrEbMD8i7pb0BDBP0jXAI8Ctqf6twLclrQLaKO6MIiKWS5pP8ViS7cD01NVlZmY1VLWEERGP0cVvZkTE03Rxl1NEvELxFNyulnUtfjqumdmg8je9zcwsixOGmZllccIwM7MsThhmZpbFCcPMzLI4YZiZWRYnDDMzy+KEYWZmWZwwzMwsixOGmZllccIwM7MsThhmZpbFCcPMzLI4YZiZWRYnDDMzy+KEYWZmWZwwzMwsixOGmZllccIwM7MsVUsYksZJulfSE5KWS7oolY+WtEjSyvR3VCqXpK9KWiXpMUnHlJY1NdVfKWlqtdpsZmbdq+YZxnbg4og4EjgemC7pSOAyYHFETAAWp3GA04AJ6TUNuAWKBAPMAI4DjgVmdCQZMzOrnaoljIjYEBEPp+EXgBXAGGAyMDtVmw1MScOTgTlRWAKMlHQgcCqwKCLaImILsAiYVK12m5lZ1xQR1V+JNB64D/hT4NcRMTKVC9gSESMl3Q1cFxH3p2mLgUuBFmCviLgmlX8eeDkiru+0jmkUZyY0NTW9a968eRW3d3PbNja93Hu9o8bsV/E6aqm9vZ0RI0YMdjMGTKPFA40XU6PFA40XU1fxnHTSSUsjorm7eYZVu1GSRgDfBT4TEc8XOaIQESFpQDJWRMwEZgI0NzdHS0tLxcu6ee4CbljW+6ZZfU7l66il1tZW+rM9hppGiwcaL6ZGiwcaL6ZK4qnqXVKS9qBIFnMj4nupeFPqaiL93ZzK1wPjSrOPTWXdlZuZWQ1V8y4pAbcCKyLiy6VJC4GOO52mAgtK5eelu6WOB7ZFxAbgHmCipFHpYvfEVGZmZjVUzS6pdwMfBZZJejSVfRa4Dpgv6UJgDXBmmvZD4HRgFfAScAFARLRJuhp4MNW7KiLaqthuMzPrQtUSRrp4rW4mn9JF/QCmd7OsWcCsgWudmZn1lb/pbWZmWZwwzMwsixOGmZllccIwM7MsThhmZpbFCcPMzLI4YZiZWRYnDDMzy+KEYWZmWZwwzMwsixOGmZllqfrvYTSy8Zf9ILvu6uveX8WWmJlVn88wzMwsixOGmZllccIwM7MsThhmZpbFCcPMzLI4YZiZWRYnDDMzy+KEYWZmWaqWMCTNkrRZ0uOlstGSFklamf6OSuWS9FVJqyQ9JumY0jxTU/2VkqZWq71mZtazap5h3AZM6lR2GbA4IiYAi9M4wGnAhPSaBtwCRYIBZgDHAccCMzqSjJmZ1VbVEkZE3Ae0dSqeDMxOw7OBKaXyOVFYAoyUdCBwKrAoItoiYguwiJ2TkJmZ1UCtnyXVFBEb0vBGoCkNjwHWluqtS2Xdle9E0jSKsxOamppobW2tvJF7w8VHba94/q70pz391d7ePqjrH2iNFg80XkyNFg80XkyVxDNoDx+MiJAUA7i8mcBMgObm5mhpaal4WTfPXcANywZ206w+p2VAl9cXra2t9Gd7DDWNFg80XkyNFg80XkyVxFPru6Q2pa4m0t/NqXw9MK5Ub2wq667czMxqrNYJYyHQcafTVGBBqfy8dLfU8cC21HV1DzBR0qh0sXtiKjMzsxqrWpeUpDuAFmB/Seso7na6Dpgv6UJgDXBmqv5D4HRgFfAScAFARLRJuhp4MNW7KiI6X0g3M7MaqFrCiIizu5l0Shd1A5jezXJmAbMGsGlmZlYBf9PbzMyy+CdaayT351z9U65mNlT5DMPMzLI4YZiZWRYnDDMzy+KEYWZmWZwwzMwsixOGmZll8W21Q0zu7bfgW3DNrLZ8hmFmZlmcMMzMLIsThpmZZfE1jDqWe73jtknDq9wSM9sV+AzDzMyy+AxjF7Bs/TbO98MPzayffIZhZmZZfIZhr+PHsJtZd5wwrCL+gqHZrscJw6quL8klh+/6MhscThhWd6pxEd9nTGa9q5uEIWkScBOwO/DNiLhukJtkdWCgz26qtcyLj9qelQT7kqx8PapxDJUPNHWRMCTtDvwT8D5gHfCgpIUR8cTgtsysthotAVZLNc4sBzOpDxV1kTCAY4FVEfE0gKR5wGTACcPMdjKYb8T1lgT6QhEx2G3olaQzgEkR8bE0/lHguIj4VKnONGBaGj0ceLIfq9wfeLYf8w81jmfoa7SYGi0eaLyYuornkIg4oLsZ6uUMo1cRMROYORDLkvRQRDQPxLKGAscz9DVaTI0WDzReTJXEUy/f9F4PjCuNj01lZmZWI/WSMB4EJkg6VNKewFnAwkFuk5nZLqUuuqQiYrukTwH3UNxWOysilldxlQPStTWEOJ6hr9FiarR4oPFi6nM8dXHR28zMBl+9dEmZmdkgc8IwM7MsThglkiZJelLSKkmXDXZ7BoKk1ZKWSXpU0kOD3Z6+kjRL0mZJj5fKRktaJGll+jtqMNvYV93EdIWk9Wk/PSrp9MFsY19IGifpXklPSFou6aJUXpf7qYd46nkf7SXpF5J+mWK6MpUfKumB9J73nXRTUffL8TWMQnr8yH9SevwIcHa9P35E0mqgOSLq8gtHkt4DtANzIuJPU9k/AG0RcV1K7KMi4tLBbGdfdBPTFUB7RFw/mG2rhKQDgQMj4mFJ+wBLgSnA+dThfuohnjOp330kYHhEtEvaA7gfuAj4W+B7ETFP0teBX0bELd0tx2cYO/zh8SMR8Tug4/EjNogi4j6grVPxZGB2Gp5N8c9cN7qJqW5FxIaIeDgNvwCsAMZQp/uph3jqVhTa0+ge6RXAycCdqbzXfeSEscMYYG1pfB11fpAkAfxY0tL0+JRG0BQRG9LwRqBpMBszgD4l6bHUZVUX3TedSRoPvBN4gAbYT53igTreR5J2l/QosBlYBDwFbI2I7alKr+95ThiN78SIOAY4DZieukMaRhR9qo3Qr3oL8BbgaGADcMOgtqYCkkYA3wU+ExHPl6fV437qIp663kcR8VpEHE3xpIxjgSP6ugwnjB0a8vEjEbE+/d0M3EVxoNS7TamfuaO/efMgt6ffImJT+of+PfDP1Nl+Sv3i3wXmRsT3UnHd7qeu4qn3fdQhIrYC9wInACMldXyBu9f3PCeMHRru8SOShqeLdkgaDkwEHu95rrqwEJiahqcCCwaxLQOi4401+RB1tJ/SBdVbgRUR8eXSpLrcT93FU+f76ABJI9Pw3hQ396ygSBxnpGq97iPfJVWSbpP7CjseP3Lt4LaofyS9meKsAorHwNxebzFJugNooXgU8yZgBvB9YD5wMLAGODMi6uYicjcxtVB0dQSwGvh4qf9/SJN0IvDvwDLg96n4sxT9/nW3n3qI52zqdx+9neKi9u4UJwrzI+Kq9B4xDxgNPAKcGxGvdrscJwwzM8vhLikzM8vihGFmZlmcMMzMLIsThpmZZXHCMDOzLE4YZkNAehLqJYPdDrOeOGGYDTAV/L9lDccHtdkAkDQ+/ZbKHIpvAN8q6aHybw+keqslXSnpYRW/U7LT83wk/U9J/y99I9dsyBjWexUzyzQBmBoRSySNjoi29DsriyW9PSIeS/WejYhjJH0SuAT4WMcCJH2K4rENU3r6xq3ZYPAZhtnAWRMRS9LwmZIepnjcwtuAI0v1Oh7OtxQYXyo/j+Kpwmc4WdhQ5IRhNnBehOJnLynOHE6JiLcDPwD2KtXrSAav8fqz/GUUCWRs1VtqVgEnDLOBty9F8tgmqYnirCHHI8DHgYWSDqpW48wq5YRhNsAi4pcUb/6/Am4HftaHee+nODv5gaT9q9NCs8r4abVmZpbFZxhmZpbFCcPMzLI4YZiZWRYnDDMzy+KEYWZmWZwwzMwsixOGmZll+f/cJDsrqw+q3wAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# plot a histogram, and add axis descriptions and title\n", + "out_expanded[(out_expanded['rank'] >=0)&(out_expanded['rank'] <30)]['rank'].hist(bins=29)\n", + "plt.xlabel('rank')\n", + "plt.ylabel('count')\n", + "plt.title('Histogram of ranks of retrieved paragraphs')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAfEklEQVR4nO3de7wdZX3v8c+XhIsmQEITIyZIEKKVS0XYAiptE7Eh4CXUAxqKkgCVWtEDPXokFC1R4Qj1gigqxoZyUwJFKSnqwQgEDiqgkVtCxGwgGEJIDrkAAaQN/vrHPFsmm7XWs/bae9baYX/fr9d67Zlnnpn5rWdmzW/PM7NmKSIwMzNrZJtOB2BmZoOfk4WZmWU5WZiZWZaThZmZZTlZmJlZlpOFmZllOVnUIWmppMmdjqOTJP21pJWSNkl6cxP1J0t6tB2xDSRJsyTd1sH1/72kNamd/6SfyzpO0k8Guu5gJykk7dXpOKomaY6kK9o9LwzRZCFphaR39irb4oAREftExKLMciamnXR4RaF22peAj0XEyIi4q/fEofIBrZKkbYGvAFNTO6/rz/Ii4rsRMXWg67ab963BZ0gmi63FIEhCuwNLOxzDVqWFbTYO2AG3sw1yThZ1lM8+JB0k6VeSnkrdBV9J1W5NfzemLoS3StpG0qclPSJpraTLJO1cWu7xado6SZ/ptZ45kq6RdIWkp4BZad2/kLRR0mpJF0rarrS8kPRRScslPS3p85L2lPTzFO/V5fq93mPNWCVtL2kTMAy4R9KDNebtee/3pPf+gdK0T6TlrZZ0Qql8e0lfkvS71I4XSXpFndhmSbot1d8g6WFJR9TaPqW2uyIN95zxnZC60TZI+oikt0i6N7XlhS9dpS6U9KSk30g6rDRhZ0nz0vtZJelsScNKcf5M0vmS1gFzaryX7SV9VdJj6fXVVPZ64IFUbaOkm2rM26f3ol5nyGnej6T9Y6Okb0hSg7pN7Uu95y3Nv1cavkTSNyX9OO0fP5P06vTeN6Q2rtm1WW/fkvRhSd2S1ktaIOk1deY/NLXV5DR+oqRlab03SNq9yfbZS9ItaZ94QtJVddbXs41mqti3n5B0Zmn6NpJmS3pQxef+akm7lKYfktp4o6R7VOr+lrRHiuFpSQuBMb3W3fK8fRYRQ+4FrADe2atsFnBbrTrAL4APpeGRwCFpeCIQwPDSfCcC3cDrUt0fAJenaXsDm4BDge0ounn+q7SeOWn8KIpE/grgQOAQYHha3zLgtNL6ArgO2AnYB3geuDGtf2fgfmBmnXaoG2tp2Xs1aMctpgOTgc3A54BtgSOBZ4HRafr5wAJgF2BH4D+AL9RZ9qzUFh+mSFp/DzwGqNY2TG13Ra/tchHFf+1Tgd8D/w68ChgPrAX+srSuzcA/pLg/ADwJ7JKmXwt8GxiR5r8T+Lte8348baNX1HgvnwNuT/OOBX4OfL7ePtRr3lbey229ttH1wCjgtcD/B6Y1qNvUvtR73t77A3AJ8ATF/rsDcBPwMHB82p5nAzf3Yd96R1reAcD2wNeBW3vXB6YBK4GDUvl0in38jWn7fBr4eZPtcyVwJsVncQfg0Mw2+g7FZ/ZNqe3emKafmrb/hBT7t4Er07TxwDqKz8o2wF+l8bGlY89X0nx/ATzNi/t5y/O2dNys6oA8mF8UB5pNwMbS61nqJ4tbgc8CY+rsJOVkcSPw0dL4GygOesOBf+rZSdK0VwL/yZbJ4tZM7KcB1/ba2d9eGl8MnF4a/zLw1TrLqhtrrQ9sjflrJYvnerXHWopkJ+AZYM/StLcCD9dZ9iygu1dbBfDq3tun1Ha9k8X40vR1wAdK498nJd20rj8molR2J/Ahim6i5yklAeBY0oEuzfu7zDZ7EDiyNH44sKLePlRnH+vLe+mdAA4tjV8NzG5Qt6l9qfe8vfcHimTxndK0jwPLSuP7ARv7sG/NA/65ND6SYl+dWKp/BvAIsG+p3o+Bk0rj21B81ndvon0uA+YCEzLbt2cbTSiV3QnMSMPLgMNK03blxWPC6ZT+QUvTbwBmUiSvzcCI0rTv8eJ+3vK8rbyGcjfUURExqucFfLRB3ZOA1wO/kfRLSe9uUPc1FDtsj0codopxadrKngkR8SzFB79sZXlE0uslXS/pcRVdU/+Hl55OrikNP1djfGQLsbZqXURsLo0/m9Y/luKAvzidMm8E/m8qr+fxnoHUVlD/vdTSl3ZZFekTlTxC0T67U5xtrC7F/W2K/+p7bLHNaqjVzjW7UBpodRtDqR15cXtUsZ4ql7VFG0bEJorPzvhSndOAqyNiSalsd+CC0rZbT/GPS3m+eu3zqVT3ThV3R56YibHecnYHri3FsAx4geJztjtwTM+0NP1QioTyGmBDRDxTWm55P+rPvH3W6QuoW4WIWA4cK2kb4H3ANSpucYwa1R+j2Ig9ejL8GmA1xX/vAKjor+99q2TvZX4LuAs4NiKelnQacHTr76bpWAfaExQHiH0iYtUALO8ZiuTT49X9XN54SSoljNdSdJmtpDizGNMrCZbV2g/Ketq55yL2a1PZ1myL9pfU3/bP2WJflTSC4rNT3peOAeZJejQiLkhlK4FzIuK7fV1hRDxO0Q2KpEOBn0q6NSK6+7iolcCJEfGz3hMkraQ4O/hwjWm7A6MljSgd9F/Li/tbf+bts6F8ZtE0SR+UNDYi/kDRZQXwB4r+zT9Q9On2uBL4h3RxaSTFmcBV6UBzDfAeSW9TcaFwDsV/Lo3sCDwFbJL0pxR99wOlUazNWMOW772u1HbfAc6X9CoASeMlHd5C3AB3AzMkbSupi/4n0FcB/zMt7xiKPu4fRcRq4CfAlyXtlC5W7inpL/uw7CuBT0saK2kMRXdky/e7DxL3APtI2l/SDtS4sN9PvfetK4ET0vq2p9hX74iIFaU6jwGHAadK6vmcXAScIWkf+OPNCsc0E4CkYyRNSKMbKA60f2jhvVwEnJMO4KT9YHqadgXFMeFwScMk7aDi+0oTIuIR4FfAZyVtlxLWe0rL7c+8feZk0ZxpwFIVdwhdQNEX+VzqGjkH+Fk6DTwEuBi4nOI6x8MUFyM/DhARS9PwfIqzjE0UffrPN1j3J4G/obg49R2g5h0ZLaoba5PmAJem9/7+JuqfTnGx8fbUpfZTSmdaffQZYE+KD/FnKfpj++MOYBLFGdA5wNHx4ncejqe4IeH+tL5rKE71m3U2xQf3XuA+4NepbKsVEb+luHD/U2A5MNBfapxDad+KiJ9SbPPvU3x29gRm1IjrdxQJY7akv42Ia4HzgPlpn1sCHNF7vjreAtyRPvcLgFMj4qEW3ssFaf6fSHqa4mL3wSnelRQX4f+R4p/PlcD/5sVj89+kuuuBsyiuo9DfeVuhiJbPSqyf0n/zG4FJEfFwh8MxM6vLZxZtJuk9kl6Z+ly/RPGf5orORmVm1piTRftNp+hbfYyi22NG+PTOzAY5d0OZmVmWzyzMzCzrZfk9izFjxsTEiRMb1nnmmWcYMWJEewLqg8EaFzi2Vjm21ji21vQntsWLFz8REbW/KNvqV78H8+vAAw+MnJtvvjlbpxMGa1wRjq1Vjq01jq01/YkN+FX4cR9mZtYqJwszM8tysjAzsywnCzMzy3KyMDOzLCcLMzPLcrIwM7MsJwszM8tysjAzs6yX5eM++mvi7B82VW/Fue+qOBIzs8HBZxZmZpblZGFmZllOFmZmluVkYWZmWU4WZmaW5WRhZmZZThZmZpblZGFmZllOFmZmluVkYWZmWU4WZmaWVXmykDRM0l2Srk/je0i6Q1K3pKskbZfKt0/j3Wn6xNIyzkjlD0g6vOqYzcxsS+04szgVWFYaPw84PyL2AjYAJ6Xyk4ANqfz8VA9JewMzgH2AacA3JQ1rQ9xmZpZUmiwkTQDeBfxLGhfwDuCaVOVS4Kg0PD2Nk6YflupPB+ZHxPMR8TDQDRxUZdxmZrYlRUR1C5euAb4A7Ah8EpgF3J7OHpC0G/DjiNhX0hJgWkQ8mqY9CBwMzEnzXJHK56V5rum1rpOBkwHGjRt34Pz58xvGtmnTJkaOHFlz2n2rnmzq/e03fuem6vVFo7g6zbG1xrG1xrG1pj+xTZkyZXFEdNWaVtnvWUh6N7A2IhZLmlzVenpExFxgLkBXV1dMntx4lYsWLaJenVnN/p7FcY3X0YpGcXWaY2uNY2uNY2tNVbFV+eNHbwfeK+lIYAdgJ+ACYJSk4RGxGZgArEr1VwG7AY9KGg7sDKwrlfcoz2NmZm1Q2TWLiDgjIiZExESKC9Q3RcRxwM3A0anaTOC6NLwgjZOm3xRFH9kCYEa6W2oPYBJwZ1Vxm5nZS3XiZ1VPB+ZLOhu4C5iXyucBl0vqBtZTJBgiYqmkq4H7gc3AKRHxQvvDNjMbutqSLCJiEbAoDT9EjbuZIuL3wDF15j8HOKe6CM3MrBF/g9vMzLKcLMzMLMvJwszMspwszMwsy8nCzMyynCzMzCzLycLMzLKcLMzMLMvJwszMspwszMwsy8nCzMyynCzMzCzLycLMzLKcLMzMLMvJwszMspwszMwsy8nCzMyynCzMzCzLycLMzLKcLMzMLMvJwszMspwszMwsy8nCzMyynCzMzCzLycLMzLKcLMzMLMvJwszMspwszMwsy8nCzMyynCzMzCzLycLMzLKcLMzMLMvJwszMspwszMwsy8nCzMyynCzMzCzLycLMzLKcLMzMLMvJwszMsipLFpJ2kHSnpHskLZX02VS+h6Q7JHVLukrSdql8+zTenaZPLC3rjFT+gKTDq4rZzMxqq/LM4nngHRHxJmB/YJqkQ4DzgPMjYi9gA3BSqn8SsCGVn5/qIWlvYAawDzAN+KakYRXGbWZmvVSWLKKwKY1um14BvAO4JpVfChyVhqencdL0wyQplc+PiOcj4mGgGzioqrjNzOylFBHVLbw4A1gM7AV8A/gicHs6e0DSbsCPI2JfSUuAaRHxaJr2IHAwMCfNc0Uqn5fmuabXuk4GTgYYN27cgfPnz28Y26ZNmxg5cmTNafeterKp97ff+J2bqtcXjeLqNMfWGsfWGsfWmv7ENmXKlMUR0VVr2vB+RZURES8A+0saBVwL/GmF65oLzAXo6uqKyZMnN6y/aNEi6tWZNfuHTa1zxXGN19GKRnF1mmNrjWNrjWNrTVWxteVuqIjYCNwMvBUYJaknSU0AVqXhVcBuAGn6zsC6cnmNeczMrA2qvBtqbDqjQNIrgL8CllEkjaNTtZnAdWl4QRonTb8pij6yBcCMdLfUHsAk4M6q4jYzs5eqshtqV+DSdN1iG+DqiLhe0v3AfElnA3cB81L9ecDlkrqB9RR3QBERSyVdDdwPbAZOSd1bZmbWJpUli4i4F3hzjfKHqHE3U0T8HjimzrLOAc4Z6BjNzKw5/ga3mZllOVmYmVmWk4WZmWU5WZiZWZaThZmZZTlZmJlZlpOFmZllOVmYmVmWk4WZmWU5WZiZWZaThZmZZTlZmJlZlpOFmZllOVmYmVmWk4WZmWU5WZiZWZaThZmZZTlZmJlZVlPJQtKNzZSZmdnLU8Pf4Ja0A/BKYIyk0YDSpJ2A8RXHZmZmg0TDZAH8HXAa8BpgMS8mi6eAC6sLy8zMBpOGySIiLgAukPTxiPh6m2IyM7NBJndmAUBEfF3S24CJ5Xki4rKK4jIzs0GkqWQh6XJgT+Bu4IVUHICThZnZENBUsgC6gL0jIqoMxszMBqdmv2exBHh1lYGYmdng1eyZxRjgfkl3As/3FEbEeyuJyszMBpVmk8WcKoMwM7PBrdm7oW6pOhAzMxu8mr0b6mmKu58AtgO2BZ6JiJ2qCszMzAaPZs8sduwZliRgOnBIVUGZmdng0uenzkbh34HDBz4cMzMbjJrthnpfaXQbiu9d/L6SiMzMbNBp9m6o95SGNwMrKLqizMxsCGj2msUJVQdiZmaDV7M/fjRB0rWS1qbX9yVNqDo4MzMbHJq9wP2vwAKK37V4DfAfqczMzIaAZpPF2Ij414jYnF6XAGMrjMvMzAaRZpPFOkkflDQsvT4IrKsyMDMzGzyaTRYnAu8HHgdWA0cDsxrNIGk3STdLul/SUkmnpvJdJC2UtDz9HZ3KJelrkrol3SvpgNKyZqb6yyXNbOF9mplZPzSbLD4HzIyIsRHxKork8dnMPJuBT0TE3hTf9j5F0t7AbODGiJgE3JjGAY4AJqXXycC3oEguwFnAwcBBwFk9CcbMzNqj2WTxZxGxoWckItYDb240Q0Ssjohfp+GngWXAeIrvZ1yaql0KHJWGpwOXpW+I3w6MkrQrxTfFF0bE+hTDQmBak3GbmdkAUDM/fifpHmByT8JI/+3fEhH7NbUSaSJwK7Av8LuIGJXKBWyIiFGSrgfOjYjb0rQbgdOBycAOEXF2Kv8M8FxEfKnXOk6mOCNh3LhxB86fP79hTJs2bWLkyJE1p9236slm3hb7jd+5qXp90SiuTnNsrXFsrXFsrelPbFOmTFkcEV21pjX7De4vA7+Q9G9p/BjgnGZmlDQS+D5wWkQ8VeSHQkSEpAH5qdaImAvMBejq6orJkyc3rL9o0SLq1Zk1+4dNrXPFcY3X0YpGcXWaY2uNY2uNY2tNVbE11Q0VEZcB7wPWpNf7IuLy3HyStqVIFN+NiB+k4jWpe4n0d20qXwXsVpp9QiqrV25mZm3S9FNnI+L+iLgwve7P1U9dTPOAZRHxldKkBUDPHU0zgetK5cenu6IOAZ6MiNXADcBUSaPThe2pqczMzNqk2W6oVrwd+BBwn6S7U9k/AucCV0s6CXiE4pZcgB8BRwLdwLPACVBcTJf0eeCXqd7n0gV2MzNrk8qSRbpQrTqTD6tRP4BT6izrYuDigYvOzMz6os8/fmRmZkOPk4WZmWU5WZiZWZaThZmZZTlZmJlZVpW3zr7sTWzym94AK859V4WRmJlVy2cWZmaW5WRhZmZZThZmZpblZGFmZllOFmZmluVkYWZmWU4WZmaW5WRhZmZZThZmZpblZGFmZllOFmZmluVkYWZmWU4WZmaW5WRhZmZZThZmZpblZGFmZllOFmZmluVkYWZmWU4WZmaW5WRhZmZZThZmZpblZGFmZllOFmZmluVkYWZmWU4WZmaW5WRhZmZZThZmZpblZGFmZllOFmZmluVkYWZmWU4WZmaW5WRhZmZZwzsdwFAxcfYPm6p3ybQRFUdiZtZ3lZ1ZSLpY0lpJS0plu0haKGl5+js6lUvS1yR1S7pX0gGleWam+sslzawqXjMzq6/KbqhLgGm9ymYDN0bEJODGNA5wBDApvU4GvgVFcgHOAg4GDgLO6kkwZmbWPpUli4i4FVjfq3g6cGkavhQ4qlR+WRRuB0ZJ2hU4HFgYEesjYgOwkJcmIDMzq5giorqFSxOB6yNi3zS+MSJGpWEBGyJilKTrgXMj4rY07UbgdGAysENEnJ3KPwM8FxFfqrGukynOShg3btyB8+fPbxjbpk2bGDlyZM1p9616ss/vdaDssfOwunF1WqM26zTH1hrH1pqXa2xTpkxZHBFdtaZ17AJ3RISkActUETEXmAvQ1dUVkydPblh/0aJF1Kszq8mL0VW4ZNqIunF1WqM26zTH1hrH1pqhGFu7b51dk7qXSH/XpvJVwG6lehNSWb1yMzNro3YniwVAzx1NM4HrSuXHp7uiDgGejIjVwA3AVEmj04XtqanMzMzaqLJuKElXUlxzGCPpUYq7ms4FrpZ0EvAI8P5U/UfAkUA38CxwAkBErJf0eeCXqd7nIqL3RXMzM6tYZckiIo6tM+mwGnUDOKXOci4GLh7A0MzMrI/8uA8zM8tysjAzsywnCzMzy3KyMDOzLCcLMzPLcrIwM7MsJwszM8vyjx8NMveterLpZ1OtOPddFUdjZlbwmYWZmWU5WZiZWZaThZmZZTlZmJlZlpOFmZllOVmYmVmWb53dik30LbZm1iY+szAzsywnCzMzy3KyMDOzLCcLMzPLcrIwM7MsJwszM8tysjAzsywnCzMzy/KX8oaAZr+8B/4Cn5nV5jMLMzPLcrIwM7MsJwszM8vyNQvbQqPrG5/Yb3PTvw9e5usgZls/n1mYmVmWk4WZmWW5G8oq15dbd5vhbi2z9vOZhZmZZfnMwrY6tc5U6l1891mI2cBwsrCXNf/0rNnAcDeUmZll+czCrI98tmJDkZOFGQN/x1Z5ma1+mbEWJyDrFCcLs63IQCe1gUxkZU5qLz9OFmY24AYiqfUnkTlZDbytJllImgZcAAwD/iUizu1wSGY2SFXRrVhW1RnZQLhk2ohKlrtV3A0laRjwDeAIYG/gWEl7dzYqM7OhY6tIFsBBQHdEPBQR/wnMB6Z3OCYzsyFDEdHpGLIkHQ1Mi4i/TeMfAg6OiI+V6pwMnJxG3wA8kFnsGOCJCsLtr8EaFzi2Vjm21ji21vQntt0jYmytCVvNNYuciJgLzG22vqRfRURXhSG1ZLDGBY6tVY6tNY6tNVXFtrV0Q60CdiuNT0hlZmbWBltLsvglMEnSHpK2A2YACzock5nZkLFVdENFxGZJHwNuoLh19uKIWNrPxTbdZdVmgzUucGytcmytcWytqSS2reICt5mZddbW0g1lZmYd5GRhZmZZQy5ZSJom6QFJ3ZJmd2D9u0m6WdL9kpZKOjWVz5G0StLd6XVkaZ4zUrwPSDq84vhWSLovxfCrVLaLpIWSlqe/o1O5JH0txXavpAMqjOsNpba5W9JTkk7rVLtJuljSWklLSmV9bidJM1P95ZJmVhjbFyX9Jq3/WkmjUvlESc+V2u+i0jwHpn2hO8WvimLr8zas4nNcJ7arSnGtkHR3Km9buzU4ZrR3f4uIIfOiuDj+IPA6YDvgHmDvNsewK3BAGt4R+C3FI0zmAJ+sUX/vFOf2wB4p/mEVxrcCGNOr7J+B2Wl4NnBeGj4S+DEg4BDgjjZux8eB3TvVbsBfAAcAS1ptJ2AX4KH0d3QaHl1RbFOB4Wn4vFJsE8v1ei3nzhSvUvxHVBRbn7ZhVZ/jWrH1mv5l4J/a3W4Njhlt3d+G2plFxx8bEhGrI+LXafhpYBkwvsEs04H5EfF8RDwMdFO8j3aaDlyahi8FjiqVXxaF24FRknZtQzyHAQ9GxCMN6lTabhFxK7C+xjr70k6HAwsjYn1EbAAWAtOqiC0ifhIRm9Po7RTfVaorxbdTRNwexZHmstL7GdDYGqi3DSv5HDeKLZ0dvB+4stEyqmi3BseMtu5vQy1ZjAdWlsYfpfGBulKSJgJvBu5IRR9Lp40X95xS0v6YA/iJpMUqHqECMC4iVqfhx4FxHYqtxwy2/NAOhnaDvrdTp9rvRIr/PHvsIekuSbdI+vNUNj7F067Y+rINO9Fufw6siYjlpbK2t1uvY0Zb97ehliwGDUkjge8Dp0XEU8C3gD2B/YHVFKe8nXBoRBxA8YTfUyT9RXli+m+pY/dbq/hS5nuBf0tFg6XdttDpdqpH0pnAZuC7qWg18NqIeDPwv4DvSdqpzWENym3Yy7Fs+Q9K29utxjHjj9qxvw21ZDEoHhsiaVuKjf7diPgBQESsiYgXIuIPwHd4scukrTFHxKr0dy1wbYpjTU/3Uvq7thOxJUcAv46INSnOQdFuSV/bqa0xSpoFvBs4Lh1cSF0869LwYoprAa9PcZS7qiqLrYVt2O52Gw68D7iqFHNb263WMYM2729DLVl0/LEhqe9zHrAsIr5SKi/39f810HNHxgJghqTtJe0BTKK4gFZFbCMk7dgzTHFRdEmKoefOiZnAdaXYjk93XxwCPFk6La7KFv/hDYZ2K+lrO90ATJU0OnW9TE1lA07Fj4d9CnhvRDxbKh+r4vdikPQ6inZ6KMX3lKRD0j57fOn9DHRsfd2G7f4cvxP4TUT8sXupne1W75hBu/e3/lyl3xpfFHcK/JbiP4EzO7D+QylOF+8F7k6vI4HLgftS+QJg19I8Z6Z4H2AA7khpENvrKO4suQdY2tM+wJ8ANwLLgZ8Cu6RyUfwo1YMp9q6K224EsA7YuVTWkXajSFirgf+i6Ps9qZV2orh+0J1eJ1QYWzdFf3XPPndRqvs/0ra+G/g18J7ScrooDtwPAheSnvhQQWx93oZVfI5rxZbKLwE+0qtu29qN+seMtu5vftyHmZllDbVuKDMza4GThZmZZTlZmJlZlpOFmZllOVmYmVmWk4VZH0gaJemjmTqTJV3frpjM2sHJwqxvRgENk4XZy5GThVnfnAvsqeI3DL6YXktU/H7BB3pXlvSW9LC5PVX8zsEt6SGNN5Qe1bBI0nmS7pT0256H0knaJ5XdnR6yN6nN79Xsj5wszPpmNsXj0feneNT3/sCbKB4J8cXyoyskvQ24iOKR0b8Dvg4cHREHAhcD55SWOzwiDgJOA85KZR8BLkjr6mLLp5matdXwTgdgthU7FLgyIl6geKjbLcBbgKeANwJzgakR8ZikfYF9gYXFo34YRvFoiR49D4dbTPHDOgC/AM6UNAH4QWz5eGyztvKZhVk1VgO/p/jtASie17M0IvZPr/0iYmqp/vPp7wukf+Ii4nsUj2N/DviRpHe0J3Szl3KyMOubpyl+2hLg/wEfkDRM0liKn+XsebLtRuBdwBckTaZ4EN5YSW+F4pHTkvZptKL0NNOHIuJrFE8U/bOBfStmzXOyMOuDKH7D4GeSlgBvpXgS6D3ATcCnIuLxUt01FL8f8Q2KM4yjgfMk3UPx5NC3ZVb3fmCJpLspurAuG9A3Y9YHfuqsmZll+czCzMyynCzMzCzLycLMzLKcLMzMLMvJwszMspwszMwsy8nCzMyy/ht5y7j85OwQPgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "out_expanded[(out_expanded.tokens>=0)&(out_expanded.tokens < 2000)]['tokens'].hist(bins=29)\n", + "plt.xlabel('tokens')\n", + "plt.ylabel('count')\n", + "plt.title('Histogram of the number of minimum tokens needed')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can observe that the context is most likely to be returned as one of the first results, and most likely to be returned within the first 200-500 tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-2 0.000063\n", + "-1 0.074428\n", + " 0 0.453420\n", + " 1 0.089515\n", + " 2 0.047146\n", + " 3 0.032437\n", + " 4 0.024139\n", + " 5 0.019676\n", + " 6 0.015967\n", + " 7 0.013452\n", + " 8 0.011189\n", + " 9 0.009869\n", + " 10 0.009178\n", + "Name: rank, dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# normalized value_counts\n", + "out_expanded['rank'].value_counts(normalize=True).sort_index()[:13]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "probabilities of the relevant context being returned at each rank. (-2 means a processing error, -1 means the rank is >200)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.9 64-bit ('3.9.9')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "cb9817b186a29e4e9713184d901f26c1ee05ad25243d878baff7f31bb1fef480" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/fine-tuned_qa/olympics-3-train-qa.ipynb b/examples/fine-tuned_qa/olympics-3-train-qa.ipynb new file mode 100644 index 00000000..76fb3f3c --- /dev/null +++ b/examples/fine-tuned_qa/olympics-3-train-qa.ipynb @@ -0,0 +1,637 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Train a fine-tuning model specialized for Q&A\n", + "This notebook will utilize the dataset of context, question and answer pairs to additionally create adversarial questions and context pairs, where the question was not generated on that context. In those cases the model will be prompted to answer \"No sufficient context for answering the question\". We will also train a discriminator model, which predicts whether the question can be answered based on the context or not.\n", + "\n", + "We will add hard adversarial examples as well, which will be based either on semantically similar sections, or neighbouring sections, originating from the same article." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleheadingcontenttokenscontextquestionsanswers
02020 Summer OlympicsSummaryThe 2020 Summer Olympics (Japanese: 2020年夏季オリン...7132020 Summer Olympics\\nSummary\\n\\nThe 2020 Summ...1. What is the 2020 Summer Olympics?\\n2. When ...1. The 2020 Summer Olympics is an internationa...
12020 Summer OlympicsHost city selectionThe International Olympic Committee (IOC) vote...1262020 Summer Olympics\\nHost city selection\\n\\nT...1. \\n2. \\n3. \\n4.1. What is the International Olympic Committee...
22020 Summer OlympicsImpact of the COVID-19 pandemicIn January 2020, concerns were raised about th...3692020 Summer Olympics\\nImpact of the COVID-19 p...1. What was the COVID-19 pandemic?\\n2. How did...1. The COVID-19 pandemic was a pandemic that o...
32020 Summer OlympicsQualifying event cancellation and postponementConcerns about the pandemic began to affect qu...2982020 Summer Olympics\\nQualifying event cancell...1. What was the original location of the Asia ...1. The original location of the Asia & Oceania...
42020 Summer OlympicsEffect on doping testsMandatory doping tests were being severely res...1632020 Summer Olympics\\nEffect on doping tests\\n...1. What was the COVID-19 pandemic?\\n2. What di...1. The COVID-19 pandemic was a pandemic that o...
\n", + "
" + ], + "text/plain": [ + " title heading \\\n", + "0 2020 Summer Olympics Summary \n", + "1 2020 Summer Olympics Host city selection \n", + "2 2020 Summer Olympics Impact of the COVID-19 pandemic \n", + "3 2020 Summer Olympics Qualifying event cancellation and postponement \n", + "4 2020 Summer Olympics Effect on doping tests \n", + "\n", + " content tokens \\\n", + "0 The 2020 Summer Olympics (Japanese: 2020年夏季オリン... 713 \n", + "1 The International Olympic Committee (IOC) vote... 126 \n", + "2 In January 2020, concerns were raised about th... 369 \n", + "3 Concerns about the pandemic began to affect qu... 298 \n", + "4 Mandatory doping tests were being severely res... 163 \n", + "\n", + " context \\\n", + "0 2020 Summer Olympics\\nSummary\\n\\nThe 2020 Summ... \n", + "1 2020 Summer Olympics\\nHost city selection\\n\\nT... \n", + "2 2020 Summer Olympics\\nImpact of the COVID-19 p... \n", + "3 2020 Summer Olympics\\nQualifying event cancell... \n", + "4 2020 Summer Olympics\\nEffect on doping tests\\n... \n", + "\n", + " questions \\\n", + "0 1. What is the 2020 Summer Olympics?\\n2. When ... \n", + "1 1. \\n2. \\n3. \\n4. \n", + "2 1. What was the COVID-19 pandemic?\\n2. How did... \n", + "3 1. What was the original location of the Asia ... \n", + "4 1. What was the COVID-19 pandemic?\\n2. What di... \n", + "\n", + " answers \n", + "0 1. The 2020 Summer Olympics is an internationa... \n", + "1 1. What is the International Olympic Committee... \n", + "2 1. The COVID-19 pandemic was a pandemic that o... \n", + "3 1. The original location of the Asia & Oceania... \n", + "4 1. The COVID-19 pandemic was a pandemic that o... " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import openai\n", + "import pandas as pd\n", + "df = pd.read_csv('olympics-data/olympics_qa.csv')\n", + "olympics_search_fileid = \"file-c3shd8wqF3vSCKaukW4Jr1TT\"\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the sections into a training and testing set" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3014, 754)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n", + "len(train_df), len(test_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "we check that he separator we intend to use isn't present within the contexts" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.context.str.contains('->').sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.1 Create the fine-tuning datasets for Q&A and discriminator models\n", + "The fine-tuning dataset is created in the following way. For every corresponding question, answer and context pair we create:\n", + "- Positive example: correct question, answer, context pair\n", + "- Negative examples:\n", + " - random negative example, where the random context is paired with the question \n", + " - two hard negative examples\n", + " - one originating from the same wikipedia article\n", + " - another, which is most similar to the correct context\n", + "\n", + "This process is noisy, as sometimes the question might be answerable given a different context, but on average we hope this won't affect the peformance too much.\n", + "\n", + "We apply the same process of dataset creation for both the discriminator, and the Q&A answering model. We apply the process separately for the training and testing set, to ensure that the examples from the traing set don't feature within the test set." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def get_random_similar_contexts(question, context, file_id=olympics_search_fileid, search_model='ada', max_rerank=10):\n", + " \"\"\"\n", + " Find similar contexts to the given context using the search file\n", + " \"\"\"\n", + " try:\n", + " results = openai.Engine(search_model).search(\n", + " search_model=search_model, \n", + " query=question, \n", + " max_rerank=max_rerank,\n", + " file=file_id\n", + " )\n", + " candidates = []\n", + " for result in results['data'][:3]:\n", + " if result['text'] == context:\n", + " continue\n", + " candidates.append(result['text'])\n", + " random_candidate = random.choice(candidates)\n", + " return random_candidate\n", + " except Exception as e:\n", + " print(e)\n", + " return \"\"\n", + "\n", + "def create_fine_tuning_dataset(df, discriminator=False, n_negative=1, add_related=False):\n", + " \"\"\"\n", + " Create a dataset for fine tuning the OpenAI model; either for a discriminator model, \n", + " or a model specializing in Q&A, where it says if no relevant context is found.\n", + "\n", + " Parameters\n", + " ----------\n", + " df: pd.DataFrame\n", + " The dataframe containing the question, answer and context pairs\n", + " discriminator: bool\n", + " Whether to create a dataset for the discriminator\n", + " n_negative: int\n", + " The number of random negative samples to add (using a random context)\n", + " add_related: bool\n", + " Whether to add the related contexts to the correct context. These are hard negative examples\n", + "\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " The dataframe containing the prompts and completions, ready for fine-tuning\n", + " \"\"\"\n", + " rows = []\n", + " for i, row in df.iterrows():\n", + " for q, a in zip((\"1.\" + row.questions).split('\\n'), (\"1.\" + row.answers).split('\\n')):\n", + " if len(q) >10 and len(a) >10:\n", + " if discriminator:\n", + " rows.append({\"prompt\":f\"{row.context}\\nQuestion: {q[2:].strip()}\\n Related:\", \"completion\":f\" yes\"})\n", + " else:\n", + " rows.append({\"prompt\":f\"{row.context}\\nQuestion: {q[2:].strip()}\\nAnswer:\", \"completion\":f\" {a[2:].strip()}\"})\n", + "\n", + " for i, row in df.iterrows():\n", + " for q in (\"1.\" + row.questions).split('\\n'):\n", + " if len(q) >10:\n", + " for j in range(n_negative + (2 if add_related else 0)):\n", + " random_context = \"\"\n", + " if j == 0 and add_related:\n", + " # add the related contexts based on originating from the same wikipedia page\n", + " subset = df[(df.title == row.title) & (df.context != row.context)]\n", + " \n", + " if len(subset) < 1:\n", + " continue\n", + " random_context = subset.sample(1).iloc[0].context\n", + " if j == 1 and add_related:\n", + " # add the related contexts based on the most similar contexts according to the search\n", + " random_context = get_random_similar_contexts(q[2:].strip(), row.context, search_model='ada', max_rerank=10)\n", + " else:\n", + " while True:\n", + " # add random context, which isn't the correct context\n", + " random_context = df.sample(1).iloc[0].context\n", + " if random_context != row.context:\n", + " break\n", + " if discriminator:\n", + " rows.append({\"prompt\":f\"{random_context}\\nQuestion: {q[2:].strip()}\\n Related:\", \"completion\":f\" no\"})\n", + " else:\n", + " rows.append({\"prompt\":f\"{random_context}\\nQuestion: {q[2:].strip()}\\nAnswer:\", \"completion\":f\" No appropriate context found to answer the question.\"})\n", + "\n", + " return pd.DataFrame(rows) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We apply the same process of dataset creation for both the discriminator, and the Q&A answering model. We apply the process separately for the training and testing set, to ensure that the examples from the traing set don't feature within the test set." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for name, is_disc in [('discriminator', True), ('qa', False)]:\n", + " for train_test, dt in [('train', train_df), ('test', test_df)]:\n", + " ft = create_fine_tuning_dataset(dt, discriminator=is_disc, n_negative=1, add_related=True)\n", + " ft.to_json(f'{name}_{train_test}.jsonl', orient='records', lines=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We formatted the data according to the recommendations from the fine-tuning tool, which is available using\n", + "> openai tools fine_tunes.prepare_data -f qa_train.jsonl\n", + "\n", + "We highly recommend that you use this tool, which suggests improvements in your data formatting for fine-tuning.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.2 Submit the datasets for fine-tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!openai api fine_tunes.create -t \"olympics-data/discriminator_train.jsonl\" -v \"olympics-data/discriminator_test.jsonl\" --batch_size 16 --compute_classification_metrics --classification_positive_class \" yes\" --model ada" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "!openai api fine_tunes.create -t \"olympics-data/qa_train.jsonl\" -v \"olympics-data/qa_test.jsonl\" --batch_size 16" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.3 Using the fine-tuned models\n", + "\n", + "We will now use the fine-tuned discriminator and the fine-tuned Q&A model. By requesting logprobs, we can see how certain the discriminator is in a `yes` vs `no` answer." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[ JSON: {\n", + " \" no\": -10.819577,\n", + " \" yes\": -2.045765e-05\n", + " }]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ft_discriminator = \"curie:ft-openai-internal-2021-08-23-23-58-57\"\n", + "ft_qa = \"curie:ft-openai-internal-2021-08-23-17-54-10\"\n", + "\n", + "def apply_ft_discriminator(context, question, discriminator_model):\n", + " \"\"\"\n", + " Apply the fine tuned discriminator to a question, to assess whether it can be answered from the context.\n", + " \"\"\"\n", + " prompt = f\"{context}\\nQuestion: {question}\\n Related:\"\n", + " result = openai.Completion.create(model=discriminator_model, prompt=prompt, max_tokens=1, temperature=0, top_p=1, n=1, logprobs=2)\n", + " return result['choices'][0]['logprobs']['top_logprobs']\n", + "\n", + "apply_ft_discriminator('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.', \n", + " 'What was the first human-made object in space?', ft_discriminator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the model can generalize well to different contexts and questions. " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def apply_ft_qa_answer(context, question, answering_model):\n", + " \"\"\"\n", + " Apply the fine tuned discriminator to a question\n", + " \"\"\"\n", + " prompt = f\"{context}\\nQuestion: {question}\\nAnswer:\"\n", + " result = openai.Completion.create(model=answering_model, prompt=prompt, max_tokens=30, temperature=0, top_p=1, n=1, stop=['.','\\n'])\n", + " return result['choices'][0]['text']\n", + "\n", + "apply_ft_qa_answer('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.', \n", + " 'What was the first human-made object in space?', ft_qa)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the model can answer the question, when the context is appropriate." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' The Soviet Union was the first country to successfully launch a satellite into space'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "apply_ft_qa_answer('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',\n", + " 'What is impressive about the Soviet Union?', ft_qa)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' No appropriate context found to answer the question'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "apply_ft_qa_answer('The first human-made object in space was the Soviet Union satellite Sputnik 1 on 4 October 1957.',\n", + " 'How many cars were produced in the Soviet Union in 1970?', ft_qa)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the model knows when to answer the question, and when to say that insufficient context is present to answer the question." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also combine a discriminator and a base model, or a fine-tuned Q&A model. Discriminator can essentially serve as a decision whether the question can be answered given the context or not." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' Weather could cause a sport event to have no crowd'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def answer_question_conditionally(answering_model, discriminator_model, context, question, discriminator_logprob_yes_modifier=0):\n", + " logprobs = apply_ft_discriminator(context, question, discriminator_model)\n", + " yes_logprob = logprobs[' yes'] if ' yes' in logprobs else -100\n", + " no_logprob = logprobs[' no'] if ' no' in logprobs else -100\n", + " if yes_logprob + discriminator_logprob_yes_modifier < no_logprob:\n", + " return \" No appropriate context found to answer the question based on the discriminator.\"\n", + " return apply_ft_qa_answer(context, question, answering_model)\n", + "answer_question_conditionally(ft_qa, ft_discriminator, \n", + " \"Crowdless games are a rare although not unheard-of occurrence in sports. \\\n", + " When they do occur, it is usually the result of events beyond the control \\\n", + " of the teams or fans, such as weather-related concerns, public health concerns, \\\n", + " or wider civil disturbances unrelated to the game. For instance, \\\n", + " the COVID-19 pandemic caused many sports leagues around the world \\\n", + " to be played behind closed doors.\",\n", + " \"Could weather cause a sport event to have no crowd?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above function illustrates how to potentially combine a discriminator and a fine-tuned Q&A model. This gives a more fine-grained control over how certain we want the model to be before it answers the question.\n", + "\n", + "We'll now take a look on how answers endpoint works - combining search to retrieve the relevant context from a knowledge base, and then using the fine-tuned Q&A model to answer the question." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.4 Answering the question based on a knowledge base\n", + "Finally we can use a logic similar to the [/answers](https://beta.openai.com/docs/api-reference/answers) endpoint, where we first search for the relevant context, and then ask a Q&A model to answer the question given that context. If you'd like to see the implementation details, check out the [`answers_with_ft.py`](answers_with_ft.py) file." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\" Canada won the Women's football tournament at the 2020 Olympic games\"" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from answers_with_ft import answer_question\n", + "answer_question(olympics_search_fileid, ft_qa, \"Which country won the Women's football tournament at the 2020 Olympic games?\")" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8" + }, + "kernelspec": { + "display_name": "Python 3.7.3 64-bit ('base': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}