Organize notebooks

1 year ago · e531a5c0d6
parent 4924ce40f2
commit e531a5c0d6
235 changed files with 5200 additions and 17980 deletions
--- a/hotpotqa_runs/CotQA_simple.ipynb
+++ b/hotpotqa_runs/CotQA_simple.ipynb
--- a/hotpotqa_runs/cot_run_base.py
+++ b/hotpotqa_runs/cot_run_base.py
@ -1,84 +0,0 @@
-import joblib
-from react_cls import CoTAgent
-from mocks import DocStoreExplorerMock, LLMMock
-import numpy as np
-
-def summarize_trial(agents):
-    correct = [a for a in agents if a.is_correct()]
-    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]
-    return correct, incorrect
-
-def log_trial(agents, trial_n):
-    correct, incorrect = summarize_trial(agents)
-
-    log = f"""
-########################################
-BEGIN TRIAL {trial_n}
-Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}
-#######################################
-"""
-
-    log += '------------- BEGIN CORRECT AGENTS -------------\n\n'
-    for agent in correct:
-        log += f'Context: {agent.context} Question: {agent.question}{agent.scratchpad}\nCorrect answer: {agent.key}\n\n'
-
-    log += '------------- BEGIN INCORRECT AGENTS -----------\n\n'
-    for agent in incorrect:
-        log += f'Context: {agent.context} Question: {agent.question}{agent.scratchpad}\nCorrect answer: {agent.key}\n\n'
-    return log
-
-if __name__ == '__main__':
-    hotpot = joblib.load('data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)
-    hotpot['supporting_paragraphs'] = None
-    for ind, row in hotpot.iterrows():
-        supporting_articles = row['supporting_facts']['title']
-        articles = row['context']['title']
-        sentences = row['context']['sentences'] 
-        supporting_paragraphs = []
-        for article in supporting_articles:
-            supporting_paragraph = ''.join(sentences[np.where(articles == article)][0])
-            supporting_paragraphs.append(supporting_paragraph)
-        hotpot.at[ind, 'supporting_paragraphs'] = supporting_paragraphs
-
-    for ind, row in hotpot.iterrows():
-        supporting_paragraphs = row['supporting_paragraphs']
-        supporting_paragraphs = '\n\n'.join(supporting_paragraphs)
-        hotpot.at[ind, 'supporting_paragraphs'] = supporting_paragraphs
-
-    agents = [CoTAgent(row['question'], row['supporting_paragraphs'], row['answer']) for _, row in hotpot.iterrows()]
-    trial = 0
-    log = ''
-    for agent in [a for a in agents if not a.is_correct()]:
-        agent.run(reflect = False)
-        print(f'Answer: {agent.key}')
-    trial += 1
-
-    log += log_trial(agents, trial)
-    correct, incorrect = summarize_trial(agents)
-    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')
-    dicts = [dict(a.__dict__) for a in agents]
-    for d in dicts:
-        for k, v in d.items():
-            d[k] = str(v)
-
-    joblib.dump(dicts, 'output/base_cot/cot_reflect_50_correct_dicts-8-trials.joblib')
-    print(log)
-
-    with open('output/base_cot/100_questions_8_trials.txt', 'w') as f:
-        f.write(log)
-
-    trial = 0
-    log = ''
-    q = 0
-    agents_to_run = [a for a in agents if not a.is_correct()]
-
-    while q < len(agents_to_run):
-        print(f'Trial: {trial} ({q}/{len(agents_to_run)})')
-        agents_to_run[q].run()
-        q += 1
-
-    trial += 1
-
-    log += log_trial(agents, trial)
-    correct, incorrect, halted = summarize_trial(agents)
-    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}')
--- a/hotpotqa_runs/notebooks/CoT_context/CotQA.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_context/CotQA.ipynb
@ -1,52 +1,42 @@
 {
 "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Notebook for running CoT with context + Reflexion"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "import joblib\n",
-    "from react_cls import CoTAgent\n",
-    "from mocks import DocStoreExplorerMock, LLMMock\n",
-    "import numpy as np"
+    "import sys, os\n",
+    "sys.path.append('../../')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "def summarize_trial(agents):\n",
-    "    correct = [a for a in agents if a.is_correct()]\n",
-    "    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]\n",
-    "    return correct, incorrect\n",
-    "\n",
-    "def remove_fewshot(prompt: str) -> str:\n",
-    "    prefix = prompt.split('Here are some examples:')[0]\n",
-    "    suffix = prompt.split('(END OF EXAMPLES)')[1]\n",
-    "    return prefix.strip('\\n').strip() + '\\n' +  suffix.strip('\\n').strip()\n",
-    "\n",
-    "def log_trial(agents, trial_n):\n",
-    "    correct, incorrect = summarize_trial(agents)\n",
-    "\n",
-    "    log = f\"\"\"\n",
-    "########################################\n",
-    "BEGIN TRIAL {trial_n}\n",
-    "Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}\n",
-    "#######################################\n",
-    "\"\"\"\n",
-    "\n",
-    "    log += '------------- BEGIN CORRECT AGENTS -------------\\n\\n'\n",
-    "    for agent in correct:\n",
-    "        log += remove_fewshot(agent._build_agent_prompt()) + f'\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    log += '------------- BEGIN INCORRECT AGENTS -----------\\n\\n'\n",
-    "    for agent in incorrect:\n",
-    "        log += remove_fewshot(agent._build_agent_prompt()) + f'\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    return log"
+    "imp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import joblib\n",
+    "from react_cls import CoTAgent\n",
+    "from mocks import DocStoreExplorerMock, LLMMock\n",
+    "import numpy as np"
   ]
  },
  {
--- a/hotpotqa_runs/notebooks/CoT_context/CotQA_base.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_context/CotQA_base.ipynb
@ -9,38 +9,8 @@
    "import joblib\n",
    "from react_cls import CoTAgent\n",
    "from mocks import DocStoreExplorerMock, LLMMock\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def summarize_trial(agents):\n",
-    "    correct = [a for a in agents if a.is_correct()]\n",
-    "    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]\n",
-    "    return correct, incorrect\n",
-    "\n",
-    "def log_trial(agents, trial_n):\n",
-    "    correct, incorrect = summarize_trial(agents)\n",
-    "\n",
-    "    log = f\"\"\"\n",
-    "########################################\n",
-    "BEGIN TRIAL {trial_n}\n",
-    "Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}\n",
-    "#######################################\n",
-    "\"\"\"\n",
-    "\n",
-    "    log += '------------- BEGIN CORRECT AGENTS -------------\\n\\n'\n",
-    "    for agent in correct:\n",
-    "        log += f'Context: {agent.context} Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    log += '------------- BEGIN INCORRECT AGENTS -----------\\n\\n'\n",
-    "    for agent in incorrect:\n",
-    "        log += f'Context: {agent.context} Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "    return log"
+    "import numpy as np\n",
+    "from util import summarize_trial, log_trial"
   ]
  },
  {
--- a/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple.ipynb
--- a/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple_base.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple_base.ipynb
@ -1,11 +1,31 @@
 {
 "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Notebook for running Chain-of-Thought with no context"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, os\n",
+    "sys.path.append('../..')\n",
+    "root = '../../root/'"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
+    "from util import summarize_trial, log_trial, save_agents\n",
    "import joblib\n",
    "from react_cls import CoTAgent\n",
    "from mocks import DocStoreExplorerMock, LLMMock\n",
@ -13,34 +33,11 @@
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 21,
+   "attachments": {},
+   "cell_type": "markdown",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "def summarize_trial(agents):\n",
-    "    correct = [a for a in agents if a.is_correct()]\n",
-    "    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]\n",
-    "    return correct, incorrect\n",
-    "\n",
-    "def log_trial(agents, trial_n):\n",
-    "    correct, incorrect = summarize_trial(agents)\n",
-    "\n",
-    "    log = f\"\"\"\n",
-    "########################################\n",
-    "BEGIN TRIAL {trial_n}\n",
-    "Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}\n",
-    "#######################################\n",
-    "\"\"\"\n",
-    "\n",
-    "    log += '------------- BEGIN CORRECT AGENTS -------------\\n\\n'\n",
-    "    for agent in correct:\n",
-    "        log += f'Context: {agent.context}\\nQuestion: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    log += '------------- BEGIN INCORRECT AGENTS -----------\\n\\n'\n",
-    "    for agent in incorrect:\n",
-    "        log += f'Context: {agent.context}\\nQuestion: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "    return log"
+    "#### Load the HotPotQA Sample"
   ]
  },
  {
@ -52,6 +49,14 @@
    "hotpot = joblib.load('data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)"
   ]
  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Initialize a CoTAgent for each question"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 23,
@ -67,13 +72,11 @@
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 24,
+   "attachments": {},
+   "cell_type": "markdown",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "trial = 0\n",
-    "log = ''"
+    "#### Run trials"
   ]
  },
  {
@ -1201,6 +1204,8 @@
    }
   ],
   "source": [
+    "trial = 0\n",
+    "log = ''\n",
    "for i in range(5):\n",
    "    for agent in [a for a in agents if not a.is_correct()]:\n",
    "        agent.run(reflect = False)\n",
@ -1212,93 +1217,23 @@
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('output/base_cot_no_context/100_questions_5_trials.txt', 'w') as f:\n",
-    "    f.write(log)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['output/base_cot_no_context/cot_33_correct_dicts-5-trials.joblib']"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dicts = [dict(a.__dict__) for a in agents]\n",
-    "for d in dicts:\n",
-    "    for k, v in d.items():\n",
-    "        d[k] = str(v)\n",
-    "\n",
-    "joblib.dump(dicts, 'output/base_cot_no_context/cot_33_correct_dicts-5-trials.joblib')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
+   "attachments": {},
+   "cell_type": "markdown",
   "metadata": {},
-   "outputs": [],
   "source": [
-    "with open('output/base_cot/100_questions_8_trials.txt', 'w') as f:\n",
-    "    f.write(log)"
+    "#### Save the result log"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
-    "dicts = joblib.load('output/base_cot/cot_reflect_50_correct_dicts-8-trials.joblib')"
+    "with open(root + '/CoT/no_context/last_trial_and_reflexion/100_questions_5_trials.txt', 'w') as f:\n",
+    "    f.write(log)\n",
+    "save_agents(agents, root + '/CoT/no_context/last_trial_and_reflexion/')"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "dict_keys(['question', 'context', 'key', 'agent_prompt', 'reflect_prompt', 'cot_examples', 'reflect_examples', 'llm', 'reflections', 'answer', 'step_n', 'scratchpad', 'finished'])"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dicts[0].keys()\n",
-    "for d in dicts:\n",
-    "    agent = CoTAgent(d['question'], d['context'], d['key'])\n",
-    "    agent.reflections = d['reflections']\n",
-    "    agent.scratchpad = d['scratchpad']\n",
-    "    agent.answer = d['answer']\n",
-    "    agent.step_n = d['step_n']\n",
-    "    agent.finished = d['finished']\n",
-    "    agents.append(agent)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
--- a/hotpotqa_runs/notebooks/ReAct/ReactQA.ipynb
+++ b/hotpotqa_runs/notebooks/ReAct/ReactQA.ipynb
--- a/hotpotqa_runs/notebooks/ReAct/ReactReflectQA.ipynb
+++ b/hotpotqa_runs/notebooks/ReAct/ReactReflectQA.ipynb
--- a/hotpotqa_runs/root/.DS_Store
+++ b/hotpotqa_runs/root/.DS_Store
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/0.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/0.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/1.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/1.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/10.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/10.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/11.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/11.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/12.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/12.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/13.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/13.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/14.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/14.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/15.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/15.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/16.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/16.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/17.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/17.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/18.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/18.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/19.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/19.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/2.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/2.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/20.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/20.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/21.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/21.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/22.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/22.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/23.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/23.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/24.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/24.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/25.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/25.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/26.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/26.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/27.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/27.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/28.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/28.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/29.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/29.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/3.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/3.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/30.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/30.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/31.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/31.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/32.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/32.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/33.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/33.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/34.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/34.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/35.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/35.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/36.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/36.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/37.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/37.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/38.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/38.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/39.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/39.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/4.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/4.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/40.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/40.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/41.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/41.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/42.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/42.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/43.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/43.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/44.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/44.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/45.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/45.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/46.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/46.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/47.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/47.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/48.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/48.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/49.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/49.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/5.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/5.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/50.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/50.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/51.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/51.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/52.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/52.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/53.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/53.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/54.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/54.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/55.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/55.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/56.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/56.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/57.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/57.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/58.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/58.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/59.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/59.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/6.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/6.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/60.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/60.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/61.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/61.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/62.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/62.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/63.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/63.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/64.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/64.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/65.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/65.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/66.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/66.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/67.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/67.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/68.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/68.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/69.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/69.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/7.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/7.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/70.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/70.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/71.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/71.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/72.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/72.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/73.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/73.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/74.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/74.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/75.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/75.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/76.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/76.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/77.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/77.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/78.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/78.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/79.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/79.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/8.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/8.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/80.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/80.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/81.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/81.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/82.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/82.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/83.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/83.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/84.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/84.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/85.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/85.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/86.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/86.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/87.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/87.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/88.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/88.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/89.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/89.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/9.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/9.joblib
--- a/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/90.joblib
+++ b/hotpotqa_runs/root/CoT/context/last_trial/100_questions_5_trials/90.joblib
--- a/Show More
+++ b/Show More