HotPotQA runs

main-private
Beck LaBash 1 year ago committed by elleven11
parent 5269ef4ae0
commit 5942b44c41

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

@@ -0,0 +1,245 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import joblib\n",
"from react_cls import ReactAgent\n",
"from mocks import DocStoreExplorerMock, LLMMock"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def summarize_trial(agents):\n",
" correct = [a for a in agents if a.is_correct()]\n",
" halted = [a for a in agents if a.is_halted()]\n",
" incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]\n",
" return correct, incorrect, halted\n",
"\n",
"def log_trial(agents, trial_n):\n",
" correct, incorrect, halted = summarize_trial(agents)\n",
"\n",
" log = f\"\"\"\n",
"########################################\n",
"BEGIN TRIAL {trial_n}\n",
"Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}\n",
"#######################################\n",
"\"\"\"\n",
"\n",
" log += '------------- BEGIN CORRECT AGENTS -------------\\n\\n'\n",
" for agent in correct:\n",
" log += f'Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
"\n",
" log += '------------- BEGIN INCORRECT AGENTS -----------\\n\\n'\n",
" for agent in incorrect:\n",
" log += f'Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
"\n",
" log += '------------- BEGIN HALTED AGENTS --------------\\n\\n'\n",
" for agent in halted:\n",
" log += f'Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
"\n",
" return log"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"hotpot = joblib.load('data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"agents = [ReactAgent(row['question'], row['answer']) for _, row in hotpot.iterrows()]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"trial = 0\n",
"log = ''"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"q = 0"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trial: 4 (0/66)\n",
"Trial: 4 (1/66)\n",
"Trial: 4 (2/66)\n",
"Trial: 4 (3/66)\n",
"Trial: 4 (4/66)\n",
"Trial: 4 (5/66)\n",
"Trial: 4 (6/66)\n",
"Trial: 4 (7/66)\n",
"Trial: 4 (8/66)\n",
"Trial: 4 (9/66)\n",
"Trial: 4 (10/66)\n",
"Trial: 4 (11/66)\n",
"Trial: 4 (12/66)\n",
"Trial: 4 (13/66)\n",
"Trial: 4 (14/66)\n",
"Trial: 4 (15/66)\n",
"Trial: 4 (16/66)\n",
"Trial: 4 (17/66)\n",
"Trial: 4 (18/66)\n",
"Trial: 4 (19/66)\n",
"Trial: 4 (20/66)\n",
"Trial: 4 (21/66)\n",
"Trial: 4 (22/66)\n",
"Trial: 4 (23/66)\n",
"Trial: 4 (24/66)\n",
"Trial: 4 (25/66)\n",
"Trial: 4 (26/66)\n",
"Trial: 4 (27/66)\n",
"Trial: 4 (28/66)\n",
"Trial: 4 (29/66)\n",
"Trial: 4 (30/66)\n",
"Trial: 4 (31/66)\n",
"Trial: 4 (32/66)\n",
"Trial: 4 (33/66)\n",
"Trial: 4 (34/66)\n",
"Trial: 4 (35/66)\n",
"Trial: 4 (36/66)\n",
"Trial: 4 (37/66)\n",
"Trial: 4 (38/66)\n",
"Trial: 4 (39/66)\n",
"Trial: 4 (40/66)\n",
"Trial: 4 (41/66)\n",
"Trial: 4 (42/66)\n",
"Trial: 4 (43/66)\n",
"Trial: 4 (44/66)\n",
"Trial: 4 (45/66)\n",
"Trial: 4 (46/66)\n",
"Trial: 4 (47/66)\n",
"Trial: 4 (48/66)\n",
"Trial: 4 (49/66)\n",
"Trial: 4 (50/66)\n",
"Trial: 4 (51/66)\n",
"Trial: 4 (52/66)\n",
"Trial: 4 (53/66)\n",
"Trial: 4 (54/66)\n",
"Trial: 4 (55/66)\n",
"Trial: 4 (56/66)\n",
"Trial: 4 (57/66)\n",
"Trial: 4 (58/66)\n",
"Trial: 4 (59/66)\n",
"Trial: 4 (60/66)\n",
"Trial: 4 (61/66)\n",
"Trial: 4 (62/66)\n",
"Trial: 4 (63/66)\n",
"Trial: 4 (64/66)\n",
"Trial: 4 (65/66)\n",
"Finished Trial 5, Correct: 34, Incorrect: 56, Halted: 12\n"
]
}
],
"source": [
"agents_to_run = [a for a in agents if not a.is_correct()]\n",
"\n",
"while q < len(agents_to_run):\n",
" print(f'Trial: {trial} ({q}/{len(agents_to_run)})')\n",
" agents_to_run[q].run()\n",
" q += 1\n",
"\n",
"trial += 1\n",
"\n",
"log += log_trial(agents, trial)\n",
"correct, incorrect, halted = summarize_trial(agents)\n",
"print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"with open('output/base_react/100_questions_5_trials.txt', 'w') as f:\n",
" f.write(log)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['output/base_react_dicts.joblib']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dicts = [dict(a.__dict__) for a in agents]\n",
"for d in dicts:\n",
" for k, v in d.items():\n",
" d[k] = str(v)\n",
"\n",
"joblib.dump(dicts, 'output/base_react_dicts.joblib')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.9"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e23f799cbd2581634725fbf6ce3480ae26192d78438dfafc8efe944acd6490d5"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
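
As an illustrative aside (not part of the commit), the trial pattern this notebook runs cell by cell — rerun only the agents that have not yet answered correctly, then summarize — reduces to one plain loop. The sketch below reuses the notebook's own names (agents, summarize_trial); n_trials is a hypothetical bound.

# Sketch only: n_trials is hypothetical; agents and summarize_trial come
# from the notebook above. Each trial reruns just the not-yet-correct agents.
n_trials = 5
for trial in range(1, n_trials + 1):
    for agent in [a for a in agents if not a.is_correct()]:
        agent.run()
    correct, incorrect, halted = summarize_trial(agents)
    print(f'Finished Trial {trial}, Correct: {len(correct)}, '
          f'Incorrect: {len(incorrect)}, Halted: {len(halted)}')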

@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import joblib\n",
"from react_cls import ReactReflectAgent, format_reflections\n",
"from mocks import DocStoreExplorerMock, LLMMock"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def summarize_trial(agents):\n",
" correct = [a for a in agents if a.is_correct()]\n",
" incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]\n",
" return correct, incorrect\n",
"\n",
"def remove_fewshot(prompt: str) -> str:\n",
" prefix = prompt.split('Here are some examples:')[0]\n",
" suffix = prompt.split('(END OF EXAMPLES)')[1]\n",
" return prefix.strip('\\n').strip() +'\\n' + suffix.strip('\\n').strip()\n",
"\n",
"def log_trial(agents, trial_n):\n",
" correct, incorrect = summarize_trial(agents)\n",
"\n",
" log = f\"\"\"\n",
"########################################\n",
"BEGIN TRIAL {trial_n}\n",
"Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}\n",
"#######################################\n",
"\"\"\"\n",
"\n",
" log += '------------- BEGIN CORRECT AGENTS -------------\\n\\n'\n",
" for agent in correct:\n",
" log += remove_fewshot(agent._build_agent_prompt()) + f'\\nCorrect answer: {agent.key}\\n\\n'\n",
"\n",
" log += '------------- BEGIN INCORRECT AGENTS -----------\\n\\n'\n",
" for agent in incorrect:\n",
" log += remove_fewshot(agent._build_agent_prompt()) + f'\\nCorrect answer: {agent.key}\\n\\n'\n",
"\n",
" return log\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"hotpot = joblib.load('data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"agents = [ReactReflectAgent(row['question'], row['answer']) for _, row in hotpot.iterrows()]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"trial = 0\n",
"log = ''\n",
"last_correct = 0 "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for agent in [a for a in agents if not a.is_correct()]:\n",
" agent.run(reflect_strategy='last_attempt')\n",
" print(f'Answer: {agent.key}')\n",
"trial += 1\n",
"log += log_trial(agents, trial)\n",
"correct, incorrect = summarize_trial(agents)\n",
"print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['output/last_trial_react/react_incorrect_dicts_trial_0.joblib']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dicts = [dict(a.__dict__) for a in incorrect]\n",
"for d in dicts:\n",
" for k, v in d.items():\n",
" d[k] = str(v)\n",
"\n",
"joblib.dump(dicts, 'output/last_trial_react/react_incorrect_dicts_trial_0.joblib')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"while last_correct != correct:\n",
" last_correct, _ = summarize_trial(agents)\n",
" for agent in [a for a in agents if not a.is_correct()]:\n",
" agent.run(reflect_strategy='last_attempt')\n",
" print(f'Answer: {agent.key}')\n",
" trial += 1\n",
" log += log_trial(agents, trial)\n",
" correct, incorrect = summarize_trial(agents)\n",
" print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for agent in [a for a in agents if not a.is_correct()]:\n",
" agent.run(reflect_strategy='last_attempt + reflexion')\n",
" print(f'Answer: {agent.key}')\n",
"trial += 1\n",
"log += log_trial(agents, trial)\n",
"correct, incorrect = summarize_trial(agents)\n",
"print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"with open('output/last_trial_react/100_questions_5_trials.txt', 'w') as f:\n",
" f.write(log)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['output/reflect/react_reflect_50_correct_dicts.joblib']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dicts = [dict(a.__dict__) for a in correct]\n",
"for d in dicts:\n",
" for k, v in d.items():\n",
" d[k] = str(v)\n",
"\n",
"joblib.dump(dicts, 'output/reflect/react_reflect_50_correct_dicts.joblib')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.9"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e23f799cbd2581634725fbf6ce3480ae26192d78438dfafc8efe944acd6490d5"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -0,0 +1,172 @@
import os
from typing import List

import dotenv
import gym
import tiktoken
from langchain import OpenAI
from langchain.llms.base import BaseLLM
from langchain.prompts import PromptTemplate

from environment import QAEnv
from prompts import reflect_prompt, react_agent_prompt, react_reflect_agent_prompt, REFLECTION_HEADER
from fewshots import WEBTHINK_SIMPLE6, REFLECTIONS

dotenv.load_dotenv()


class ReactAgent:
    """
    A question answering ReAct Agent.
    """
    def __init__(self,
                 question: str,
                 env: QAEnv,
                 agent_prompt: PromptTemplate = react_agent_prompt,
                 react_llm: BaseLLM = OpenAI(
                     temperature=0,
                     max_tokens=100,
                     model_name="text-davinci-003",
                     model_kwargs={"stop": "\n"},
                     openai_api_key=os.environ['OPENAI_API_KEY']),
                 ) -> None:
        self.question = question
        self.agent_prompt = agent_prompt
        self.react_examples = WEBTHINK_SIMPLE6

        self.env = env
        self.env.reset()
        self.reset()
        self.truncated, self.reward, self.terminated = False, False, False

        self.llm = react_llm
        self.enc = tiktoken.encoding_for_model("text-davinci-003")

    def run(self, reset=True) -> None:
        if reset:
            self.env.reset()
            self.reset()

        while not (self.is_truncated() or self.is_terminated()):
            self.step()

    def step(self) -> None:
        # Think
        self.scratchpad += f'\nThought {self.curr_step}:'
        self.scratchpad += ' ' + self.prompt_agent()
        print(self.scratchpad.split('\n')[-1])

        # Act
        self.scratchpad += f'\nAction {self.curr_step}:'
        action = self.prompt_agent()
        self.scratchpad += ' ' + action
        print(self.scratchpad.split('\n')[-1])

        # Observe
        self.scratchpad += f'\nObservation {self.curr_step}: '
        observation, self.reward, self.terminated, self.truncated, self.curr_step = self.env.step(action)
        self.scratchpad += observation
        print(self.scratchpad.split('\n')[-1])

    def prompt_agent(self) -> str:
        return format_step(self.llm(self._build_agent_prompt()))

    def _build_agent_prompt(self) -> str:
        return self.agent_prompt.format(
            examples=self.react_examples,
            question=self.question,
            scratchpad=self.scratchpad)

    def is_terminated(self) -> bool:
        return self.env.is_terminated()

    def is_correct(self) -> bool:
        return self.env.is_correct()

    def is_truncated(self) -> bool:
        # Halt when the environment truncates or the prompt would overflow
        # the model's context window.
        return self.env.is_truncated() or (len(self.enc.encode(self._build_agent_prompt())) > 3896)

    def reset(self) -> None:
        self.scratchpad = ''
        self.curr_step = 1


class ReactReflectAgent(ReactAgent):
    """
    A question answering Self-Reflecting React Agent.
    """
    def __init__(self,
                 question: str,
                 env: QAEnv,
                 agent_prompt: PromptTemplate = react_reflect_agent_prompt,
                 reflect_prompt: PromptTemplate = reflect_prompt,
                 react_llm: BaseLLM = OpenAI(
                     temperature=0,
                     max_tokens=100,
                     model_name="text-davinci-003",
                     model_kwargs={"stop": "\n"},
                     openai_api_key=os.environ['OPENAI_API_KEY']),
                 reflect_llm: BaseLLM = OpenAI(
                     temperature=0,
                     max_tokens=250,
                     model_name="text-davinci-003",
                     openai_api_key=os.environ['OPENAI_API_KEY']),
                 ) -> None:
        super().__init__(question, env, agent_prompt, react_llm)
        self.reflect_llm = reflect_llm
        self.reflect_prompt = reflect_prompt
        self.reflect_examples = REFLECTIONS
        self.reflections = []

    def run(self, reset=True) -> None:
        # Reflect on the previous trial before retrying a failed question.
        if (self.is_terminated() or self.is_truncated()) and not self.is_correct():
            self.reflect()
        ReactAgent.run(self, reset)

    def reflect(self) -> None:
        self.reflections.append(self.prompt_reflection())

    def prompt_reflection(self) -> str:
        return format_step(self.reflect_llm(self._build_reflection_prompt()))

    def _build_reflection_prompt(self) -> str:
        return self.reflect_prompt.format(
            examples=self.reflect_examples,
            question=self.question,
            scratchpad=self._format_scratchpad())

    def _build_agent_prompt(self) -> str:
        return self.agent_prompt.format(
            examples=self.react_examples,
            reflections=format_reflections(self.reflections),
            question=self.question,
            scratchpad=self.scratchpad)

    def _format_scratchpad(self) -> str:
        # Elide the longest scratchpad lines until the whole thing fits the
        # reflection prompt's token budget.
        lines = self.scratchpad.split('\n')
        lines_by_tokens = sorted(lines, key=lambda x: len(self.enc.encode(x)))
        while len(self.enc.encode('\n'.join(lines))) > 1600:
            ind = lines.index(lines_by_tokens.pop(-1))
            line = lines[ind]
            lines[ind] = line.split(':')[0] + ': ...'
        return '\n'.join(lines)


### String Operations ###
def format_reflections(reflections: List[str]) -> str:
    if reflections == []:
        return ''
    else:
        header = REFLECTION_HEADER
        return header + 'Reflections:\n- ' + '\n- '.join([r.strip() for r in reflections])


def format_step(step: str) -> str:
    return step.strip('\n').strip().replace('\n', '')
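
A minimal usage sketch for the classes above (illustrative, not part of the commit; assumes OPENAI_API_KEY is set and that this module is importable as react_cls alongside environment.py):

# Sketch only: run one ReactReflectAgent with up to three reflection retries.
from environment import QAEnv
from react_cls import ReactReflectAgent

env = QAEnv(question='Which magazine was started first?', key="Arthur's Magazine")
agent = ReactReflectAgent(question=env.question, env=env)

for _ in range(3):
    if agent.is_correct():
        break
    agent.run()  # run() reflects automatically after a failed trial
print('correct' if agent.is_correct() else 'incorrect')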

@@ -0,0 +1,84 @@
import joblib
from react_cls import CoTAgent
from mocks import DocStoreExplorerMock, LLMMock
import numpy as np


def summarize_trial(agents):
    correct = [a for a in agents if a.is_correct()]
    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]
    return correct, incorrect


def log_trial(agents, trial_n):
    correct, incorrect = summarize_trial(agents)

    log = f"""
########################################
BEGIN TRIAL {trial_n}
Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}
#######################################
"""

    log += '------------- BEGIN CORRECT AGENTS -------------\n\n'
    for agent in correct:
        log += f'Context: {agent.context} Question: {agent.question}{agent.scratchpad}\nCorrect answer: {agent.key}\n\n'

    log += '------------- BEGIN INCORRECT AGENTS -----------\n\n'
    for agent in incorrect:
        log += f'Context: {agent.context} Question: {agent.question}{agent.scratchpad}\nCorrect answer: {agent.key}\n\n'

    return log


if __name__ == '__main__':
    hotpot = joblib.load('data/hotpot-qa-distractor-sample.joblib').reset_index(drop=True)

    # Join each question's supporting sentences into one paragraph per article.
    hotpot['supporting_paragraphs'] = None
    for ind, row in hotpot.iterrows():
        supporting_articles = row['supporting_facts']['title']
        articles = row['context']['title']
        sentences = row['context']['sentences']
        supporting_paragraphs = []
        for article in supporting_articles:
            supporting_paragraph = ''.join(sentences[np.where(articles == article)][0])
            supporting_paragraphs.append(supporting_paragraph)
        hotpot.at[ind, 'supporting_paragraphs'] = supporting_paragraphs

    for ind, row in hotpot.iterrows():
        supporting_paragraphs = row['supporting_paragraphs']
        supporting_paragraphs = '\n\n'.join(supporting_paragraphs)
        hotpot.at[ind, 'supporting_paragraphs'] = supporting_paragraphs

    agents = [CoTAgent(row['question'], row['supporting_paragraphs'], row['answer']) for _, row in hotpot.iterrows()]

    trial = 0
    log = ''

    for agent in [a for a in agents if not a.is_correct()]:
        agent.run(reflect=False)
        print(f'Answer: {agent.key}')
    trial += 1
    log += log_trial(agents, trial)
    correct, incorrect = summarize_trial(agents)
    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')

    dicts = [dict(a.__dict__) for a in agents]
    for d in dicts:
        for k, v in d.items():
            d[k] = str(v)
    joblib.dump(dicts, 'output/base_cot/cot_reflect_50_correct_dicts-8-trials.joblib')

    print(log)
    with open('output/base_cot/100_questions_8_trials.txt', 'w') as f:
        f.write(log)

    trial = 0
    log = ''
    q = 0
    agents_to_run = [a for a in agents if not a.is_correct()]
    while q < len(agents_to_run):
        print(f'Trial: {trial} ({q}/{len(agents_to_run)})')
        agents_to_run[q].run()
        q += 1
    trial += 1
    log += log_trial(agents, trial)
    correct, incorrect = summarize_trial(agents)
    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')

@@ -0,0 +1,101 @@
import re
import string
from typing import Tuple

import gym
from langchain import Wikipedia
from langchain.agents.react.base import DocstoreExplorer


class QAEnv(gym.Env):
    def __init__(self,
                 question: str,
                 key: str,
                 max_steps: int = 6,
                 explorer: DocstoreExplorer = DocstoreExplorer(Wikipedia())):
        self.question = question
        self.key = key
        self.max_steps = max_steps
        self.explorer = explorer

        self.reset()

    def reset(self):
        self.curr_step = 0
        self.terminated = False
        self.answer = ''

    def step(self, action: str) -> Tuple[str, bool, bool, bool, int]:
        action_type, argument = parse_action(action)

        if action_type == 'Finish':
            self.answer = argument
            if self.is_correct():
                observation = 'Answer is CORRECT'
            else:
                observation = 'Answer is INCORRECT'
            self.terminated = True
        elif action_type == 'Search':
            try:
                observation = self.explorer.search(argument).strip('\n').strip()
            except Exception as e:
                print(e)
                observation = 'Could not find that page, please try again.'
        elif action_type == 'Lookup':
            try:
                observation = self.explorer.lookup(argument).strip('\n').strip()
            except ValueError:
                observation = 'The last page Searched was not found, so you cannot Lookup a keyword in it. Please try one of the similar pages given.'
        else:
            observation = 'Invalid Action. Valid Actions are Lookup[<topic>] Search[<topic>] and Finish[<answer>].'

        reward = self.is_correct()
        terminated = self.is_terminated()
        truncated = self.is_truncated()

        self.curr_step += 1

        return observation, reward, terminated, truncated, self.curr_step

    def is_correct(self) -> bool:
        return EM(self.answer, self.key)

    def is_terminated(self) -> bool:
        return self.terminated

    def is_truncated(self) -> bool:
        return self.curr_step >= self.max_steps


def parse_action(text):
    # Expects actions of the form ActionType[argument], e.g. Search[Nile].
    pattern = r'^(\w+)\[(.+)\]$'
    match = re.match(pattern, text)

    if match:
        action_type = match.group(1)
        argument = match.group(2)
        return action_type, argument
    else:
        return None, None


def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def EM(answer, key) -> bool:
    return normalize_answer(answer) == normalize_answer(key)
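
The parsing and grading helpers are pure functions, so they can be sanity-checked offline; a small sketch (illustrative, not part of the commit):

# Sketch only: exercises parse_action and EM without any network access.
from environment import parse_action, EM

assert parse_action('Search[Colorado orogeny]') == ('Search', 'Colorado orogeny')
assert parse_action('malformed action') == (None, None)
assert EM('The Nile River', 'nile river')  # case, punctuation, and articles are ignored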

@@ -0,0 +1,198 @@
WEBTHINK_SIMPLE6 = """Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
Action 1: Search[Colorado orogeny]
Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.
Action 2: Lookup[eastern sector]
Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.
Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.
Action 3: Search[High Plains]
Observation 3: High Plains refers to one of two distinct land regions:
Thought 4: I need to instead search High Plains (United States).
Action 4: Search[High Plains (United States)]
Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]
Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.
Action 5: Finish[1,800 to 7,000 ft]
Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.
Action 1: Search[Milhouse]
Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
Action 2: Lookup[named after]
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
Action 3: Finish[Richard Nixon]
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.
Action 1: Search[Adam Clayton Powell]
Observation 1: Could not find [Adam Clayton Powell]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].
Thought 2: To find the documentary, I can search Adam Clayton Powell (film).
Action 2: Search[Adam Clayton Powell (film)]
Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.
The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.
Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.
Action 3: Finish[The Saimaa Gesture]
Question: What profession does Nicholas Ray and Elia Kazan have in common?
Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.
Action 1: Search[Nicholas Ray]
Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.
Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.
Action 2: Search[Elia Kazan]
Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.
Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.
Action 3: Finish[director, screenwriter, actor]
Question: Which magazine was started first Arthur's Magazine or First for Women?
Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
Action 1: Search[Arthur's Magazine]
Observation 1: Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.
Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
Action 2: Search[First for Women]
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
Action 3: Finish[Arthur's Magazine]
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.
Action 1: Search[Pavel Urysohn]
Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes]"""
REFLECTIONS = """
Previous Trial:
Question: The Rome Protocols were signed by three Prime Ministers one of which was assassinated as part of what?
Thought 1: I need to search Rome Protocols, find the three Prime Ministers, then find what they were assassinated as part of.
Action 1: Search[Rome Protocols]
Observation 1: The Rome Protocols were a series of three international agreements concluded in Rome on 17 March 1934 between the governments of Austria, Hungary and Italy. They were signed by Italian Prime Minister Benito Mussolini, Austrian Chancellor Engelbert Dollfuss and Hungarian Prime Minister Gyula Gömbös. All of the protocols went into effect on 12 July 1934 and were registered in League of Nations Treaty Series on 12 December 1934.
Thought 2: The three Prime Ministers were Benito Mussolini, Engelbert Dollfuss, and Gyula Gömbös. I need to search Benito Mussolini and find what he was assassinated as part of.
Action 2: Search[Benito Mussolini]
Observation 2: Benito Amilcare Andrea Mussolini (UK: MU(U)SS-ə-LEE-nee, US: MOOSS-, Italian: [beˈniːto aˈmilkare anˈdrɛːa mussoˈliːni]; 29 July 1883 – 28 April 1945) was an Italian politician and journalist who founded and led the National Fascist Party (PNF). He was Prime Minister of Italy from the March on Rome in 1922 until his deposition in 1943, as well as "Duce" of Italian fascism from the establishment of the Italian Fasces of Combat in 1919 until his summary execution in 1945 by Italian partisans. As dictator of Italy and principal founder of fascism, Mussolini inspired and supported the international spread of fascist movements during the inter-war period. Mussolini was originally a socialist politician and a journalist at the Avanti! newspaper. In 1912, he became a member of the National Directorate of the Italian Socialist Party (PSI), but he was expelled from the PSI for advocating military intervention in World War I, in opposition to the party's stance on neutrality. In 1914, Mussolini founded a new journal, Il Popolo d'Italia, and served in the Royal Italian Army during the war until he was wounded and discharged in 1917. Mussolini denounced the PSI, his views now centering on Italian nationalism instead of socialism, and later founded the fascist movement which came to oppose egalitarianism and class conflict, instead advocating "revolutionary nationalism" transcending class lines. On 31 October 1922, following the March on Rome (28–30 October), Mussolini was appointed prime minister by King Victor Emmanuel III, becoming the youngest individual to hold the office up to that time. After removing all political opposition through his secret police and outlawing labor strikes, Mussolini and his followers consolidated power through a series of laws that transformed the nation into a one-party dictatorship. Within five years, Mussolini had established dictatorial authority by both legal and illegal means and aspired to create a totalitarian state. In 1929, Mussolini signed the Lateran Treaty with the Holy See to establish Vatican City.
Mussolini's foreign policy aimed to restore the ancient grandeur of the Roman Empire by expanding Italian colonial possessions and the fascist sphere of influence. In the 1920s, he ordered the Pacification of Libya, instructed the bombing of Corfu over an incident with Greece, established a protectorate over Albania, and incorporated the city of Fiume into the Italian state via agreements with Yugoslavia. In 1936, Ethiopia was conquered following the Second Italo-Ethiopian War and merged into Italian East Africa (AOI) with Eritrea and Somalia. In 1939, Italian forces annexed Albania. Between 1936 and 1939, Mussolini ordered the successful Italian military intervention in Spain in favor of Francisco Franco during the Spanish Civil War. Mussolini's Italy initially tried to avoid the outbreak of a second global war, sending troops at the Brenner Pass to delay Anschluss and taking part in the Stresa Front, the Lytton Report, the Treaty of Lausanne, the Four-Power Pact and the Munich Agreement. However, Italy then alienated itself from Britain and France by aligning with Germany and Japan. Germany invaded Poland on 1 September 1939, resulting in declarations of war by France and the UK and the start of World War II.
On 10 June 1940, Mussolini decided to enter the war on the Axis side. Despite initial success, the subsequent Axis collapse on multiple fronts and eventual Allied invasion of Sicily made Mussolini lose the support of the population and members of the Fascist Party. As a consequence, early on 25 July 1943, the Grand Council of Fascism passed a motion of no confidence in Mussolini; later that day King Victor Emmanuel III dismissed him as head of government and had him placed in custody, appointing Pietro Badoglio to succeed him as Prime Minister. After the king agreed to an armistice with the Allies, on 12 September 1943 Mussolini was rescued from captivity in the Gran Sasso raid by German paratroopers and Waffen-SS commandos led by Major Otto-Harald Mors. Adolf Hitler, after meeting with the rescued former dictator, then put Mussolini in charge of a puppet regime in northern Italy, the Italian Social Republic (Italian: Repubblica Sociale Italiana, RSI), informally known as the Salò Republic, causing a civil war. In late April 1945, in the wake of near total defeat, Mussolini and his mistress Clara Petacci attempted to flee to Switzerland, but both were captured by Italian communist partisans and summarily executed by firing squad on 28 April 1945 near Lake Como. The bodies of Mussolini and his mistress were then taken to Milan, where they were hung upside down at a service station to publicly confirm their demise.
Thought 3: Benito Mussolini was assassinated as part of the collapse of the Axis powers during World War II. So the Rome Protocols were signed by three Prime Ministers one of which was assassinated as part of World War II.
Action 3: Finish[World War II]
Reflection: I searched one of the prime ministers involved in the signing, then attempted to answer right away. I should have searched each of the prime ministers, then looked up 'death' on each of their pages in order to get more information before answering.
Previous Trial:
Question: Kam Heskin plays Paige Morgan in a 2004 film directed by who?
Thought 1: I need to search Kam Heskin and Paige Morgan, find the 2004 film they are in, then find the director of the film.
Action 1: Search[Kam Heskin]
Observation 1: Kam Erika Heskin (born May 8, 1973) is an American actress. She began her career playing Caitlin Deschanel on the NBC daytime soap opera Sunset Beach (1998–1999), before appearing in films Planet of the Apes (2001) and Catch Me If You Can (2002). Heskin went on in 2003 to play Elizabeth Bennet in an independent film Pride & Prejudice: A Latter-Day Comedy, Robin in Charmed (2004), and Paige Morgan in The Prince and Me film franchise (2006–2010).
Thought 2: Kam Heskin plays Paige Morgan in The Prince and Me film franchise. I need to search The Prince and Me film franchise and find the director of the 2004 film.
Action 2: Search[The Prince and Me film franchise]
Observation 2: Could not find [The Prince and Me film franchise]. Similar: ['Kam Heskin', 'Prince of Persia', 'Harry Potter and the Half-Blood Prince (film)', 'Prince of Persia: The Sands of Time (film)', 'The Chronicles of Narnia (film series)', 'Majestic Prince (franchise)', 'Kingsman (franchise)', 'The Little Prince (2015 film)', 'Shrek (franchise)', 'Harry Potter (film series)']
Thought 3: I need to search The Prince and Me (2004 film) instead.
Action 3: Search[The Prince and Me (2004 film)]
Observation 3: Could not find [The Prince and Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Kam Heskin', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince of Persia', 'Brooklynn Prince', 'Prince (musician)']
Thought 4: I need to search The Prince & Me (2004 film).
Action 4: Search[The Prince & Me (2004 film)]
Observation 4: Could not find [The Prince & Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince (musician)', 'Prince of Persia', 'Kam Heskin', 'Brooklynn Prince']
Thought 5: I need to search The Prince & Me (2004 film) instead.
Action 5: Search[The Prince & Me (2004 film)]
Observation 5: Could not find [The Prince & Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince (musician)', 'Prince of Persia', 'Kam Heskin', 'Brooklynn Prince']
Thought 6: I need to search The Prince & Me (2004 film).
Action 6: Search[The Prince & Me (2004 film)]
Observation 6: Could not find [The Prince & Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince (musician)', 'Prince of Persia', 'Kam Heskin', 'Brooklynn Prince']
Reflection: I got stuck in a loop where I kept trying to search 'The Prince & Me (2004 film)' but the page could not be found. Instead I should have tried to search the similar results that had a similar name to see if they were made in 2004.
"""
COTQA_SIMPLE6 = """
Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought: Let's think step by step. The eastern sector of Colorado orogeny extends into the High Plains. High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.
Action: Finish[1,800 to 7,000 ft]
Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
Thought: Let's think step by step. Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
Action: Finish[Richard Nixon]
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
Thought: Let's think step by step. Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.
Action: Finish[The Saimaa Gesture]
Question: What profession does Nicholas Ray and Elia Kazan have in common?
Thought: Let's think step by step. Professions of Nicholas Ray are director, screenwriter, and actor. Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.
Action: Finish[director, screenwriter, actor]
Question: Which magazine was started first Arthur's Magazine or First for Women?
Thought: Let's think step by step. Arthur's Magazine was started in 1844. First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
Action: Finish[Arthur's Magazine]
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
Thought: Let's think step by step. Pavel Urysohn is a mathematician. Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action: Finish[Yes]"""
COT_SIMPLE_REFLECTION = """
Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought: Let's think step by step. The eastern sector of Colorado orogeny extends into the Rocky Mountains. The Rocky Mountains rise in elevation from around 1,800 to 14,000 ft, so the answer is 1,800 to 14,000 ft.
Action: Finish[1,800 to 14,000 ft]
Reflection: My reasoning for the elevation range of the eastern sector of the Colorado orogeny failed because I incorrectly associated it with the Rocky Mountains instead of the High Plains. In the future, when attempting this question, I should focus on better understanding the geography and geological formations involved in the Colorado orogeny to avoid confusion.
Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
Thought: Let's think step by step. Milhouse was named after a prominent cartoon character, Mickey Mouse, so the answer is Mickey Mouse.
Action: Finish[Mickey Mouse]
Reflection: My reasoning for the naming of the character Milhouse in "The Simpsons" failed because I incorrectly assumed that the character was named after a prominent cartoon character, Mickey Mouse. In the future, when attempting this question, I should focus on better researching the background of "The Simpsons" and Matt Groening's influences in naming characters to avoid making incorrect assumptions.
"""
COT = """Relevant Context: The Nile River is the longest river in the world, spanning approximately 6,650 kilometers (4,132 miles) in length. It flows through eleven countries in northeastern Africa, including Egypt, Sudan, and Uganda.
Question: What is the longest river in the world?
Thought: The question asks for the longest river in the world, which I know is the Nile River based on the context provided.
Action: Finish[Nile River]
Relevant Context: Ludwig van Beethoven was a German composer and pianist who was a crucial figure in the transition between the Classical and Romantic eras in Western classical music. One of his most famous compositions is the Symphony No. 9, also known as the "Choral" symphony.
Question: Which composer created Symphony No. 9?
Thought: The question is asking for the composer of Symphony No. 9. Based on the context, I know that Ludwig van Beethoven composed this symphony.
Action: Finish[Ludwig van Beethoven]
Relevant Context: Photosynthesis is the process by which green plants and some other organisms convert light energy into chemical energy. During this process, plants absorb sunlight, carbon dioxide, and water to produce glucose and oxygen.
Question: What do plants produce during photosynthesis?
Thought: The question is asking about the products of photosynthesis. From the context, I know that plants produce glucose and oxygen during this process.
Action: Finish[Glucose and Oxygen]
"""
COT_REFLECT = """
Relevant Context: Ernest Hemingway's novel "The Old Man and the Sea" tells the story of Santiago, an aging Cuban fisherman, who struggles to catch a giant marlin in the Gulf Stream. The book won the Pulitzer Prize for Fiction in 1953 and contributed to Hemingway's Nobel Prize for Literature in 1954.
Question: Which literary award did "The Old Man and the Sea" contribute to Hemingway winning?
Thought: The question is asking which award "The Old Man and the Sea" contributed to Hemingway winning. Based on the context, I know the novel won the Pulitzer Prize for Fiction and contributed to his Nobel Prize for Literature.
Action: Finish[Pulitzer Prize for Fiction]
Reflection: My answer was correct based on the context, but may not be the exact answer stored by the grading environment. Next time, I should try to provide a less verbose answer like "Pulitzer Prize" or "Nobel Prize."
Context: On 14 October 1947, Chuck Yeager, a United States Air Force test pilot, became the first person to break the sound barrier by flying the Bell X-1 experimental aircraft at an altitude of 45,000 feet.
Charles Elwood "Chuck" Yeager (13 February 1923 - 7 December 2020) was a United States Air Force officer, flying ace, and test pilot. He is best known for becoming the first person to break the sound barrier, which he achieved in the Bell X-1 aircraft named Glamorous Glennis. Yeager was also a distinguished fighter pilot during World War II and was credited with shooting down at least 12 enemy aircraft. In 1973, he was inducted into the National Aviation Hall of Fame for his significant contributions to aviation.
Question: Who is the first person to break the sound barrier?
Thought: The question is asking for the first person to break the sound barrier. From the context, I know that Chuck Yeager, a United States Air Force test pilot, was the first person to break the sound barrier.
Action: Finish[Chuck Yeager]
Reflection: Upon reflecting on the incorrect answer I provided, I realize that I may not have given the full name of the individual in question. In the context, both the given name and the nickname were mentioned, and I only used the nickname in my response. This could have been the reason my answer was deemed incorrect. Moving forward, when attempting this question again or similar questions, I will make sure to include the complete name of the person, which consists of their given name, any middle names, and their nickname (if applicable). This will help ensure that my answer is more accurate and comprehensive."""
COT_REFLECT2 = """Relevant Context: The novel "To Kill a Mockingbird" was written by Harper Lee and published in 1960. The story takes place in the fictional town of Maycomb, Alabama during the Great Depression. The main characters are Scout Finch, her brother Jem, and their father Atticus Finch, a lawyer.
Question: Where does "To Kill a Mockingbird" take place?
Thought: The question is asking for the setting of "To Kill a Mockingbird." Based on the context, I know that the story takes place in the fictional town of Maycomb, Alabama.
Action: Finish[Mississippi]
Reflection: I made an error in my response, as I incorrectly stated that "To Kill a Mockingbird" takes place in Mississippi. Upon reviewing the context, I realized that the correct answer is the fictional town of Maycomb, Alabama. I may have been confused due to the story's Southern setting. Next time, I should be more cautious and double-check the context before providing an answer.
Relevant Context: Sir Isaac Newton formulated the three laws of motion that are fundamental to classical mechanics. These laws describe the relationship between the motion of an object and the forces acting upon it. They are known as Newton's First Law, Newton's Second Law, and Newton's Third Law.
Question: Who formulated the three laws of motion?
Thought: The question is asking for the person who formulated the three laws of motion. Based on the context, I know that Sir Isaac Newton formulated these laws.
Action: Finish[Albert Einstein]
Reflection: My response was incorrect, as I stated that Albert Einstein formulated the three laws of motion, when the correct answer is Sir Isaac Newton. I may have confused the two scientists due to their significant contributions to the field of physics. In the future, I should be more careful in distinguishing between different scientists and their specific accomplishments before answering a question.
Relevant Context: The Great Wall of China is a series of fortifications made of stone, brick, tamped earth, wood, and other materials. It was built primarily to protect the Chinese Empire from invasions by various nomadic groups. The wall stretches over 13,000 miles (21,196 kilometers) and was constructed by several Chinese dynasties.
Question: What is the primary purpose of the Great Wall of China?
Thought: The question is asking about the primary purpose of the Great Wall of China. From the context, I know that it was built primarily to protect the Chinese Empire from invasions by various nomadic groups.
Action: Finish[Trade]
Reflection: My answer was incorrect, as I stated that the primary purpose of the Great Wall of China was for trade. After reevaluating the context, I realized that the wall was built primarily to protect the Chinese Empire from invasions by various nomadic groups. My confusion may have stemmed from the fact that the wall is a significant historical landmark that may have played a role in trade routes. Next time, I will focus on the key details provided in the context to ensure the accuracy of my response.
"""

@@ -0,0 +1,43 @@
from langchain.agents.react.base import DocstoreExplorer
from langchain.llms.base import BaseLLM


def reactLLMMock(prompt: str) -> str:
    last_line = prompt.split('\n')[-1].strip()
    last_action = last_line.split(' ')[0].lower()
    if last_action == 'thought':
        return 'It does not mention the eastern sector. So I need to look up eastern sector.'
    elif last_action == 'action':
        return 'Lookup[eastern sector]'
    else:
        raise Exception('Invalid action type')


def reflectLLMMock(prompt: str) -> str:
    return "Last time I should have answered correctly"


class LLMMock(BaseLLM):
    def __init__(self):
        ...

    def __call__(self, prompt: str) -> str:
        # Route on the prompt's first word: ReAct prompts start with 'Solve',
        # reflection prompts start with 'You'.
        if prompt.split('\n')[0].split(' ')[0] == 'Solve':
            return reactLLMMock(prompt)
        elif prompt.split('\n')[0].split(' ')[0] == 'You':
            return reflectLLMMock(prompt)
        else:
            raise Exception("Invalid LLM prompt")

    def get_num_tokens(self, text: str) -> int:
        return 0


class DocStoreExplorerMock(DocstoreExplorer):
    def __init__(self):
        self.summary = "The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas."
        self.body = "(Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny."

    def search(self, search: str, sents: int = 5) -> str:
        return self.summary

    def lookup(self, term: str) -> str:
        return self.body
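
These mocks exist so the agent plumbing can be exercised without OpenAI or Wikipedia calls; a small offline sketch (illustrative, not part of the commit):

# Sketch only: the canned docstore returns fixed Colorado-orogeny passages.
from mocks import DocStoreExplorerMock

explorer = DocStoreExplorerMock()
print(explorer.search('Colorado orogeny'))  # canned summary paragraph
print(explorer.lookup('eastern sector'))    # canned lookup result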

@@ -0,0 +1,142 @@
from langchain.prompts import PromptTemplate

COT_INSTRUCTION = """Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task. You will be given context that you should use to help you answer the question.
Here are some examples:
{examples}
(END OF EXAMPLES)
{reflections}
Relevant Context: {context}
Question: {question}{scratchpad}"""

COT_AGENT_REFLECT_INSTRUCTION = """Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task. You will be given context that you should use to help you answer the question.
Here are some examples:
{examples}
(END OF EXAMPLES)
{reflections}
Relevant Context: {context}
Question: {question}{scratchpad}"""

COT_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self reflection. You will be given a previous reasoning trial in which you were given access to relevant context and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[<answer>] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
Here are some examples:
{examples}
(END OF EXAMPLES)
Previous trial:
Relevant Context: {context}
Question: {question}{scratchpad}
Reflection:"""

cot_agent_prompt = PromptTemplate(
    input_variables=["examples", "reflections", "context", "question", "scratchpad"],
    template=COT_INSTRUCTION,
)

cot_reflect_agent_prompt = PromptTemplate(
    input_variables=["examples", "reflections", "context", "question", "scratchpad"],
    template=COT_AGENT_REFLECT_INSTRUCTION,
)

cot_reflect_prompt = PromptTemplate(
    input_variables=["examples", "context", "question", "scratchpad"],
    template=COT_REFLECT_INSTRUCTION,
)

COT_SIMPLE_INSTRUCTION = """Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task.
Here are some examples:
{examples}
(END OF EXAMPLES)
{reflections}
{context}
Question: {question}{scratchpad}"""

COT_SIMPLE_AGENT_REFLECT_INSTRUCTION = """Solve a question answering task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task.
Here are some examples:
{examples}
(END OF EXAMPLES)
{context}
{reflections}
Question: {question}{scratchpad}"""

COT_SIMPLE_REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self reflection. You will be given a previous reasoning trial in which you were given a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[<answer>] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
Here are some examples:
{examples}
(END OF EXAMPLES)
{context}
Previous trial:
Question: {question}{scratchpad}
Reflection:"""

cot_simple_agent_prompt = PromptTemplate(
    input_variables=["examples", "question", "reflections", "context", "scratchpad"],
    template=COT_SIMPLE_INSTRUCTION,
)

cot_simple_reflect_agent_prompt = PromptTemplate(
    input_variables=["examples", "context", "reflections", "question", "scratchpad"],
    template=COT_SIMPLE_AGENT_REFLECT_INSTRUCTION,
)

cot_simple_reflect_prompt = PromptTemplate(
    input_variables=["examples", "question", "context", "scratchpad"],
    template=COT_SIMPLE_REFLECT_INSTRUCTION,
)

REACT_INSTRUCTION = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the last passage successfully found by Search.
(3) Finish[answer], which returns the answer and finishes the task.
You may take as many steps as necessary.
Here are some examples:
{examples}
(END OF EXAMPLES)
Question: {question}{scratchpad}"""

REACT_REFLECT_INSTRUCTION = """Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the last passage successfully found by Search.
(3) Finish[answer], which returns the answer and finishes the task.
You may take as many steps as necessary.
Here are some examples:
{examples}
(END OF EXAMPLES)
{reflections}
Question: {question}{scratchpad}"""

REFLECTION_HEADER = 'You have attempted to answer the following question before and failed. The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question.\n'

REFLECTION_AFTER_LAST_TRIAL_HEADER = 'The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question.\n'

LAST_TRIAL_HEADER = 'You have attempted to answer the following question before and failed. Below is the last trial you attempted to answer the question.\n'

REFLECT_INSTRUCTION = """You are an advanced reasoning agent that can improve based on self reflection. You will be given a previous reasoning trial in which you were given access to a Docstore API environment and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[<answer>], or you used up your set number of reasoning steps. In a few sentences, diagnose a possible reason for failure and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
Here are some examples:
{examples}
Previous trial:
Question: {question}{scratchpad}
Reflection:"""

react_agent_prompt = PromptTemplate(
    input_variables=["examples", "question", "scratchpad"],
    template=REACT_INSTRUCTION,
)

react_reflect_agent_prompt = PromptTemplate(
    input_variables=["examples", "reflections", "question", "scratchpad"],
    template=REACT_REFLECT_INSTRUCTION,
)

reflect_prompt = PromptTemplate(
    input_variables=["examples", "question", "scratchpad"],
    template=REFLECT_INSTRUCTION,
)
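
Since these are ordinary LangChain PromptTemplates, a complete agent prompt is a single format() call; a short sketch (illustrative, not part of the commit):

# Sketch only: renders the ReAct prompt for a fresh question.
from prompts import react_agent_prompt
from fewshots import WEBTHINK_SIMPLE6

prompt = react_agent_prompt.format(
    examples=WEBTHINK_SIMPLE6,
    question='Which magazine was started first?',
    scratchpad='')
print(prompt[:300])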

@@ -0,0 +1,379 @@
import re, string, os
from typing import List, Union, Literal
import tiktoken
from langchain import OpenAI, Wikipedia
from langchain.llms.base import BaseLLM
from langchain.agents.react.base import DocstoreExplorer
from langchain.docstore.base import Docstore
from langchain.prompts import PromptTemplate
from prompts import reflect_prompt, react_agent_prompt, react_reflect_agent_prompt, REFLECTION_HEADER, LAST_TRIAL_HEADER, REFLECTION_AFTER_LAST_TRIAL_HEADER
from prompts import cot_agent_prompt, cot_reflect_agent_prompt, cot_reflect_prompt, COT_INSTRUCTION, COT_REFLECT_INSTRUCTION
from fewshots import WEBTHINK_SIMPLE6, REFLECTIONS, COT, COT_REFLECT
class CoTAgent:
def __init__(self,
question: str,
context: str,
key: str,
agent_prompt: PromptTemplate = cot_reflect_agent_prompt,
reflect_prompt: PromptTemplate = cot_reflect_prompt,
reflect_header: str = REFLECTION_HEADER,
cot_examples: str = COT,
reflect_examples: str = COT_REFLECT,
self_reflect_llm: BaseLLM = OpenAI(
temperature=0,
max_tokens=250,
model_name="text-davinci-003",
model_kwargs={"stop": "\n"},
openai_api_key=os.environ['OPENAI_API_KEY']),
action_llm: BaseLLM = OpenAI(
temperature=0,
max_tokens=250,
model_name="text-davinci-003",
model_kwargs={"stop": "\n"},
openai_api_key=os.environ['OPENAI_API_KEY']),
) -> None:
self.question = question
self.context = context
self.key = key
self.agent_prompt = agent_prompt
self.reflect_prompt = reflect_prompt
self.reflect_header = reflect_header
self.cot_examples = cot_examples
self.reflect_examples = reflect_examples
self.self_reflect_llm = self_reflect_llm
self.action_llm = action_llm
self.reflections: List[str] = []
self.reflections_str = ''
self.answer = ''
self.step_n: int = 0
self.reset()
def run(self, reflect: bool = True,
reflect_strategy: Union[Literal['last_attempt'],
Literal['reflexion'],
Literal['last_attempt + reflexion']] = 'reflexion') -> None:
if self.step_n > 0 and not self.is_correct() and reflect:
self.reflect(reflect_strategy)
self.reset()
self.step()
self.step_n += 1
    def step(self) -> None:
        # Think
        self.scratchpad += '\nThought:'
        self.scratchpad += ' ' + self.prompt_agent()
        print(self.scratchpad.split('\n')[-1])
        # Act
        self.scratchpad += '\nAction:'
        action = self.prompt_agent()
        self.scratchpad += ' ' + action
        action_type, argument = parse_action(action)
        print(self.scratchpad.split('\n')[-1])
        # Observe
        self.scratchpad += '\nObservation: '
        if action_type == 'Finish':
            self.answer = argument
            if self.is_correct():
                self.scratchpad += 'Answer is CORRECT'
            else:
                self.scratchpad += 'Answer is INCORRECT'
            self.finished = True
            return
        else:
            # Record the failure in the scratchpad so the agent sees the feedback on the next prompt.
            self.scratchpad += 'Invalid action type, please try again.'
            print(self.scratchpad.split('\n')[-1])
def reflect(self,
strategy: Union[Literal['last_attempt'],
Literal['reflexion'],
Literal['last_attempt + reflexion']]) -> None:
print('Reflecting...')
if strategy == 'last_attempt':
self.reflections = [self.scratchpad]
            self.reflections_str = format_last_attempt(self.question, self.reflections[0])
elif strategy == 'reflexion':
self.reflections += [self.prompt_reflection()]
self.reflections_str = format_reflections(self.reflections)
elif strategy == 'last_attempt + reflexion':
            self.reflections_str = format_last_attempt(self.question, self.scratchpad)
self.reflections = [self.prompt_reflection()]
self.reflections_str += '\n'+ format_reflections(self.reflections, header = REFLECTION_AFTER_LAST_TRIAL_HEADER)
else:
raise NotImplementedError(f'Unknown reflection strategy: {strategy}')
print(self.reflections_str)
def prompt_reflection(self) -> str:
return format_step(self.self_reflect_llm(self._build_reflection_prompt()))
def reset(self) -> None:
self.scratchpad: str = ''
self.finished = False
def prompt_agent(self) -> str:
return format_step(self.action_llm(self._build_agent_prompt()))
def _build_agent_prompt(self) -> str:
return self.agent_prompt.format(
examples = self.cot_examples,
reflections = self.reflections_str,
context = self.context,
question = self.question,
scratchpad = self.scratchpad)
def _build_reflection_prompt(self) -> str:
return self.reflect_prompt.format(
examples = self.reflect_examples,
context = self.context,
question = self.question,
scratchpad = self.scratchpad)
def is_finished(self) -> bool:
return self.finished
def is_correct(self) -> bool:
return EM(self.answer, self.key)
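# A minimal trial-loop sketch (hypothetical inputs) for CoTAgent: the first run
# answers without reflections; later runs reflect first, because step_n > 0 and
# is_correct() is False after a failed trial.
#
#   agent = CoTAgent(question='...', context='...', key='gold answer')
#   for _ in range(4):                      # up to 4 trials
#       agent.run(reflect_strategy='reflexion')
#       if agent.is_correct():
#           break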
class ReactAgent:
def __init__(self,
question: str,
key: str,
max_steps: int = 6,
agent_prompt: PromptTemplate = react_agent_prompt,
docstore: Docstore = Wikipedia(),
react_llm: BaseLLM = OpenAI(
temperature=0,
max_tokens=100,
model_name="text-davinci-003",
model_kwargs={"stop": "\n"},
openai_api_key=os.environ['OPENAI_API_KEY']),
) -> None:
self.question = question
self.answer = ''
self.key = key
self.max_steps = max_steps
self.agent_prompt = agent_prompt
self.react_examples = WEBTHINK_SIMPLE6
self.docstore = DocstoreExplorer(docstore) # Search, Lookup
self.llm = react_llm
self.enc = tiktoken.encoding_for_model("text-davinci-003")
self.__reset_agent()
def run(self, reset = True) -> None:
if reset:
self.__reset_agent()
while not self.is_halted() and not self.is_finished():
self.step()
def step(self) -> None:
# Think
self.scratchpad += f'\nThought {self.step_n}:'
self.scratchpad += ' ' + self.prompt_agent()
print(self.scratchpad.split('\n')[-1])
# Act
self.scratchpad += f'\nAction {self.step_n}:'
action = self.prompt_agent()
self.scratchpad += ' ' + action
action_type, argument = parse_action(action)
print(self.scratchpad.split('\n')[-1])
# Observe
self.scratchpad += f'\nObservation {self.step_n}: '
if action_type == 'Finish':
self.answer = argument
if self.is_correct():
self.scratchpad += 'Answer is CORRECT'
else:
self.scratchpad += 'Answer is INCORRECT'
self.finished = True
self.step_n += 1
return
        if action_type == 'Search':
            try:
                self.scratchpad += format_step(self.docstore.search(argument))
            except Exception as e:
                print(e)
                self.scratchpad += 'Could not find that page, please try again.'
        elif action_type == 'Lookup':
            try:
                self.scratchpad += format_step(self.docstore.lookup(argument))
            except ValueError:
                self.scratchpad += 'The last page Searched was not found, so you cannot Lookup a keyword in it. Please try one of the similar pages given.'
        else:
            self.scratchpad += 'Invalid Action. Valid Actions are Lookup[<topic>] Search[<topic>] and Finish[<answer>].'
print(self.scratchpad.split('\n')[-1])
self.step_n += 1
def prompt_agent(self) -> str:
return format_step(self.llm(self._build_agent_prompt()))
def _build_agent_prompt(self) -> str:
return self.agent_prompt.format(
examples = self.react_examples,
question = self.question,
scratchpad = self.scratchpad)
def is_finished(self) -> bool:
return self.finished
def is_correct(self) -> bool:
return EM(self.answer, self.key)
    def is_halted(self) -> bool:
        # Halt when the step budget is spent or the prompt nears the context window
        # (3896 prompt tokens leaves headroom for the completion).
        return ((self.step_n > self.max_steps) or (len(self.enc.encode(self._build_agent_prompt())) > 3896)) and not self.finished
def __reset_agent(self) -> None:
self.step_n = 1
self.finished = False
self.scratchpad: str = ''
def set_qa(self, question: str, key: str) -> None:
self.question = question
self.key = key
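# A minimal usage sketch (hypothetical inputs) for ReactAgent: run() loops
# step() until the agent emits Finish[<answer>] or is_halted() trips.
#
#   agent = ReactAgent(question='Kam Heskin plays Paige Morgan in a 2004 film directed by who?',
#                      key='Martha Coolidge')
#   agent.run()
#   print(agent.is_correct(), agent.answer)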
class ReactReflectAgent(ReactAgent):
def __init__(self,
question: str,
key: str,
max_steps: int = 6,
agent_prompt: PromptTemplate = react_reflect_agent_prompt,
reflect_prompt: PromptTemplate = reflect_prompt,
reflect_header: str = REFLECTION_HEADER,
docstore: Docstore = Wikipedia(),
react_llm: BaseLLM = OpenAI(
temperature=0,
max_tokens=100,
model_name="text-davinci-003",
model_kwargs={"stop": "\n"},
openai_api_key=os.environ['OPENAI_API_KEY']),
reflect_llm: BaseLLM = OpenAI(
temperature=0,
max_tokens=250,
model_name="text-davinci-003",
openai_api_key=os.environ['OPENAI_API_KEY']),
) -> None:
super().__init__(question, key, max_steps, agent_prompt, docstore, react_llm)
self.reflect_header = reflect_header
self.reflect_llm = reflect_llm
self.reflect_prompt = reflect_prompt
self.reflect_examples = REFLECTIONS
self.reflections: List[str] = []
self.reflections_str: str = ''
def run(self, reset = True, reflect_strategy: Union[Literal['last_attempt'], Literal['reflexion'], Literal['last_attempt + reflexion']] = 'reflexion') -> None:
if (self.is_finished() or self.is_halted()) and not self.is_correct():
self.reflect(reflect_strategy)
ReactAgent.run(self, reset)
def reflect(self,
strategy: Union[Literal['last_attempt'], Literal['reflexion'], Literal['last_attempt + reflexion']]) -> None:
print('Reflecting...')
if strategy == 'last_attempt':
self.reflections = [self.scratchpad]
self.reflections_str = format_last_attempt(self.question, self.reflections[0])
elif strategy == 'reflexion':
self.reflections += [self.prompt_reflection()]
self.reflections_str = format_reflections(self.reflections)
elif strategy == 'last_attempt + reflexion':
self.reflections_str = format_last_attempt(self.question, self.scratchpad)
self.reflections = [self.prompt_reflection()]
self.reflections_str += format_reflections(self.reflections, header = REFLECTION_AFTER_LAST_TRIAL_HEADER)
else:
raise NotImplementedError(f'Unknown reflection strategy: {strategy}')
print(self.reflections_str)
def prompt_reflection(self) -> str:
return format_step(self.reflect_llm(self._build_reflection_prompt()))
def _build_reflection_prompt(self) -> str:
return self.reflect_prompt.format(
examples = self.reflect_examples,
question = self.question,
scratchpad = truncate_scratchpad(self.scratchpad, tokenizer=self.enc))
def _build_agent_prompt(self) -> str:
return self.agent_prompt.format(
examples = self.react_examples,
reflections = self.reflections_str,
question = self.question,
scratchpad = self.scratchpad)
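# A minimal multi-trial sketch (hypothetical inputs) for ReactReflectAgent:
# repeated run() calls reflect automatically whenever the previous trial
# finished or halted without a correct answer.
#
#   agent = ReactReflectAgent(question='...', key='...')
#   for _ in range(4):
#       agent.run(reflect_strategy='last_attempt + reflexion')
#       if agent.is_correct():
#           break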
### String Stuff ###
# Tokenizer used for prompt-length checks (tiktoken's encoding for text-davinci-003);
# the historical gpt2_enc name is kept because the helpers below reference it.
gpt2_enc = tiktoken.encoding_for_model("text-davinci-003")
def parse_action(string):
    # Actions take the form Type[argument], e.g. Search[Rome Protocols].
    pattern = r'^(\w+)\[(.+)\]$'
    match = re.match(pattern, string)
    if match:
        action_type = match.group(1)
        argument = match.group(2)
        return action_type, argument
    else:
        # Return a pair so callers can always unpack `action_type, argument`.
        return None, None
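# Examples of the Action grammar parse_action expects:
#   parse_action('Search[Rome Protocols]')  -> ('Search', 'Rome Protocols')
#   parse_action('Finish[World War II]')    -> ('Finish', 'World War II')
#   parse_action('not an action')           -> (None, None)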
def format_step(step: str) -> str:
return step.strip('\n').strip().replace('\n', '')
def format_reflections(reflections: List[str],
header: str = REFLECTION_HEADER) -> str:
if reflections == []:
return ''
else:
return header + 'Reflections:\n- ' + '\n- '.join([r.strip() for r in reflections])
def format_last_attempt(question: str,
scratchpad: str,
header: str = LAST_TRIAL_HEADER):
return header + f'Question: {question}\n' + truncate_scratchpad(scratchpad, tokenizer=gpt2_enc).strip('\n').strip() + '\n(END PREVIOUS TRIAL)\n'
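# Sketch of the rendered reflections block (hypothetical reflection text):
#   format_reflections(['Search each prime minister before answering.'])
# yields REFLECTION_HEADER followed by:
#   Reflections:
#   - Search each prime minister before answering.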
def truncate_scratchpad(scratchpad: str, n_tokens: int = 1600, tokenizer = gpt2_enc) -> str:
    # Replace the largest Observation lines first until the scratchpad fits in n_tokens.
    lines = scratchpad.split('\n')
    observations = filter(lambda x: x.startswith('Observation'), lines)
    observations_by_tokens = sorted(observations, key=lambda x: len(tokenizer.encode(x)))
    while len(tokenizer.encode('\n'.join(lines))) > n_tokens and observations_by_tokens:
        largest_observation = observations_by_tokens.pop(-1)
        ind = lines.index(largest_observation)
        lines[ind] = largest_observation.split(':')[0] + ': [truncated wikipedia excerpt]'
    return '\n'.join(lines)
def normalize_answer(s):
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def EM(answer, key) -> bool:
return normalize_answer(answer) == normalize_answer(key)
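# normalize_answer lowercases, strips punctuation and articles, and collapses
# whitespace, so EM tolerates surface differences:
#   EM('The Prince & Me', 'prince & me')  -> True   (both normalize to 'prince me')
#   EM('World War II', 'WWII')            -> False  (different tokens)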

@ -0,0 +1,12 @@
EdgeGPT==0.3.6
gym==0.26.2
joblib==1.2.0
langchain==0.0.162
numpy==1.24.1
openai==0.27.4
python-dotenv==1.0.0
tenacity==8.2.2
tiktoken==0.4.0
transformers==4.28.1
pandas==1.5.3
scikit-learn
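# Install the pinned dependencies with: pip install -r requirements.txt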

Binary file not shown.

@ -0,0 +1,36 @@
Previous Trial:
Question: The Rome Protocols were signed by three Prime Ministers one of which was assassinated as part of what?
Thought 1: I need to search Rome Protocols, find the three Prime Ministers, then find what they were assassinated as part of.
Action 1: Search[Rome Protocols]
Observation 1: The Rome Protocols were a series of three international agreements concluded in Rome on 17 March 1934 between the governments of Austria, Hungary and Italy. They were signed by Italian Prime Minister Benito Mussolini, Austrian Chancellor Engelbert Dollfuss and Hungarian Prime Minister Gyula Gömbös. All of the protocols went into effect on 12 July 1934 and were registered in League of Nations Treaty Series on 12 December 1934.
Thought 2: The three Prime Ministers were Benito Mussolini, Engelbert Dollfuss, and Gyula Gömbös. I need to search Benito Mussolini and find what he was assassinated as part of.
Action 2: Search[Benito Mussolini]
Observation 2: Benito Amilcare Andrea Mussolini (UK: MU(U)SS-ə-LEE-nee, US: MOOSS-, Italian: [beˈniːto aˈmilkare anˈdrɛːa mussoˈliːni]; 29 July 1883 – 28 April 1945) was an Italian politician and journalist who founded and led the National Fascist Party (PNF). He was Prime Minister of Italy from the March on Rome in 1922 until his deposition in 1943, as well as "Duce" of Italian fascism from the establishment of the Italian Fasces of Combat in 1919 until his summary execution in 1945 by Italian partisans...
Thought 3: Benito Mussolini was assassinated as part of the collapse of the Axis powers during World War II. So the Rome Protocols were signed by three Prime Ministers one of which was assassinated as part of World War II.
Action 3: Finish[World War II]
Reflection: I searched one of the prime ministers involved in the signing, then attempted to answer right away. I should have searched each of the prime ministers, then looked up 'death' on each of their pages in order to get more information before answering.
Previous Trial:
Question: Kam Heskin plays Paige Morgan in a 2004 film directed by who?
Thought 1: I need to search Kam Heskin and Paige Morgan, find the 2004 film they are in, then find the director of the film.
Action 1: Search[Kam Heskin]
Observation 1: Kam Erika Heskin (born May 8, 1973) is an American actress. She began her career playing Caitlin Deschanel on the NBC daytime soap opera Sunset Beach (1998–1999), before appearing in films Planet of the Apes (2001) and Catch Me If You Can (2002). Heskin went on in 2003 to play Elizabeth Bennet in an independent film Pride & Prejudice: A Latter-Day Comedy, Robin in Charmed (2004), and Paige Morgan in The Prince and Me film franchise (2006–2010).
Thought 2: Kam Heskin plays Paige Morgan in The Prince and Me film franchise. I need to search The Prince and Me film franchise and find the director of the 2004 film.
Action 2: Search[The Prince and Me film franchise]
Observation 2: Could not find [The Prince and Me film franchise]. Similar: ['Kam Heskin', 'Prince of Persia', 'Harry Potter and the Half-Blood Prince (film)', 'Prince of Persia: The Sands of Time (film)', 'The Chronicles of Narnia (film series)', 'Majestic Prince (franchise)', 'Kingsman (franchise)', 'The Little Prince (2015 film)', 'Shrek (franchise)', 'Harry Potter (film series)']
Thought 3: I need to search The Prince and Me (2004 film) instead.
Action 3: Search[The Prince and Me (2004 film)]
Observation 3: Could not find [The Prince and Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Kam Heskin', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince of Persia', 'Brooklynn Prince', 'Prince (musician)']
Thought 4: I need to search The Prince & Me (2004 film).
Action 4: Search[The Prince & Me (2004 film)]
Observation 4: Could not find [The Prince & Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince (musician)', 'Prince of Persia', 'Kam Heskin', 'Brooklynn Prince']
Thought 5: I need to search The Prince & Me (2004 film) instead.
Action 5: Search[The Prince & Me (2004 film)]
Observation 5: Could not find [The Prince & Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince (musician)', 'Prince of Persia', 'Kam Heskin', 'Brooklynn Prince']
Thought 6: I need to search The Prince & Me (2004 film).
Action 6: Search[The Prince & Me (2004 film)]
Observation 6: Could not find [The Prince & Me (2004 film)]. Similar: ['The Prince & Me', 'The Prince & Me 2: The Royal Wedding', 'Prince of Darkness (film)', 'Prince of Persia: The Sands of Time (film)', 'Rob Knox', 'Alexander (2004 film)', 'Prince (musician)', 'Prince of Persia', 'Kam Heskin', 'Brooklynn Prince']
Reflection: I got stuck in a loop where I kept trying to search 'The Prince & Me (2004 film)' but the page could not be found. Instead, I should have searched the similar results that had a similar name to see if any of them was the 2004 film.

File diff suppressed because it is too large

File diff suppressed because it is too large