added notebook

pull/1106/head
Shahules786 3 months ago
parent 74ba01c3e7
commit ea54dc29e7

@ -7,6 +7,10 @@
"source": [
"# Introduction\n",
"\n",
"<a target=\"_blank\" href=\"https://colab.research.google.com/github/shahules786/openai-cookbook/blob/ragas/examples/evaluation/ragas/openai-ragas-eval-cookbook.ipynb\">\n",
" <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
"</a>\n",
"\n",
"Ragas is the de-facto opensource standard for RAG evaluations. Ragas provides features and methods to help evaluate RAG applications. In this notebook we will cover basic steps for evaluating your RAG application with Ragas. \n",
"\n",
"### Contents\n",
@ -33,17 +37,18 @@
"metadata": {},
"outputs": [],
"source": [
"! pip install ragas"
"! pip install -q ragas"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebc22310-92b3-4819-ae93-2b9a85c25ef7",
"execution_count": 16,
"id": "f9e283b4-ae3f-4e76-b990-5b890b5364fa",
"metadata": {},
"outputs": [],
"source": [
"! export OPENAI_API_KEY=\"your-openai-key\""
"import os\n",
"os.environ[\"OPENAI_API_KEY\"] = \"<your-openai-key>\""
]
},
{
@ -73,24 +78,114 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "326f3dc1-775f-4ec5-8f27-afd76e9b5b22",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"PATH = \"\""
"from datasets import load_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"id": "1ee49bc5-4661-4435-8463-197877c18fa3",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset json (/Users/shahules/.cache/huggingface/datasets/explodinggradients___json/explodinggradients--prompt-engineering-guide-papers-9147f70034f5334d/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89035d84b3e04d489f59ef9673fa716a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>ground_truth</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How does instruction tuning affect the zero-sh...</td>\n",
" <td>For larger models on the order of 100B paramet...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What is the Zero-shot-CoT method and how does ...</td>\n",
" <td>Zero-shot-CoT is a zero-shot template-based pr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>How does prompt tuning affect model performanc...</td>\n",
" <td>Prompt tuning improves model performance in im...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>What is the purpose of instruction tuning in l...</td>\n",
" <td>The purpose of instruction tuning in language ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>What distinguishes Zero-shot-CoT from Few-shot...</td>\n",
" <td>Zero-shot-CoT differs from Few-shot-CoT in tha...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question ground_truth\n",
"0 How does instruction tuning affect the zero-sh... For larger models on the order of 100B paramet...\n",
"1 What is the Zero-shot-CoT method and how does ... Zero-shot-CoT is a zero-shot template-based pr...\n",
"2 How does prompt tuning affect model performanc... Prompt tuning improves model performance in im...\n",
"3 What is the purpose of instruction tuning in l... The purpose of instruction tuning in language ...\n",
"4 What distinguishes Zero-shot-CoT from Few-shot... Zero-shot-CoT differs from Few-shot-CoT in tha..."
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"eval_dataset = load_dataset(PATH)\n",
"eval_dataset"
"eval_dataset = load_dataset(\"explodinggradients/prompt-engineering-guide-papers\")\n",
"eval_dataset = eval_dataset['test'].to_pandas()\n",
"eval_dataset.head()"
]
},
{
@ -98,42 +193,265 @@
"id": "84ae0719-82bc-4103-8299-3df7021951e1",
"metadata": {},
"source": [
"As you can see, the dataset contains all the required attributes mentioned above. Now we can move on our next step of actually doing the evaluation with it.\n",
"As you can see, the dataset contains two of the required attributes mentioned,that is `question` and `ground_truth` answers. Now we can move on our next step to collect the other two attributes.\n",
"\n",
"**Note:**\n",
"*We know that it's hard to formulate a test data containing Question and ground truth answer pairs when starting out. We have the perfect solution for this in this form of a ragas synthetic test data generation feature. The questions and ground truth answers were created by [ragas synthetic data generation]() feature. Check it out here once you finish this notebook*"
]
},
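{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of how such a test set could be generated, assuming the `TestsetGenerator` API of ragas 0.1.x. It is left commented out because the source `documents` are only loaded further down in this notebook, and it is a starting point rather than the exact recipe used for this dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only -- assumes the ragas 0.1.x testset API and that `documents`\n",
"# (loaded later in this notebook) is available.\n",
"# from ragas.testset.generator import TestsetGenerator\n",
"# from ragas.testset.evolutions import simple, reasoning, multi_context\n",
"\n",
"# generator = TestsetGenerator.with_openai()\n",
"# testset = generator.generate_with_llamaindex_docs(\n",
"#     documents,\n",
"#     test_size=10,\n",
"#     distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},\n",
"# )\n",
"# testset.to_pandas()"
]
},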
{
"cell_type": "markdown",
"id": "6184b6b5-7373-4665-9754-b4fc08929000",
"metadata": {},
"source": [
"#### Simple RAG pipeline\n",
"\n",
"Now with the above step we have two attributes needed for evaluation, that is `question` and `ground_truth` answers. We now need to feed these test questions to our RAG pipeline to collect the other two attributes, ie `contexts` and `answer`. Let's build a simple RAG using llama-index to do that. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d7bbceb1-5e05-422d-8690-f49fc71245d4",
"metadata": {},
"outputs": [],
"source": [
"import nest_asyncio\n",
"from llama_index.core.indices import VectorStoreIndex\n",
"from llama_index.core.readers import SimpleDirectoryReader\n",
"from llama_index.core.service_context import ServiceContext\n",
"from datasets import Dataset\n",
"\n",
"nest_asyncio.apply()\n",
"\n",
"\n",
"def build_query_engine(documents):\n",
" vector_index = VectorStoreIndex.from_documents(\n",
" documents, service_context=ServiceContext.from_defaults(chunk_size=512),\n",
" )\n",
"\n",
" query_engine = vector_index.as_query_engine(similarity_top_k=3)\n",
" return query_engine\n",
"\n",
"# Function to evaluate as Llama index does not support async evaluation for HFInference API\n",
"def generate_responses(query_engine, test_questions, test_answers):\n",
" responses = [query_engine.query(q) for q in test_questions]\n",
"\n",
" answers = []\n",
" contexts = []\n",
" for r in responses:\n",
" answers.append(r.response)\n",
" contexts.append([c.node.get_content() for c in r.source_nodes])\n",
" dataset_dict = {\n",
" \"question\": test_questions,\n",
" \"answer\": answers,\n",
" \"contexts\": contexts,\n",
" }\n",
" if test_answers is not None:\n",
" dataset_dict[\"ground_truth\"] = test_answers\n",
" ds = Dataset.from_dict(dataset_dict)\n",
" return ds"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d299259a-1064-44d4-9c96-5ae423d9e2f8",
"metadata": {},
"outputs": [],
"source": [
"reader = SimpleDirectoryReader(\"/Users/shahules/Myprojects/ragas/experiments/prompt-engineering-papers/\",num_files_limit=30, required_exts=[\".pdf\"])\n",
"documents = reader.load_data()\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "546f7b5c-36b7-4ffc-90d6-bea27df01aa5",
"metadata": {},
"outputs": [],
"source": [
"test_questions = eval_dataset['question'].values.tolist()\n",
"test_answers = eval_dataset['ground_truth'].values.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "52a67da6-2eeb-452c-a90c-5c1ea860545b",
"metadata": {},
"outputs": [],
"source": [
"query_engine1 = build_query_engine(documents)\n",
"result_ds = generate_responses(query_engine1, test_questions, test_answers)"
]
},
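{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before evaluating, a quick sanity check helps confirm that each sample now carries all four attributes (`question`, `answer`, `contexts`, `ground_truth`)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect a single sample to verify all four attributes were collected\n",
"result_ds[0]"
]
},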
{
"cell_type": "markdown",
"id": "6be10812-894e-43ba-857d-36627eb54dc8",
"metadata": {},
"source": [
"## Evaluation\n",
"For evaluation ragas provides several metrics which is aimed to quantify the end-end performance of the pipeline and also the component wise performance of the pipeline. For this tutorial let's consider few of them"
"For evaluation ragas provides several metrics which is aimed to quantify the end-end performance of the pipeline and also the component wise performance of the pipeline. For this tutorial let's consider few of them\n",
"\n",
"**Note**: *Refer to our [metrics](https://docs.ragas.io/en/stable/concepts/metrics/index.html) docs to read more about different metrics.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 29,
"id": "98b6cbba-fecb-4b92-8cd9-839d80025b22",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3c413241484641c6984b6b95af0367c9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/40 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from ragas.metrics import answer_correctness, faithfulness \n",
"from ragas import evaluate\n",
"\n",
"ragas_results = evaluate(result_ds, metrics=[answer_correctness, faithfulness ])"
]
},
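{
"cell_type": "markdown",
"metadata": {},
"source": [
"The returned result holds one aggregate score per metric, so you can display it directly for a quick summary before digging into per-sample scores."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aggregate scores, one value per metric\n",
"ragas_results"
]
},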
{
"cell_type": "markdown",
"id": "7fa8eaaa-b6b9-4ae9-a2df-f65f0559b565",
"metadata": {},
"source": [
"## Analysis"
"## Analysis\n",
"You can export the individual scores to dataframe and analyse it. You can also add [callbacks and tracing](https://docs.ragas.io/en/latest/howtos/applications/tracing.html) to ragas to do indepth analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 34,
"id": "2ff74280-02c4-4992-998d-4a9689e47b89",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>answer</th>\n",
" <th>contexts</th>\n",
" <th>ground_truth</th>\n",
" <th>answer_correctness</th>\n",
" <th>faithfulness</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>How does instruction tuning affect the zero-sh...</td>\n",
" <td>Instruction tuning enhances the zero-shot perf...</td>\n",
" <td>[34\\nthe effectiveness of different constructi...</td>\n",
" <td>For larger models on the order of 100B paramet...</td>\n",
" <td>0.781983</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What is the Zero-shot-CoT method and how does ...</td>\n",
" <td>Zero-shot-CoT is a method that involves append...</td>\n",
" <td>[Plan-and-Solve Prompting: Improving Zero-Shot...</td>\n",
" <td>Zero-shot-CoT is a zero-shot template-based pr...</td>\n",
" <td>0.667026</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>How does prompt tuning affect model performanc...</td>\n",
" <td>Prompt tuning can impact model performance in ...</td>\n",
" <td>[4 C. Liu et al.\\nto generate results directly...</td>\n",
" <td>Prompt tuning improves model performance in im...</td>\n",
" <td>0.396040</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>What is the purpose of instruction tuning in l...</td>\n",
" <td>The purpose of instruction tuning in language ...</td>\n",
" <td>[In practice,\\ninstruction tuning offers a gen...</td>\n",
" <td>The purpose of instruction tuning in language ...</td>\n",
" <td>0.694074</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>What distinguishes Zero-shot-CoT from Few-shot...</td>\n",
" <td>Zero-shot-CoT conditions the LM on a single pr...</td>\n",
" <td>[Wei et al. (2022b ) observe that the success ...</td>\n",
" <td>Zero-shot-CoT differs from Few-shot-CoT in tha...</td>\n",
" <td>0.530018</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question ... faithfulness\n",
"0 How does instruction tuning affect the zero-sh... ... 1.0\n",
"1 What is the Zero-shot-CoT method and how does ... ... 1.0\n",
"2 How does prompt tuning affect model performanc... ... 1.0\n",
"3 What is the purpose of instruction tuning in l... ... 1.0\n",
"4 What distinguishes Zero-shot-CoT from Few-shot... ... 1.0\n",
"\n",
"[5 rows x 6 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ragas_results.to_pandas().head(5)"
]
},
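{
"cell_type": "markdown",
"metadata": {},
"source": [
"One simple way to use the per-sample scores: sort by a metric and inspect the weakest samples to spot failure patterns, such as retrieval misses or answers that drift from the ground truth."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Surface the lowest-scoring samples on answer_correctness for inspection\n",
"scores_df = ragas_results.to_pandas()\n",
"scores_df.sort_values(\"answer_correctness\").head(3)"
]
},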
{
"cell_type": "markdown",
"id": "d7e7242c-a785-4ac3-94c8-2c1e795bb53a",
"metadata": {},
"source": [
"**If you liked this tutorial, checkout [ragas](https://github.com/explodinggradients/ragas) and consider leaving a star**"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c020fec7-8451-46ae-8b2d-192ae468428e",
"metadata": {},
"outputs": [],
"source": []
}
