langchain/docs/extras/modules/evaluation/string/custom.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4460f924-1738-4dc5-999f-c26383aba0a4",
   "metadata": {},
   "source": [
    "# Custom String Evaluator\n",
    "\n",
    "You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
    "\n",
    "In this example, you will create a perplexity evaluator using the HuggingFace [evaluate](https://huggingface.co/docs/evaluate/index) library.\n",
    "[Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a measure of how well the generated text would be predicted by the model used to compute the metric."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# %pip install evaluate > /dev/null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from typing import Any, Optional\n",
    "\n",
    "from langchain.evaluation import StringEvaluator\n",
    "from evaluate import load\n",
    "\n",
    "\n",
    "class PerplexityEvaluator(StringEvaluator):\n",
    "    \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
    "\n",
    "    def __init__(self, model_id: str = \"gpt2\"):\n",
    "        self.model_id = model_id\n",
    "        self.metric_fn = load(\n",
    "            \"perplexity\", module_type=\"metric\", model_id=self.model_id, pad_token=0\n",
    "        )\n",
    "\n",
    "    def _evaluate_strings(\n",
    "        self,\n",
    "        *,\n",
    "        prediction: str,\n",
    "        reference: Optional[str] = None,\n",
    "        input: Optional[str] = None,\n",
    "        **kwargs: Any,\n",
    "    ) -> dict:\n",
    "        results = self.metric_fn.compute(\n",
    "            predictions=[prediction], model_id=self.model_id\n",
    "        )\n",
    "        ppl = results[\"perplexities\"][0]\n",
    "        return {\"score\": ppl}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "52767568-8075-4f77-93c9-80e1a7e5cba3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "evaluator = PerplexityEvaluator()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using pad_token, but it is not set yet.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "467109d44654486e8b415288a319fc2c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'score': 190.3675537109375}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on the plain.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5089d9d1-eae6-4d47-b4f6-479e5d887d74",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using pad_token, but it is not set yet.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d3266f6f06d746e1bb03ce4aca07d9b9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'score': 1982.0709228515625}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The perplexity is much higher since LangChain was introduced after 'gpt-2' was released and because it is never used in the following context.\n",
    "evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on LangChain.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eaa178f-6ba3-47ae-b3dc-1b196af6d213",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Evals docs (#7460) Still don't have good "how to's", and the guides / examples section could be further pruned and improved, but this PR adds a couple examples for each of the common evaluator interfaces. - [x] Example docs for each implemented evaluator - [x] "how to make a custom evaluator" notebook for each low-level API (comparison, string, agent) - [x] Move docs to modules area - [x] Link to reference docs for more information - [X] Still need to finish the evaluation index page - ~[ ] Don't have good data generation section~ - ~[ ] Don't have good how to section for other common scenarios / FAQs like regression testing, testing over similar inputs to measure sensitivity, etc.~ 2023-07-18 08:00:01 +00:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"id": "4460f924-1738-4dc5-999f-c26383aba0a4",`
			`"metadata": {},`
			`"source": [`
			`"# Custom String Evaluator\n",`
			`"\n",`
			"You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
			`"\n",`
			`"In this example, you will create a perplexity evaluator using the HuggingFace [evaluate](https://huggingface.co/docs/evaluate/index) library.\n",`
			`"[Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a measure of how well the generated text would be predicted by the model used to compute the metric."`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",`
			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [],`
			`"source": [`
			`"# %pip install evaluate > /dev/null"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",`
			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [],`
			`"source": [`
			`"from typing import Any, Optional\n",`
			`"\n",`
			`"from langchain.evaluation import StringEvaluator\n",`
			`"from evaluate import load\n",`
			`"\n",`
			`"\n",`
			`"class PerplexityEvaluator(StringEvaluator):\n",`
			`"    \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",`
			`"\n",`
			`"    def __init__(self, model_id: str = \"gpt2\"):\n",`
			`"        self.model_id = model_id\n",`
			`"        self.metric_fn = load(\n",`
			`"            \"perplexity\", module_type=\"metric\", model_id=self.model_id, pad_token=0\n",`
			`"        )\n",`
			`"\n",`
			`"    def _evaluate_strings(\n",`
			`"        self,\n",`
			`"        *,\n",`
			`"        prediction: str,\n",`
			`"        reference: Optional[str] = None,\n",`
			`"        input: Optional[str] = None,\n",`
			`"        **kwargs: Any,\n",`
			`"    ) -> dict:\n",`
			`"        results = self.metric_fn.compute(\n",`
			`"            predictions=[prediction], model_id=self.model_id\n",`
			`"        )\n",`
			`"        ppl = results[\"perplexities\"][0]\n",`
			`"        return {\"score\": ppl}"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"id": "52767568-8075-4f77-93c9-80e1a7e5cba3",`
			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [],`
			`"source": [`
			`"evaluator = PerplexityEvaluator()"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
			`"id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",`
			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"Using pad_token, but it is not set yet.\n"`
			`]`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",`
			`"To disable this warning, you can either:\n",`
			"\t- Avoid using `tokenizers` before the fork if possible\n",
			`"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "467109d44654486e8b415288a319fc2c",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`"  0%|          | 0/1 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"text/plain": [`
			`"{'score': 190.3675537109375}"`
			`]`
			`},`
			`"execution_count": 4,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on the plain.\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"id": "5089d9d1-eae6-4d47-b4f6-479e5d887d74",`
			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"Using pad_token, but it is not set yet.\n"`
			`]`
			`},`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "d3266f6f06d746e1bb03ce4aca07d9b9",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`"  0%|          | 0/1 [00:00<?, ?it/s]"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`},`
			`{`
			`"data": {`
			`"text/plain": [`
			`"{'score': 1982.0709228515625}"`
			`]`
			`},`
			`"execution_count": 6,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"# The perplexity is much higher since LangChain was introduced after 'gpt-2' was released and because it is never used in the following context.\n",`
			`"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on LangChain.\")"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "5eaa178f-6ba3-47ae-b3dc-1b196af6d213",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.11.2"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`