Update LangSmith Walkthrough (#10564)

pull/10431/head^2
William FH 1 year ago committed by GitHub
parent cbb4860fcd
commit 596f294b01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -48,7 +48,7 @@
"First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n",
"You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable (if this isn't set, runs will be logged to the `default` project). This will automatically create the project for you if it doesn't exist. You must also set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables.\n",
"\n",
"For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/)\n",
"For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/).\n",
"\n",
"**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n",
"\n",
@ -65,6 +65,17 @@
"However, in this example, we will use environment variables."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e4780363-f05a-4649-8b1a-9b449f960ce4",
"metadata": {},
"outputs": [],
"source": [
"# %pip install -U langchain langsmith --quiet\n",
"# %pip install google-search-results pandas --quiet"
]
},
{
"cell_type": "code",
"execution_count": 1,
@ -81,7 +92,7 @@
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
"os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Update to your API key\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Update to your API key\n",
"\n",
"# Used by the agent in this tutorial\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
@ -156,8 +167,6 @@
},
"outputs": [],
"source": [
"import asyncio\n",
"\n",
"inputs = [\n",
" \"How many people live in canada as of 2023?\",\n",
" \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
@ -170,20 +179,8 @@
" \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n",
" \"what is 1213 divided by 4345?\",\n",
"]\n",
"results = []\n",
"\n",
"\n",
"async def arun(agent, input_example):\n",
" try:\n",
" return await agent.arun(input_example)\n",
" except Exception as e:\n",
" # The agent sometimes makes mistakes! These will be captured by the tracing.\n",
" return e\n",
"\n",
"\n",
"for input_example in inputs:\n",
" results.append(arun(agent, input_example))\n",
"results = await asyncio.gather(*results)"
"results = agent.batch(inputs, return_exceptions=True)"
]
},
{
@ -389,53 +386,30 @@
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"View the evaluation results for project '2023-07-17-11-25-20-AgentExecutor' at:\n",
"https://dev.smith.langchain.com/projects/p/1c9baec3-ae86-4fac-9e99-e1b9f8e7818c?eval=true\n",
"Processed examples: 1\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example 5a2ac8da-8c2b-4d12-acb9-5c4b0f47fe8a. Error: LLMMathChain._evaluate(\"\n",
"Chain failed for example f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 with inputs {'input': \"what is dua lipa's boyfriend age raised to the .43 power?\"}\n",
"Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n",
"age_of_Dua_Lipa_boyfriend ** 0.43\n",
"\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 4\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example 91439261-1c86-4198-868b-a6c1cc8a051b. Error: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 68}]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 9\r"
"\") raised error: 'age_of_Dua_Lipa_boyfriend'. Please try again with a valid numerical expression\n",
"Chain failed for example 78c959a4-467d-4469-8bd7-c5f0b059bc4a with inputs {'input': \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\"}\n",
"Error Type: ValueError, Message: LLMMathChain._evaluate(\"\n",
"age ** 0.43\n",
"\") raised error: 'age'. Please try again with a valid numerical expression\n",
"Chain failed for example 6de48a56-3f30-4aac-b6cf-eee4b05ad43f with inputs {'input': \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\"}\n",
"Error Type: ToolException, Message: Too many arguments to single-input tool Calculator. Args: ['height ^ 0.13', {'height': 72}]\n"
]
}
],
"source": [
"from langchain.smith import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
" run_on_dataset, \n",
")\n",
"\n",
"chain_results = await arun_on_dataset(\n",
"chain_results = run_on_dataset(\n",
" client=client,\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=agent_factory,\n",
@ -448,6 +422,218 @@
"# These are logged as warnings here and captured as errors in the tracing UI."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9da60638-5be8-4b5f-a721-2c6627aeaf0c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>input</th>\n",
" <th>output</th>\n",
" <th>reference</th>\n",
" <th>embedding_cosine_distance</th>\n",
" <th>correctness</th>\n",
" <th>helpfulness</th>\n",
" <th>fifth-grader-score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>78c959a4-467d-4469-8bd7-c5f0b059bc4a</th>\n",
" <td>{'input': 'who is dua lipa's boyfriend? what i...</td>\n",
" <td>{'Error': 'ValueError('LLMMathChain._evaluate(...</td>\n",
" <td>{'output': 'Romain Gavras' age raised to the 0...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>f8dfff24-d288-4d8e-ba94-c3cc33dd10d0</th>\n",
" <td>{'input': 'what is dua lipa's boyfriend age ra...</td>\n",
" <td>{'Error': 'ValueError('LLMMathChain._evaluate(...</td>\n",
" <td>{'output': 'Approximately 4.9888126515157.'}</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c78d5e84-3fbd-442f-affb-4b0e5806c439</th>\n",
" <td>{'input': 'how far is it from paris to boston ...</td>\n",
" <td>{'input': 'how far is it from paris to boston ...</td>\n",
" <td>{'output': 'The distance from Paris to Boston ...</td>\n",
" <td>0.007577</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>02cadef9-5794-49a9-8e43-acca977cab60</th>\n",
" <td>{'input': 'How many people live in canada as o...</td>\n",
" <td>{'input': 'How many people live in canada as o...</td>\n",
" <td>{'output': 'The current population of Canada a...</td>\n",
" <td>0.016324</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>e888a340-0486-4552-bb4b-911756e6bed7</th>\n",
" <td>{'input': 'what was the total number of points...</td>\n",
" <td>{'input': 'what was the total number of points...</td>\n",
" <td>{'output': '3'}</td>\n",
" <td>0.225076</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1b1f655b-754c-474d-8832-e6ec6bad3943</th>\n",
" <td>{'input': 'what was the total number of points...</td>\n",
" <td>{'input': 'what was the total number of points...</td>\n",
" <td>{'output': 'The total number of points scored ...</td>\n",
" <td>0.011580</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51f1b1f1-3b51-400f-b871-65f8a3a3c2d4</th>\n",
" <td>{'input': 'how many more points were scored in...</td>\n",
" <td>{'input': 'how many more points were scored in...</td>\n",
" <td>{'output': '15'}</td>\n",
" <td>0.251002</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83339364-0135-4efd-a24a-f3bd2a85e33a</th>\n",
" <td>{'input': 'what is 153 raised to .1312 power?'}</td>\n",
" <td>{'input': 'what is 153 raised to .1312 power?'...</td>\n",
" <td>{'output': '1.9347796717823205'}</td>\n",
" <td>0.127441</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6de48a56-3f30-4aac-b6cf-eee4b05ad43f</th>\n",
" <td>{'input': 'who is kendall jenner's boyfriend? ...</td>\n",
" <td>{'Error': 'ToolException(\"Too many arguments t...</td>\n",
" <td>{'output': 'Bad Bunny's height raised to the p...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0c41cc28-9c07-4550-8940-68b58cbc045e</th>\n",
" <td>{'input': 'what is 1213 divided by 4345?'}</td>\n",
" <td>{'input': 'what is 1213 divided by 4345?', 'ou...</td>\n",
" <td>{'output': '0.2791714614499425'}</td>\n",
" <td>0.144522</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" input \\\n",
"78c959a4-467d-4469-8bd7-c5f0b059bc4a {'input': 'who is dua lipa's boyfriend? what i... \n",
"f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 {'input': 'what is dua lipa's boyfriend age ra... \n",
"c78d5e84-3fbd-442f-affb-4b0e5806c439 {'input': 'how far is it from paris to boston ... \n",
"02cadef9-5794-49a9-8e43-acca977cab60 {'input': 'How many people live in canada as o... \n",
"e888a340-0486-4552-bb4b-911756e6bed7 {'input': 'what was the total number of points... \n",
"1b1f655b-754c-474d-8832-e6ec6bad3943 {'input': 'what was the total number of points... \n",
"51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 {'input': 'how many more points were scored in... \n",
"83339364-0135-4efd-a24a-f3bd2a85e33a {'input': 'what is 153 raised to .1312 power?'} \n",
"6de48a56-3f30-4aac-b6cf-eee4b05ad43f {'input': 'who is kendall jenner's boyfriend? ... \n",
"0c41cc28-9c07-4550-8940-68b58cbc045e {'input': 'what is 1213 divided by 4345?'} \n",
"\n",
" output \\\n",
"78c959a4-467d-4469-8bd7-c5f0b059bc4a {'Error': 'ValueError('LLMMathChain._evaluate(... \n",
"f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 {'Error': 'ValueError('LLMMathChain._evaluate(... \n",
"c78d5e84-3fbd-442f-affb-4b0e5806c439 {'input': 'how far is it from paris to boston ... \n",
"02cadef9-5794-49a9-8e43-acca977cab60 {'input': 'How many people live in canada as o... \n",
"e888a340-0486-4552-bb4b-911756e6bed7 {'input': 'what was the total number of points... \n",
"1b1f655b-754c-474d-8832-e6ec6bad3943 {'input': 'what was the total number of points... \n",
"51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 {'input': 'how many more points were scored in... \n",
"83339364-0135-4efd-a24a-f3bd2a85e33a {'input': 'what is 153 raised to .1312 power?'... \n",
"6de48a56-3f30-4aac-b6cf-eee4b05ad43f {'Error': 'ToolException(\"Too many arguments t... \n",
"0c41cc28-9c07-4550-8940-68b58cbc045e {'input': 'what is 1213 divided by 4345?', 'ou... \n",
"\n",
" reference \\\n",
"78c959a4-467d-4469-8bd7-c5f0b059bc4a {'output': 'Romain Gavras' age raised to the 0... \n",
"f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 {'output': 'Approximately 4.9888126515157.'} \n",
"c78d5e84-3fbd-442f-affb-4b0e5806c439 {'output': 'The distance from Paris to Boston ... \n",
"02cadef9-5794-49a9-8e43-acca977cab60 {'output': 'The current population of Canada a... \n",
"e888a340-0486-4552-bb4b-911756e6bed7 {'output': '3'} \n",
"1b1f655b-754c-474d-8832-e6ec6bad3943 {'output': 'The total number of points scored ... \n",
"51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 {'output': '15'} \n",
"83339364-0135-4efd-a24a-f3bd2a85e33a {'output': '1.9347796717823205'} \n",
"6de48a56-3f30-4aac-b6cf-eee4b05ad43f {'output': 'Bad Bunny's height raised to the p... \n",
"0c41cc28-9c07-4550-8940-68b58cbc045e {'output': '0.2791714614499425'} \n",
"\n",
" embedding_cosine_distance correctness \\\n",
"78c959a4-467d-4469-8bd7-c5f0b059bc4a NaN NaN \n",
"f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 NaN NaN \n",
"c78d5e84-3fbd-442f-affb-4b0e5806c439 0.007577 1.0 \n",
"02cadef9-5794-49a9-8e43-acca977cab60 0.016324 1.0 \n",
"e888a340-0486-4552-bb4b-911756e6bed7 0.225076 0.0 \n",
"1b1f655b-754c-474d-8832-e6ec6bad3943 0.011580 0.0 \n",
"51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 0.251002 1.0 \n",
"83339364-0135-4efd-a24a-f3bd2a85e33a 0.127441 1.0 \n",
"6de48a56-3f30-4aac-b6cf-eee4b05ad43f NaN NaN \n",
"0c41cc28-9c07-4550-8940-68b58cbc045e 0.144522 1.0 \n",
"\n",
" helpfulness fifth-grader-score \n",
"78c959a4-467d-4469-8bd7-c5f0b059bc4a NaN NaN \n",
"f8dfff24-d288-4d8e-ba94-c3cc33dd10d0 NaN NaN \n",
"c78d5e84-3fbd-442f-affb-4b0e5806c439 1.0 1.0 \n",
"02cadef9-5794-49a9-8e43-acca977cab60 1.0 1.0 \n",
"e888a340-0486-4552-bb4b-911756e6bed7 0.0 0.0 \n",
"1b1f655b-754c-474d-8832-e6ec6bad3943 0.0 0.0 \n",
"51f1b1f1-3b51-400f-b871-65f8a3a3c2d4 1.0 1.0 \n",
"83339364-0135-4efd-a24a-f3bd2a85e33a 1.0 1.0 \n",
"6de48a56-3f30-4aac-b6cf-eee4b05ad43f NaN NaN \n",
"0c41cc28-9c07-4550-8940-68b58cbc045e 1.0 1.0 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain_results.to_dataframe()"
]
},
{
"cell_type": "markdown",
"id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
@ -474,7 +660,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 18,
"id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820",
"metadata": {
"tags": []
@ -483,22 +669,22 @@
{
"data": {
"text/plain": [
"Run(id=UUID('e39f310b-c5a8-4192-8a59-6a9498e1cb85'), name='AgentExecutor', start_time=datetime.datetime(2023, 7, 17, 18, 25, 30, 653872), run_type=<RunTypeEnum.chain: 'chain'>, end_time=datetime.datetime(2023, 7, 17, 18, 25, 35, 359642), extra={'runtime': {'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.8', 'library_version': '0.0.231', 'runtime_version': '3.11.2'}, 'total_tokens': 512, 'prompt_tokens': 451, 'completion_tokens': 61}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-07-17T18:25:30.653872'}, {'name': 'end', 'time': '2023-07-17T18:25:35.359642'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('a75cf754-4f73-46fd-b126-9bcd0695e463'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('1c9baec3-ae86-4fac-9e99-e1b9f8e7818c'), child_run_ids=[UUID('40d0fdca-0b2b-47f4-a9da-f2b229aa4ed5'), UUID('cfa5130f-264c-4126-8950-ec1c4c31b800'), UUID('ba638a2f-2a57-45db-91e8-9a7a66a42c5a'), UUID('fcc29b5a-cdb7-4bcc-8194-47729bbdf5fb'), UUID('a6f92bf5-cfba-4747-9336-370cb00c928a'), UUID('65312576-5a39-4250-b820-4dfae7d73945')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1}, 'fifth-grader-score': {'n': 1, 'avg': 1.0, 'mode': 1}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361}})"
"Run(id=UUID('a6893e95-a9cc-43e0-b9fa-f471b0cfee83'), name='AgentExecutor', start_time=datetime.datetime(2023, 9, 13, 22, 34, 32, 177406), run_type='chain', end_time=datetime.datetime(2023, 9, 13, 22, 34, 37, 77740), extra={'runtime': {'cpu': {'time': {'sys': 3.153218304, 'user': 5.045262336}, 'percent': 0.0, 'ctx_switches': {'voluntary': 42164.0, 'involuntary': 0.0}}, 'mem': {'rss': 184205312.0}, 'library': 'langchain', 'runtime': 'python', 'platform': 'macOS-13.4.1-arm64-arm-64bit', 'sdk_version': '0.0.26', 'thread_count': 58.0, 'library_version': '0.0.286', 'runtime_version': '3.11.2', 'langchain_version': '0.0.286', 'py_implementation': 'CPython'}}, error=None, serialized=None, events=[{'name': 'start', 'time': '2023-09-13T22:34:32.177406'}, {'name': 'end', 'time': '2023-09-13T22:34:37.077740'}], inputs={'input': 'what is 1213 divided by 4345?'}, outputs={'output': '1213 divided by 4345 is approximately 0.2792.'}, reference_example_id=UUID('0c41cc28-9c07-4550-8940-68b58cbc045e'), parent_run_id=None, tags=['openai-functions', 'testing-notebook'], execution_order=1, session_id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), child_run_ids=[UUID('37faef05-b6b3-4cb7-a6db-471425e69b46'), UUID('2d6a895f-de2c-4f7f-b5f1-ca876d38e530'), UUID('e7d145e3-74b0-4f32-9240-3e370becdf8f'), UUID('10db62c9-fe4f-4aba-959a-ad02cfadfa20'), UUID('8dc46a27-8ab9-4f33-9ec1-660ca73ebb4f'), UUID('eccd042e-dde0-4425-b62f-e855e25d6b64')], child_runs=None, feedback_stats={'correctness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'helpfulness': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'fifth-grader-score': {'n': 1, 'avg': 1.0, 'mode': 1, 'is_all_model': True}, 'embedding_cosine_distance': {'n': 1, 'avg': 0.144522385071361, 'mode': 0.144522385071361, 'is_all_model': True}}, app_path='/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/7865a050-467e-4c58-9322-58a26f182ecb/r/a6893e95-a9cc-43e0-b9fa-f471b0cfee83', manifest_id=None, status='success', prompt_tokens=None, completion_tokens=None, total_tokens=None, first_token_time=None, parent_run_ids=None)"
]
},
"execution_count": 10,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"runs = list(client.list_runs(dataset_name=dataset_name))\n",
"runs = list(client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1))\n",
"runs[0]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 22,
"id": "6595c888-1f5c-4ae3-9390-0a559f5575d1",
"metadata": {
"tags": []
@ -507,21 +693,17 @@
{
"data": {
"text/plain": [
"{'correctness': {'n': 7, 'avg': 0.5714285714285714, 'mode': 1},\n",
" 'helpfulness': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
" 'fifth-grader-score': {'n': 7, 'avg': 0.7142857142857143, 'mode': 1},\n",
" 'embedding_cosine_distance': {'n': 7,\n",
" 'avg': 0.11462010799473926,\n",
" 'mode': 0.0130477459560272}}"
"TracerSessionResult(id=UUID('7865a050-467e-4c58-9322-58a26f182ecb'), start_time=datetime.datetime(2023, 9, 13, 22, 34, 10, 611846), name='test-dependable-stop-67', extra=None, tenant_id=UUID('ebbaf2eb-769b-4505-aca2-d11de10372a4'), run_count=None, latency_p50=None, latency_p99=None, total_tokens=None, prompt_tokens=None, completion_tokens=None, last_run_start_time=None, feedback_stats=None, reference_dataset_ids=None, run_facets=None)"
]
},
"execution_count": 11,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.read_project(project_id=runs[0].session_id).feedback_stats"
"# After some time, these will be populated.\n",
"client.read_project(project_name=chain_results[\"project_name\"]).feedback_stats"
]
},
{

@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Union
from uuid import UUID
import langsmith
from langsmith import schemas as langsmith_schemas
from langsmith.evaluation.evaluator import EvaluationResult
from langchain.callbacks import manager
from langchain.callbacks.tracers import langchain as langchain_tracer
@ -76,7 +76,7 @@ class EvaluatorCallbackHandler(BaseTracer):
self.futures: Set[Future] = set()
self.skip_unfinished = skip_unfinished
self.project_name = project_name
self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {}
self.logged_eval_results: Dict[str, List[EvaluationResult]] = {}
def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None:
"""Evaluate the run in the project.
@ -91,11 +91,11 @@ class EvaluatorCallbackHandler(BaseTracer):
"""
try:
if self.project_name is None:
feedback = self.client.evaluate_run(run, evaluator)
eval_result = self.client.evaluate_run(run, evaluator)
with manager.tracing_v2_enabled(
project_name=self.project_name, tags=["eval"], client=self.client
):
feedback = self.client.evaluate_run(run, evaluator)
eval_result = self.client.evaluate_run(run, evaluator)
except Exception as e:
logger.error(
f"Error evaluating run {run.id} with "
@ -104,7 +104,7 @@ class EvaluatorCallbackHandler(BaseTracer):
)
raise e
example_id = str(run.reference_example_id)
self.logged_feedback.setdefault(example_id, []).append(feedback)
self.logged_eval_results.setdefault(example_id, []).append(eval_result)
def _persist_run(self, run: Run) -> None:
"""Run the evaluator on the run.

@ -866,7 +866,8 @@ def _prepare_eval_run(
f"Project {project_name} already exists. Please use a different name."
)
print(
f"View the evaluation results for project '{project_name}' at:\n{project.url}"
f"View the evaluation results for project '{project_name}' at:\n{project.url}",
flush=True,
)
dataset = client.read_dataset(dataset_name=dataset_name)
examples = list(client.list_examples(dataset_id=dataset.id))
@ -927,14 +928,14 @@ def _collect_test_results(
project_name: str,
) -> TestResult:
wait_for_all_tracers()
all_feedback = {}
all_eval_results = {}
for c in configs:
for callback in cast(list, c["callbacks"]):
if isinstance(callback, EvaluatorCallbackHandler):
all_feedback.update(callback.logged_feedback)
all_eval_results.update(callback.logged_eval_results)
results = {}
for example, output in zip(examples, batch_results):
feedback = all_feedback.get(str(example.id), [])
feedback = all_eval_results.get(str(example.id), [])
results[str(example.id)] = {
"output": output,
"input": example.inputs,

Loading…
Cancel
Save