mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
55 lines
1.8 KiB
Python
55 lines
1.8 KiB
Python
|
"""Test run evaluator implementations basic functionality."""
|
||
|
|
||
|
from uuid import UUID
|
||
|
|
||
|
import pytest
|
||
|
from langchainplus_sdk.schemas import Example, Run
|
||
|
|
||
|
from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator
|
||
|
from tests.unit_tests.llms.fake_llm import FakeLLM
|
||
|
|
||
|
|
||
|
@pytest.fixture
def run() -> Run:
    """Build a sample chain ``Run`` with fixed id, inputs, outputs and times."""
    question = "What is the answer to life, the universe, and everything?"
    # The start and end timestamps are deliberately the same instant.
    timestamp = "2021-07-20T15:00:00.000000+00:00"
    run_id = UUID("f77cd087-48f7-4c62-9e0e-297842202107")
    return Run(
        id=run_id,
        name="My Run",
        inputs={"input": question},
        outputs={"output": "The answer is 42."},
        start_time=timestamp,
        end_time=timestamp,
        run_type="chain",
        execution_order=1,
    )
|
||
|
|
||
|
|
||
|
@pytest.fixture
def example() -> Example:
    """Build a sample dataset ``Example`` with fixed ids, inputs and outputs."""
    example_fields = {
        "id": UUID("f77cd087-48f7-4c62-9e0e-297842202106"),
        "dataset_id": UUID("f77cd087-48f7-4c62-9e0e-297842202105"),
        "inputs": {
            "input": "What is the answer to life, the universe, and everything?"
        },
        "outputs": {"output": "The answer is 42."},
        "created_at": "2021-07-20T15:00:00.000000+00:00",
    }
    return Example(**example_fields)
|
||
|
|
||
|
|
||
|
def test_get_qa_evaluator(run: Run, example: Example) -> None:
    """Check that the QA evaluator yields value "CORRECT" and score 1."""
    # Canned responses handed to the fake LLM in place of a real model.
    canned_responses = {"a": "This checks out.\nCORRECT"}
    fake_llm = FakeLLM(queries=canned_responses, sequential_responses=True)

    evaluator = get_qa_evaluator(fake_llm)
    feedback = evaluator.evaluate_run(run, example)

    assert feedback.value == "CORRECT"
    assert feedback.score == 1
|
||
|
|
||
|
|
||
|
def test_get_criteria_evaluator(run: Run, example: Example) -> None:
    """Check that the "conciseness" criteria evaluator yields "Y" and score 1."""
    # Canned responses handed to the fake LLM in place of a real model.
    canned_responses = {"a": "This checks out.\nY"}
    fake_llm = FakeLLM(queries=canned_responses, sequential_responses=True)

    evaluator = get_criteria_evaluator(fake_llm, criteria="conciseness")
    feedback = evaluator.evaluate_run(run, example)

    assert feedback.value == "Y"
    assert feedback.score == 1
|