Mirror of https://github.com/hwchase17/langchain, synced 2024-10-29 17:07:25 +00:00.

Commit c460b04c64:
- Add protocol for `evaluate_strings`
- Move the criteria evaluator out so it's not restricted to being applied on traced runs
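The `evaluate_strings` protocol referenced in the commit message is, roughly, a single-method interface for grading one string output, optionally against a reference answer and the original input. A minimal sketch, assuming `StringEvaluator` as the protocol name and `prediction`/`reference`/`input` as the keyword arguments (none of which are confirmed by this page):

from typing import Optional, Protocol


class StringEvaluator(Protocol):
    # Grade a single prediction string; reference and input are optional.
    def evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
    ) -> dict:
        # Expected to return something like {"score": 1, "value": "CORRECT"}.
        ...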
55 lines · 1.8 KiB · Python
"""Test run evaluator implementations basic functionality."""
from uuid import UUID

import pytest
from langchainplus_sdk.schemas import Example, Run

from langchain.evaluation.run_evaluators import (
    get_criteria_evaluator,
    get_qa_evaluator,
)
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.fixture
def run() -> Run:
    return Run(
        id=UUID("f77cd087-48f7-4c62-9e0e-297842202107"),
        name="My Run",
        inputs={"input": "What is the answer to life, the universe, and everything?"},
        outputs={"output": "The answer is 42."},
        start_time="2021-07-20T15:00:00.000000+00:00",
        end_time="2021-07-20T15:00:00.000000+00:00",
        run_type="chain",
        execution_order=1,
    )


@pytest.fixture
def example() -> Example:
    return Example(
        id=UUID("f77cd087-48f7-4c62-9e0e-297842202106"),
        dataset_id=UUID("f77cd087-48f7-4c62-9e0e-297842202105"),
        inputs={"input": "What is the answer to life, the universe, and everything?"},
        outputs={"output": "The answer is 42."},
        created_at="2021-07-20T15:00:00.000000+00:00",
    )


def test_get_qa_evaluator(run: Run, example: Example) -> None:
    """Test get_qa_evaluator."""
    # FakeLLM with sequential_responses=True returns the query values in order;
    # the trailing "CORRECT" line is what the QA eval chain parses into the verdict.
    eval_llm = FakeLLM(
        queries={"a": "This checks out.\nCORRECT"}, sequential_responses=True
    )
    qa_evaluator = get_qa_evaluator(eval_llm)
    res = qa_evaluator.evaluate_run(run, example)
    assert res.value == "CORRECT"
    assert res.score == 1


def test_get_criteria_evaluator(run: Run, example: Example) -> None:
    """Test get_criteria_evaluator."""
    # The criteria eval chain parses the trailing "Y"/"N" line into a 1/0 score.
    eval_llm = FakeLLM(queries={"a": "This checks out.\nY"}, sequential_responses=True)
    criteria_evaluator = get_criteria_evaluator(eval_llm, criteria="conciseness")
    res = criteria_evaluator.evaluate_run(run, example)
    assert res.value == "Y"
    assert res.score == 1
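For context, a sketch of how the two factories under test would be wired up outside of a unit test, reusing the `run` and `example` objects built by the fixtures above; `ChatOpenAI` is an assumed stand-in for whatever real eval LLM is used in place of FakeLLM:

# Sketch only: ChatOpenAI is an assumed stand-in for a real eval LLM.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators import (
    get_criteria_evaluator,
    get_qa_evaluator,
)

eval_llm = ChatOpenAI(temperature=0)

# Grade correctness against the example's reference output.
qa_feedback = get_qa_evaluator(eval_llm).evaluate_run(run, example)

# Grade a reference-free criterion, as in the conciseness test above.
criteria_feedback = get_criteria_evaluator(
    eval_llm, criteria="conciseness"
).evaluate_run(run, example)

print(qa_feedback.value, qa_feedback.score)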