Simplify eval arg names (#6944)

It'll be easier to switch between these evaluators if the prediction argument names are consistent.
William FH (committed via GitHub)
parent 8f5eca236f
commit 8c73037dff
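
A minimal sketch of the pairwise call after this rename, mirroring the docstring example updated below. The import paths, the ChatOpenAI setup, and the API key are assumptions for illustration and are not part of this commit.

    # Sketch only: predictions are now passed as `prediction` / `prediction_b`
    # (previously `output_a` / `output_b`); assumes an OpenAI API key is configured.
    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.comparison import PairwiseStringEvalChain

    llm = ChatOpenAI(temperature=0)
    chain = PairwiseStringEvalChain.from_llm(llm=llm)
    result = chain.evaluate_string_pairs(
        input="What is the chemical formula for water?",
        prediction="H2O",
        prediction_b="The chemical formula for water is H2O.",
    )
    print(result["value"], result["score"])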

@@ -243,8 +243,8 @@
 " pred_a, pred_b = res_b, res_a\n",
 " a, b = \"b\", \"a\"\n",
 " eval_res = eval_chain.evaluate_string_pairs(\n",
-" output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
-" output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
+" prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
+" prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
 " input=input_\n",
 " )\n",
 " if eval_res[\"value\"] == \"A\":\n",

@@ -105,7 +105,7 @@ class TrajectoryEvalChain(Chain):
         result = eval_chain.evaluate_agent_trajectory(
             input=question,
             agent_trajectory=response["intermediate_steps"],
-            output=response["output"],
+            prediction=response["output"],
             reference="Paris",
         )
         print(result["score"])
@@ -325,9 +325,9 @@ The following is the expected answer. Use this to measure correctness:
     def evaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -338,7 +338,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.
         Returns:
@@ -347,7 +347,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return self(inputs=inputs, callbacks=callbacks, **kwargs)
@@ -355,9 +355,9 @@ The following is the expected answer. Use this to measure correctness:
     async def aevaluate_agent_trajectory(
         self,
         *,
+        prediction: str,
         input: str,
         agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
-        output: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -368,7 +368,7 @@ The following is the expected answer. Use this to measure correctness:
             input (str): The input question.
             agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
                 The intermediate steps forming the agent trajectory.
-            output (str): The expected output.
+            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.
         Returns:
@@ -377,7 +377,7 @@ The following is the expected answer. Use this to measure correctness:
         inputs = {
             "question": input,
             "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
-            "answer": output,
+            "answer": prediction,
             "reference": self._format_reference(reference),
         }
         return await self.acall(
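
The trajectory evaluator above now takes the agent's final answer as `prediction` as well. A hedged sketch, assuming the import path below and a hypothetical AgentExecutor `agent` built with return_intermediate_steps=True:

    # Sketch only: the final answer is passed as `prediction` (previously `output`).
    # `agent` and `question` are hypothetical and not defined in this commit.
    from langchain.chat_models import ChatOpenAI
    from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain

    eval_chain = TrajectoryEvalChain.from_llm(llm=ChatOpenAI(temperature=0))
    question = "What is the capital of France?"
    response = agent(question)  # {"output": ..., "intermediate_steps": [...]}
    result = eval_chain.evaluate_agent_trajectory(
        input=question,
        agent_trajectory=response["intermediate_steps"],
        prediction=response["output"],
        reference="Paris",
    )
    print(result["score"])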

@@ -12,8 +12,8 @@ Example:
     >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
     >>> result = chain.evaluate_string_pairs(
     ...     input = "What is the chemical formula for water?",
-    ...     output_a = "H2O",
-    ...     output_b = (
+    ...     prediction = "H2O",
+    ...     prediction_b = (
     ...         "The chemical formula for water is H2O, which means"
     ...         " there are two hydrogen atoms and one oxygen atom."
     ...     referenc = "The chemical formula for water is H2O.",

@@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain):
         >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
         >>> result = chain.evaluate_string_pairs(
         ...     input = "What is the chemical formula for water?",
-        ...     output_a = "H2O",
-        ...     output_b = (
+        ...     prediction = "H2O",
+        ...     prediction_b = (
         ...         "The chemical formula for water is H2O, which means"
         ...         " there are two hydrogen atoms and one oxygen atom."
         ...     referenc = "The chemical formula for water is H2O.",
@@ -101,7 +101,7 @@ class PairwiseStringEvalChain(LLMChain):
         Returns:
             PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
         """
-        expected_input_vars = {"output_a", "output_b", "input"}
+        expected_input_vars = {"prediction", "prediction_b", "input"}
         if prompt is None:
             if require_reference:
                 expected_input_vars.add("reference")
@@ -121,11 +121,11 @@ class PairwiseStringEvalChain(LLMChain):
         return cls(llm=llm, prompt=prompt_, **kwargs)
     def _prepare_input(
-        self, output_a: str, output_b: str, input: str, reference: Optional[str]
+        self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
     ) -> dict:
         input_ = {
-            "output_a": output_a,
-            "output_b": output_b,
+            "prediction": prediction,
+            "prediction_b": prediction_b,
             "input": input,
         }
         if reference is not None and "reference" in self.prompt.input_variables:
@@ -135,8 +135,8 @@ class PairwiseStringEvalChain(LLMChain):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -145,8 +145,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Evaluate whether output A is preferred to output B.
         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -160,7 +160,7 @@ class PairwiseStringEvalChain(LLMChain):
                 - score: The preference score, which is 1 for 'A', 0 for 'B',
                   and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = self(
             inputs=input_,
             callbacks=callbacks,
@@ -171,8 +171,8 @@ class PairwiseStringEvalChain(LLMChain):
     async def aevaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         input: str,
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
@@ -181,8 +181,8 @@ class PairwiseStringEvalChain(LLMChain):
         """Asynchronously evaluate whether output A is preferred to output B.
         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             input (str): The input or task string.
             callbacks (Callbacks, optional): The callbacks to use.
             reference (str, optional): The reference string, if any.
@@ -196,7 +196,7 @@ class PairwiseStringEvalChain(LLMChain):
                 - score: The preference score, which is 1 for 'A', 0 for 'B',
                   and 0.5 for None.
         """
-        input_ = self._prepare_input(output_a, output_b, input, reference)
+        input_ = self._prepare_input(prediction, prediction_b, input, reference)
         result = await self.acall(
             inputs=input_,
             callbacks=callbacks,

@@ -21,14 +21,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]
 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]
 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT = PromptTemplate(
-    input_variables=["input", "output_a", "output_b"], template=template
+    input_variables=["input", "prediction", "prediction_b"], template=template
 )
 template = """Act as a fair judge and rate the two responses to the question below.\
@@ -52,13 +52,14 @@ After giving your rationale, make your final decision using this format:\
 [/QUESTION]
 [RESPONSE A]
-{output_a}
+{prediction}
 [/RESPONSE A]
 [RESPONSE B]
-{output_b}
+{prediction_b}
 [/RESPONSE B]"""
 PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "output_a", "output_b", "reference"], template=template
+    input_variables=["input", "prediction", "prediction_b", "reference"],
+    template=template,
 )

@@ -62,8 +62,8 @@ class PairwiseStringEvaluator(Protocol):
     def evaluate_string_pairs(
         self,
         *,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -71,8 +71,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.
         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.
@@ -86,8 +86,8 @@ class PairwiseStringEvaluator(Protocol):
     async def aevaluate_string_pairs(
         self,
-        output_a: str,
-        output_b: str,
+        prediction: str,
+        prediction_b: str,
         reference: Optional[str] = None,
         input: Optional[str] = None,
         **kwargs: Any,
@@ -95,8 +95,8 @@ class PairwiseStringEvaluator(Protocol):
         """Evaluate the output string pairs.
         Args:
-            output_a (str): The output string from the first model.
-            output_b (str): The output string from the second model.
+            prediction (str): The output string from the first model.
+            prediction_b (str): The output string from the second model.
             reference (str, optional): The expected output / reference
                 string. Defaults to None.
             input (str, optional): The input string. Defaults to None.

@@ -45,14 +45,14 @@ def test_trajectory_eval_chain(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     # Test when ref is provided
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1
@@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools(
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
     )
     assert res["score"] == 5
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
-        output="I like pie.",
+        prediction="I like pie.",
         reference="Paris",
     )
     assert res["score"] == 1

@@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None:
     )
     chain = PairwiseStringEvalChain.from_llm(llm=llm)
     res = chain.evaluate_string_pairs(
-        output_a="I like pie.",
-        output_b="I love pie.",
+        prediction="I like pie.",
+        prediction_b="I love pie.",
         input="What is your favorite food?",
     )
     assert res["value"] is None
     assert res["score"] == 0.5
     assert res["reasoning"] == "The values are the same."
     res = chain.evaluate_string_pairs(
-        output_a="I like pie.",
-        output_b="I like pie.",
+        prediction="I like pie.",
+        prediction_b="I like pie.",
        input="What is your favorite food?",
     )
     assert res["value"] == "A"
     assert res["score"] == 1
     res = chain.evaluate_string_pairs(
-        output_a="I like pie.",
-        output_b="I hate pie.",
+        prediction="I like pie.",
+        prediction_b="I hate pie.",
         input="What is your favorite food?",
     )
     assert res["value"] == "B"
