@@ -114,6 +114,7 @@ class EvaluatorCallbackHandler(BaseTracer):
         try:
             if self.project_name is None:
                 eval_result = self.client.evaluate_run(run, evaluator)
+                eval_results = [eval_result]
             with manager.tracing_v2_enabled(
                 project_name=self.project_name, tags=["eval"], client=self.client
             ) as cb:
@@ -126,17 +127,10 @@ class EvaluatorCallbackHandler(BaseTracer):
                     run,
                     example=reference_example,
                 )
-                run_id = cb.latest_run.id if cb.latest_run is not None else None
-                self.client.create_feedback(
-                    run.id,
-                    evaluation_result.key,
-                    score=evaluation_result.score,
-                    value=evaluation_result.value,
-                    comment=evaluation_result.comment,
-                    correction=evaluation_result.correction,
-                    source_info=evaluation_result.evaluator_info,
-                    source_run_id=evaluation_result.source_run_id or run_id,
-                    feedback_source_type=langsmith.schemas.FeedbackSourceType.MODEL,
-                )
+                eval_results = self._log_evaluation_feedback(
+                    evaluation_result,
+                    run,
+                    source_run_id=cb.latest_run.id if cb.latest_run else None,
+                )
         except Exception as e:
             logger.error(
@@ -147,9 +141,59 @@ class EvaluatorCallbackHandler(BaseTracer):
             raise e
         example_id = str(run.reference_example_id)
         with self.lock:
-            self.logged_eval_results.setdefault((str(run.id), example_id), []).append(
-                eval_result
-            )
+            for res in eval_results:
+                run_id = (
+                    str(getattr(res, "target_run_id"))
+                    if hasattr(res, "target_run_id")
+                    else str(run.id)
+                )
+                self.logged_eval_results.setdefault((run_id, example_id), []).append(
+                    res
+                )
+
+    def _select_eval_results(
+        self,
+        results: Union[EvaluationResult, dict],
+    ) -> List[EvaluationResult]:
+        if isinstance(results, EvaluationResult):
+            results_ = [results]
+        elif isinstance(results, dict) and "results" in results:
+            results_ = cast(List[EvaluationResult], results["results"])
+        else:
+            raise TypeError(
+                f"Invalid evaluation result type {type(results)}."
+                " Expected EvaluationResult or EvaluationResults."
+            )
+        return results_
+
+    def _log_evaluation_feedback(
+        self,
+        evaluator_response: Union[EvaluationResult, dict],
+        run: Run,
+        source_run_id: Optional[UUID] = None,
+    ) -> List[EvaluationResult]:
+        results = self._select_eval_results(evaluator_response)
+        for res in results:
+            source_info_: Dict[str, Any] = {}
+            if res.evaluator_info:
+                source_info_ = {**res.evaluator_info, **source_info_}
+            run_id_ = (
+                getattr(res, "target_run_id")
+                if hasattr(res, "target_run_id") and res.target_run_id is not None
+                else run.id
+            )
+            self.client.create_feedback(
+                run_id_,
+                res.key,
+                score=res.score,
+                value=res.value,
+                comment=res.comment,
+                correction=res.correction,
+                source_info=source_info_,
+                source_run_id=res.source_run_id or source_run_id,
+                feedback_source_type=langsmith.schemas.FeedbackSourceType.MODEL,
+            )
+        return results
 
     def _persist_run(self, run: Run) -> None:
         """Run the evaluator on the run.