# flake8: noqa
import os
import warnings
from typing import Any, Dict, List, Optional, Union

from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.agents import AgentAction, AgentFinish
from langchain_core.outputs import LLMResult


class DeepEvalCallbackHandler(BaseCallbackHandler):
    """Callback Handler that logs into deepeval.

    Args:
        implementation_name: name of the `implementation` in deepeval
        metrics: A list of metrics

    Raises:
        ImportError: if the `deepeval` package is not installed.

    Examples:
        >>> from langchain_community.llms import OpenAI
        >>> from langchain_community.callbacks import DeepEvalCallbackHandler
        >>> from deepeval.metrics import AnswerRelevancy
        >>> metric = AnswerRelevancy(minimum_score=0.3)
        >>> deepeval_callback = DeepEvalCallbackHandler(
        ...     implementation_name="exampleImplementation",
        ...     metrics=[metric],
        ... )
        >>> llm = OpenAI(
        ...     temperature=0,
        ...     callbacks=[deepeval_callback],
        ...     verbose=True,
        ...     openai_api_key="API_KEY_HERE",
        ... )
        >>> llm.generate([
        ...     "What is the best evaluation tool out there? (no bias at all)",
        ... ])
        "Deepeval, no doubt about it."
    """

    REPO_URL: str = "https://github.com/confident-ai/deepeval"
    ISSUES_URL: str = f"{REPO_URL}/issues"
    BLOG_URL: str = "https://docs.confident-ai.com"

    def __init__(
        self,
        metrics: List[Any],
        implementation_name: Optional[str] = None,
    ) -> None:
        """Initializes the `DeepEvalCallbackHandler`.

        Args:
            implementation_name: Name of the `implementation` in deepeval.
            metrics: The deepeval metrics to track.

        Raises:
            ImportError: if the `deepeval` package is not installed.
            ConnectionError: if the connection to deepeval fails.
        """
        super().__init__()

        # Import deepeval (not via `import_deepeval` to keep hints in IDEs)
        try:
            import deepeval  # noqa: F401,I001
        except ImportError:
            raise ImportError(
                "To use the deepeval callback manager you need to have the"
                " `deepeval` Python package installed. Please install it with"
                " `pip install deepeval`"
            )

        if os.path.exists(".deepeval"):
            warnings.warn(
                "You are currently not logging anything to the dashboard; we"
                " recommend using `deepeval login`."
            )

        # Set the deepeval variables
        self.implementation_name = implementation_name
        self.metrics = metrics

        warnings.warn(
            (
                "The `DeepEvalCallbackHandler` is currently in beta and is subject to"
                " change based on updates to `langchain`. Please report any issues to"
                f" {self.ISSUES_URL} as an `integration` issue."
            ),
        )

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Store the prompts."""
        self.prompts = prompts

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Do nothing when a new token is generated."""
        pass

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Log records to deepeval when an LLM ends."""
        from deepeval.metrics.answer_relevancy import AnswerRelevancy
        from deepeval.metrics.bias_classifier import UnBiasedMetric
        from deepeval.metrics.metric import Metric
        from deepeval.metrics.toxic_classifier import NonToxicMetric

        for metric in self.metrics:
            for i, generation in enumerate(response.generations):
                # Here, we only measure the first generation's output
                output = generation[0].text
                query = self.prompts[i]
                if isinstance(metric, AnswerRelevancy):
                    result = metric.measure(
                        output=output,
                        query=query,
                    )
                    print(f"Answer Relevancy: {result}")
                elif isinstance(metric, UnBiasedMetric):
                    score = metric.measure(output)
                    print(f"Bias Score: {score}")
                elif isinstance(metric, NonToxicMetric):
                    score = metric.measure(output)
                    print(f"Toxic Score: {score}")
                else:
                    raise ValueError(
                        f"Metric {metric.__class__.__name__} is not supported by"
                        " deepeval callbacks."
                    )

    def on_llm_error(self, error: BaseException, **kwargs: Any) -> None:
        """Do nothing when LLM outputs an error."""
        pass

    def on_chain_start(
        self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
    ) -> None:
        """Do nothing when chain starts."""
        pass

    def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None:
        """Do nothing when chain ends."""
        pass

    def on_chain_error(self, error: BaseException, **kwargs: Any) -> None:
        """Do nothing when LLM chain outputs an error."""
        pass

    def on_tool_start(
        self,
        serialized: Dict[str, Any],
        input_str: str,
        **kwargs: Any,
    ) -> None:
        """Do nothing when tool starts."""
        pass

    def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
        """Do nothing when agent takes a specific action."""
        pass

    def on_tool_end(
        self,
        output: str,
        observation_prefix: Optional[str] = None,
        llm_prefix: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Do nothing when tool ends."""
        pass

    def on_tool_error(self, error: BaseException, **kwargs: Any) -> None:
        """Do nothing when tool outputs an error."""
        pass

    def on_text(self, text: str, **kwargs: Any) -> None:
        """Do nothing."""
        pass

    def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None:
        """Do nothing."""
        pass
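

# --- Usage sketch (illustrative only) --------------------------------------
# A minimal sketch of wiring the handler into a LangChain LLM so that
# `on_llm_end` scores each completion. It mirrors the docstring example above;
# the "API_KEY_HERE" placeholder and the 0.3 relevancy threshold are
# assumptions. `UnBiasedMetric` / `NonToxicMetric` instances (from
# `deepeval.metrics.bias_classifier` / `deepeval.metrics.toxic_classifier`)
# can be appended to `metrics` as well, but their constructor arguments vary
# across `deepeval` versions, so they are not instantiated here.
if __name__ == "__main__":
    from deepeval.metrics.answer_relevancy import AnswerRelevancy
    from langchain_community.llms import OpenAI

    handler = DeepEvalCallbackHandler(
        metrics=[AnswerRelevancy(minimum_score=0.3)],
        implementation_name="exampleImplementation",
    )
    llm = OpenAI(temperature=0, openai_api_key="API_KEY_HERE")
    # Callbacks can also be passed per call instead of at construction time.
    llm.generate(
        ["What is the best evaluation tool out there? (no bias at all)"],
        callbacks=[handler],
    )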