You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/experimental/langchain_experimental/comprehend_moderation/prompt_safety.py

90 lines
3.1 KiB
Python

import asyncio
from typing import Any, Optional
from langchain_experimental.comprehend_moderation.base_moderation_exceptions import (
ModerationPromptSafetyError,
)
class ComprehendPromptSafety:
"""Class to handle prompt safety moderation."""
def __init__(
self,
client: Any,
callback: Optional[Any] = None,
unique_id: Optional[str] = None,
chain_id: Optional[str] = None,
) -> None:
self.client = client
self.moderation_beacon = {
"moderation_chain_id": chain_id,
"moderation_type": "PromptSafety",
"moderation_status": "LABELS_NOT_FOUND",
}
self.callback = callback
self.unique_id = unique_id
def _get_arn(self) -> str:
region_name = self.client.meta.region_name
service = "comprehend"
prompt_safety_endpoint = "document-classifier-endpoint/prompt-safety"
return f"arn:aws:{service}:{region_name}:aws:{prompt_safety_endpoint}"
def validate(self, prompt_value: str, config: Any = None) -> str:
"""
Check and validate the safety of the given prompt text.
Args:
prompt_value (str): The input text to be checked for unsafe text.
config (Dict[str, Any]): Configuration settings for prompt safety checks.
Raises:
ValueError: If unsafe prompt is found in the prompt text based
on the specified threshold.
Returns:
str: The input prompt_value.
Note:
This function checks the safety of the provided prompt text using
Comprehend's classify_document API and raises an error if unsafe
text is detected with a score above the specified threshold.
Example:
comprehend_client = boto3.client('comprehend')
prompt_text = "Please tell me your credit card information."
config = {"threshold": 0.7}
checked_prompt = check_prompt_safety(comprehend_client, prompt_text, config)
"""
threshold = config.get("threshold")
unsafe_prompt = False
endpoint_arn = self._get_arn()
response = self.client.classify_document(
Text=prompt_value, EndpointArn=endpoint_arn
)
if self.callback and self.callback.prompt_safety_callback:
self.moderation_beacon["moderation_input"] = prompt_value
self.moderation_beacon["moderation_output"] = response
for class_result in response["Classes"]:
if (
class_result["Score"] >= threshold
and class_result["Name"] == "UNSAFE_PROMPT"
):
unsafe_prompt = True
break
if self.callback and self.callback.intent_callback:
if unsafe_prompt:
self.moderation_beacon["moderation_status"] = "LABELS_FOUND"
asyncio.create_task(
self.callback.on_after_intent(self.moderation_beacon, self.unique_id)
)
if unsafe_prompt:
raise ModerationPromptSafetyError
return prompt_value