langchain/libs/community/langchain_community/callbacks/openai_info.py
joshy-deshaw bf5385592e
core, community: propagate context between threads (#15171)
While using `chain.batch`, the default implementation uses a
`ThreadPoolExecutor` and run the chains in separate threads. An issue
with this approach is that that [the token counting
callback](https://python.langchain.com/docs/modules/callbacks/token_counting)
fails to work as a consequence of the context not being propagated
between threads. This PR adds context propagation to the new threads and
adds some thread synchronization in the OpenAI callback. With this
change, the token counting callback works as intended.

Having the context propagation change would be highly beneficial for
those implementing custom callbacks for similar functionalities as well.

---------

Co-authored-by: Nuno Campos <nuno@langchain.dev>
2023-12-28 14:51:22 -08:00

225 lines
7.5 KiB
Python

"""Callback Handler that prints to std out."""
import threading
from typing import Any, Dict, List
from langchain_core.callbacks import BaseCallbackHandler
from langchain_core.outputs import LLMResult
MODEL_COST_PER_1K_TOKENS = {
# GPT-4 input
"gpt-4": 0.03,
"gpt-4-0314": 0.03,
"gpt-4-0613": 0.03,
"gpt-4-32k": 0.06,
"gpt-4-32k-0314": 0.06,
"gpt-4-32k-0613": 0.06,
"gpt-4-vision-preview": 0.01,
"gpt-4-1106-preview": 0.01,
# GPT-4 output
"gpt-4-completion": 0.06,
"gpt-4-0314-completion": 0.06,
"gpt-4-0613-completion": 0.06,
"gpt-4-32k-completion": 0.12,
"gpt-4-32k-0314-completion": 0.12,
"gpt-4-32k-0613-completion": 0.12,
"gpt-4-vision-preview-completion": 0.03,
"gpt-4-1106-preview-completion": 0.03,
# GPT-3.5 input
"gpt-3.5-turbo": 0.0015,
"gpt-3.5-turbo-0301": 0.0015,
"gpt-3.5-turbo-0613": 0.0015,
"gpt-3.5-turbo-1106": 0.001,
"gpt-3.5-turbo-instruct": 0.0015,
"gpt-3.5-turbo-16k": 0.003,
"gpt-3.5-turbo-16k-0613": 0.003,
# GPT-3.5 output
"gpt-3.5-turbo-completion": 0.002,
"gpt-3.5-turbo-0301-completion": 0.002,
"gpt-3.5-turbo-0613-completion": 0.002,
"gpt-3.5-turbo-1106-completion": 0.002,
"gpt-3.5-turbo-instruct-completion": 0.002,
"gpt-3.5-turbo-16k-completion": 0.004,
"gpt-3.5-turbo-16k-0613-completion": 0.004,
# Azure GPT-35 input
"gpt-35-turbo": 0.0015, # Azure OpenAI version of ChatGPT
"gpt-35-turbo-0301": 0.0015, # Azure OpenAI version of ChatGPT
"gpt-35-turbo-0613": 0.0015,
"gpt-35-turbo-instruct": 0.0015,
"gpt-35-turbo-16k": 0.003,
"gpt-35-turbo-16k-0613": 0.003,
# Azure GPT-35 output
"gpt-35-turbo-completion": 0.002, # Azure OpenAI version of ChatGPT
"gpt-35-turbo-0301-completion": 0.002, # Azure OpenAI version of ChatGPT
"gpt-35-turbo-0613-completion": 0.002,
"gpt-35-turbo-instruct-completion": 0.002,
"gpt-35-turbo-16k-completion": 0.004,
"gpt-35-turbo-16k-0613-completion": 0.004,
# Others
"text-ada-001": 0.0004,
"ada": 0.0004,
"text-babbage-001": 0.0005,
"babbage": 0.0005,
"text-curie-001": 0.002,
"curie": 0.002,
"text-davinci-003": 0.02,
"text-davinci-002": 0.02,
"code-davinci-002": 0.02,
# Fine Tuned input
"babbage-002-finetuned": 0.0016,
"davinci-002-finetuned": 0.012,
"gpt-3.5-turbo-0613-finetuned": 0.012,
# Fine Tuned output
"babbage-002-finetuned-completion": 0.0016,
"davinci-002-finetuned-completion": 0.012,
"gpt-3.5-turbo-0613-finetuned-completion": 0.016,
# Azure Fine Tuned input
"babbage-002-azure-finetuned": 0.0004,
"davinci-002-azure-finetuned": 0.002,
"gpt-35-turbo-0613-azure-finetuned": 0.0015,
# Azure Fine Tuned output
"babbage-002-azure-finetuned-completion": 0.0004,
"davinci-002-azure-finetuned-completion": 0.002,
"gpt-35-turbo-0613-azure-finetuned-completion": 0.002,
# Legacy fine-tuned models
"ada-finetuned-legacy": 0.0016,
"babbage-finetuned-legacy": 0.0024,
"curie-finetuned-legacy": 0.012,
"davinci-finetuned-legacy": 0.12,
}
def standardize_model_name(
model_name: str,
is_completion: bool = False,
) -> str:
"""
Standardize the model name to a format that can be used in the OpenAI API.
Args:
model_name: Model name to standardize.
is_completion: Whether the model is used for completion or not.
Defaults to False.
Returns:
Standardized model name.
"""
model_name = model_name.lower()
if ".ft-" in model_name:
model_name = model_name.split(".ft-")[0] + "-azure-finetuned"
if ":ft-" in model_name:
model_name = model_name.split(":")[0] + "-finetuned-legacy"
if "ft:" in model_name:
model_name = model_name.split(":")[1] + "-finetuned"
if is_completion and (
model_name.startswith("gpt-4")
or model_name.startswith("gpt-3.5")
or model_name.startswith("gpt-35")
or ("finetuned" in model_name and "legacy" not in model_name)
):
return model_name + "-completion"
else:
return model_name
def get_openai_token_cost_for_model(
model_name: str, num_tokens: int, is_completion: bool = False
) -> float:
"""
Get the cost in USD for a given model and number of tokens.
Args:
model_name: Name of the model
num_tokens: Number of tokens.
is_completion: Whether the model is used for completion or not.
Defaults to False.
Returns:
Cost in USD.
"""
model_name = standardize_model_name(model_name, is_completion=is_completion)
if model_name not in MODEL_COST_PER_1K_TOKENS:
raise ValueError(
f"Unknown model: {model_name}. Please provide a valid OpenAI model name."
"Known models are: " + ", ".join(MODEL_COST_PER_1K_TOKENS.keys())
)
return MODEL_COST_PER_1K_TOKENS[model_name] * (num_tokens / 1000)
class OpenAICallbackHandler(BaseCallbackHandler):
"""Callback Handler that tracks OpenAI info."""
total_tokens: int = 0
prompt_tokens: int = 0
completion_tokens: int = 0
successful_requests: int = 0
total_cost: float = 0.0
def __init__(self) -> None:
super().__init__()
self._lock = threading.Lock()
def __repr__(self) -> str:
return (
f"Tokens Used: {self.total_tokens}\n"
f"\tPrompt Tokens: {self.prompt_tokens}\n"
f"\tCompletion Tokens: {self.completion_tokens}\n"
f"Successful Requests: {self.successful_requests}\n"
f"Total Cost (USD): ${self.total_cost}"
)
@property
def always_verbose(self) -> bool:
"""Whether to call verbose callbacks even if verbose is False."""
return True
def on_llm_start(
self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
) -> None:
"""Print out the prompts."""
pass
def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
"""Print out the token."""
pass
def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
"""Collect token usage."""
if response.llm_output is None:
return None
if "token_usage" not in response.llm_output:
with self._lock:
self.successful_requests += 1
return None
# compute tokens and cost for this request
token_usage = response.llm_output["token_usage"]
completion_tokens = token_usage.get("completion_tokens", 0)
prompt_tokens = token_usage.get("prompt_tokens", 0)
model_name = standardize_model_name(response.llm_output.get("model_name", ""))
if model_name in MODEL_COST_PER_1K_TOKENS:
completion_cost = get_openai_token_cost_for_model(
model_name, completion_tokens, is_completion=True
)
prompt_cost = get_openai_token_cost_for_model(model_name, prompt_tokens)
else:
completion_cost = 0
prompt_cost = 0
# update shared state behind lock
with self._lock:
self.total_cost += prompt_cost + completion_cost
self.total_tokens += token_usage.get("total_tokens", 0)
self.prompt_tokens += prompt_tokens
self.completion_tokens += completion_tokens
self.successful_requests += 1
def __copy__(self) -> "OpenAICallbackHandler":
"""Return a copy of the callback handler."""
return self
def __deepcopy__(self, memo: Any) -> "OpenAICallbackHandler":
"""Return a deep copy of the callback handler."""
return self