diff --git a/assets/arena.html b/assets/arena.html
index 6424542..b20553d 100644
--- a/assets/arena.html
+++ b/assets/arena.html
@@ -746,8 +746,8 @@
           messages: messages,
           stream: true,
         };
-        const { max_output_token, need_max_tokens } = retrieveModel(this.models, chat.model_id);
-        if (!body["max_tokens"] && need_max_tokens) {
+        const { max_output_token, pass_max_tokens } = retrieveModel(this.models, chat.model_id);
+        if (!body["max_tokens"] && pass_max_tokens) {
           body["max_tokens"] = max_output_token;
         };
         return body;
@@ -819,14 +819,14 @@
     function retrieveModel(models, id) {
       const model = models.find(model => model.id === id);
       if (!model) return {};
-      const max_output_token = model.max_output_tokens || model["max_output_tokens?"] || null;
-      const need_max_tokens = !!model.max_output_tokens;
+      const max_output_token = model.max_output_tokens;
       const supports_vision = !!model.supports_vision;
+      const pass_max_tokens = !!model.pass_max_tokens;
       return {
         id,
         max_output_token,
-        need_max_tokens,
         supports_vision,
+        pass_max_tokens,
       }
     }
 
diff --git a/assets/playground.html b/assets/playground.html
index 485d7b6..0036fb4 100644
--- a/assets/playground.html
+++ b/assets/playground.html
@@ -939,8 +939,8 @@
             body[body_key || setting_key] = this.settings[setting_key];
           }
         });
-        const { max_output_token, need_max_tokens } = this.currentModel;
-        if (!body["max_tokens"] && need_max_tokens) {
+        const { max_output_token, pass_max_tokens } = this.currentModel;
+        if (!body["max_tokens"] && pass_max_tokens) {
           body["max_tokens"] = max_output_token;
         };
         return body;
@@ -1013,14 +1013,14 @@
     function retrieveModel(models, id) {
       const model = models.find(model => model.id === id);
       if (!model) return {};
-      const max_output_token = model.max_output_tokens || model["max_output_tokens?"] || null;
-      const need_max_tokens = !!model.max_output_tokens;
+      const max_output_token = model.max_output_tokens;
       const supports_vision = !!model.supports_vision;
+      const pass_max_tokens = !!model.pass_max_tokens;
       return {
         id,
         max_output_token,
-        need_max_tokens,
         supports_vision,
+        pass_max_tokens,
       }
     }
 
diff --git a/config.example.yaml b/config.example.yaml
index a1dc07d..3f4672d 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -33,7 +33,6 @@ clients:
 #     models:
 #       - name: xxxx # The model name
 #         max_input_tokens: 100000
-#         max_output_tokens: 4096
 #         supports_vision: true
 #         extra_fields: # Set custom parameters, will merge with the body json
 #           key: value
diff --git a/models.yaml b/models.yaml
index 38fb642..0799308 100644
--- a/models.yaml
+++ b/models.yaml
@@ -8,44 +8,45 @@
   models:
     - name: gpt-3.5-turbo
       max_input_tokens: 16385
-      max_output_tokens?: 4096
+      max_output_tokens: 4096
       input_price: 0.5
       output_price: 1.5
     - name: gpt-3.5-turbo-1106
       max_input_tokens: 16385
-      max_output_tokens?: 4096
+      max_output_tokens: 4096
       input_price: 1
       output_price: 2
     - name: gpt-4-turbo
-      max_input_tokens: 128000
-      max_output_tokens?: 4096
-      input_price: 10
-      output_price: 30
-      supports_vision: true
-    - name: gpt-4-turbo-preview
-      max_input_tokens: 128000
-      max_output_tokens?: 4096
-      input_price: 10
-      output_price: 30
-    - name: gpt-4-1106-preview
-      max_input_tokens: 128000
-      max_output_tokens?: 4096
-      input_price: 10
-      output_price: 30
-    - name: gpt-4-vision-preview
       max_input_tokens: 128000
       max_output_tokens: 4096
       input_price: 10
       output_price: 30
       supports_vision: true
+    - name: gpt-4-turbo-preview
+      max_input_tokens: 128000
+      max_output_tokens: 4096
+      input_price: 10
+      output_price: 30
+    - name: gpt-4-1106-preview
+      max_input_tokens: 128000
+      max_output_tokens: 4096
+      input_price: 10
+      output_price: 30
+    - name: gpt-4-vision-preview
+      max_input_tokens: 128000
+      max_output_tokens: 4096
+      pass_max_tokens: true
+      input_price: 10
+      output_price: 30
+      supports_vision: true
     - name: gpt-4
       max_input_tokens: 8192
-      max_output_tokens?: 4096
+      max_output_tokens: 4096
       input_price: 30
       output_price: 60
     - name: gpt-4-32k
       max_input_tokens: 32768
-      max_output_tokens?: 4096
+      max_output_tokens: 4096
       input_price: 60
       output_price: 120
 
@@ -59,18 +60,18 @@
   models:
     - name: gemini-1.0-pro-latest
       max_input_tokens: 30720
-      max_output_tokens?: 2048
+      max_output_tokens: 2048
       input_price: 0.5
       output_price: 1.5
     - name: gemini-1.0-pro-vision-latest
       max_input_tokens: 12288
-      max_output_tokens?: 4096
+      max_output_tokens: 4096
       input_price: 0.5
       output_price: 1.5
       supports_vision: true
     - name: gemini-1.5-pro-latest
       max_input_tokens: 1048576
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 7
       output_price: 21
       supports_vision: true
@@ -85,18 +86,21 @@
     - name: claude-3-opus-20240229
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 15
       output_price: 75
       supports_vision: true
     - name: claude-3-sonnet-20240229
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 3
       output_price: 15
       supports_vision: true
     - name: claude-3-haiku-20240307
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.25
       output_price: 1.25
       supports_vision: true
@@ -140,12 +144,12 @@
   models:
     - name: command-r
      max_input_tokens: 128000
-      max_output_tokens?: 4000
+      max_output_tokens: 4000
       input_price: 0.5
       output_price: 1.5
     - name: command-r-plus
       max_input_tokens: 128000
-      max_output_tokens?: 4000
+      max_output_tokens: 4000
       input_price: 3
       output_price: 15
 
@@ -159,28 +163,28 @@
   models:
     - name: llama-3-sonar-small-32k-chat
       max_input_tokens: 32768
-      max_output_tokens?: 32768
+      max_output_tokens: 32768
       input_price: 0.2
       output_price: 0.2
     - name: llama-3-sonar-large-32k-chat
       max_input_tokens: 32768
-      max_output_tokens?: 32768
+      max_output_tokens: 32768
       input_price: 0.6
       output_price: 0.6
     - name: llama-3-8b-instruct
       max_input_tokens: 8192
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 0.2
       output_price: 0.2
     - name: llama-3-70b-instruct
       max_input_tokens: 8192
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 1
       output_price: 1
     - name: mixtral-8x7b-instruct
       max_input_tokens: 16384
-      max_output_tokens?: 16384
+      max_output_tokens: 16384
       input_price: 0.6
       output_price: 0.6
 
@@ -195,22 +199,22 @@
   models:
     - name: llama3-8b-8192
       max_input_tokens: 8192
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 0.05
       output_price: 0.10
     - name: llama3-70b-8192
       max_input_tokens: 8192
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 0.59
       output_price: 0.79
     - name: mixtral-8x7b-32768
       max_input_tokens: 32768
-      max_output_tokens?: 32768
+      max_output_tokens: 32768
       input_price: 0.27
       output_price: 0.27
     - name: gemma-7b-it
       max_input_tokens: 8192
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 0.10
       output_price: 0.10
 
@@ -224,18 +228,18 @@
   models:
     - name: gemini-1.0-pro
       max_input_tokens: 24568
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 0.125
       output_price: 0.375
     - name: gemini-1.0-pro-vision
       max_input_tokens: 14336
-      max_output_tokens?: 2048
+      max_output_tokens: 2048
       input_price: 0.125
       output_price: 0.375
       supports_vision: true
     - name: gemini-1.5-pro-preview-0409
       max_input_tokens: 1000000
-      max_output_tokens?: 8192
+      max_output_tokens: 8192
       input_price: 2.5
       output_price: 7.5
       supports_vision: true
@@ -250,18 +254,21 @@
     - name: claude-3-opus@20240229
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 15
       output_price: 75
       supports_vision: true
     - name: claude-3-sonnet@20240229
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 3
       output_price: 15
       supports_vision: true
     - name: claude-3-haiku@20240307
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.25
       output_price: 1.25
       supports_vision: true
@@ -277,44 +284,52 @@
     - name: anthropic.claude-3-opus-20240229-v1:0
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 15
       output_price: 75
       supports_vision: true
     - name: anthropic.claude-3-sonnet-20240229-v1:0
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 3
       output_price: 15
       supports_vision: true
     - name: anthropic.claude-3-haiku-20240307-v1:0
       max_input_tokens: 200000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.25
       output_price: 1.25
       supports_vision: true
     - name: meta.llama3-8b-instruct-v1:0
       max_input_tokens: 8192
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.4
       output_price: 0.6
     - name: meta.llama3-70b-instruct-v1:0
       max_input_tokens: 8192
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 2.65
       output_price: 3.5
     - name: mistral.mistral-7b-instruct-v0:2
       max_input_tokens: 32000
       max_output_tokens: 8192
+      pass_max_tokens: true
       input_price: 0.15
       output_price: 0.2
     - name: mistral.mixtral-8x7b-instruct-v0:1
       max_input_tokens: 32000
       max_output_tokens: 8192
+      pass_max_tokens: true
       input_price: 0.45
       output_price: 0.7
     - name: mistral.mistral-large-2402-v1:0
       max_input_tokens: 32000
       max_output_tokens: 8192
+      pass_max_tokens: true
       input_price: 8
       output_price: 2.4
 
@@ -328,21 +343,27 @@
     - name: '@cf/meta/llama-3-8b-instruct'
       max_input_tokens: 4096
       max_output_tokens: 4096
+      pass_max_tokens: true
     - name: '@cf/mistral/mistral-7b-instruct-v0.2-lora'
       max_input_tokens: 4096
       max_output_tokens: 4096
+      pass_max_tokens: true
    - name: '@cf/google/gemma-7b-it-lora'
       max_input_tokens: 4096
       max_output_tokens: 4096
+      pass_max_tokens: true
     - name: '@cf/qwen/qwen1.5-14b-chat-awq'
       max_input_tokens: 4096
       max_output_tokens: 4096
+      pass_max_tokens: true
     - name: '@hf/thebloke/deepseek-coder-6.7b-instruct-awq'
       max_input_tokens: 4096
       max_output_tokens: 4096
+      pass_max_tokens: true
     - name: '@hf/nexusflow/starling-lm-7b-beta'
       max_input_tokens: 4096
       max_output_tokens: 4096
+      pass_max_tokens: true
 
 - platform: replicate
   # docs:
@@ -354,21 +375,25 @@
     - name: meta/meta-llama-3-70b-instruct
       max_input_tokens: 8192
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.65
       output_price: 2.75
     - name: meta/meta-llama-3-8b-instruct
       max_input_tokens: 8192
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.05
       output_price: 0.25
     - name: mistralai/mistral-7b-instruct-v0.2
       max_input_tokens: 32000
       max_output_tokens: 8192
+      pass_max_tokens: true
       input_price: 0.05
       output_price: 0.25
     - name: mistralai/mixtral-8x7b-instruct-v0.1
       max_input_tokens: 32000
       max_output_tokens: 8192
+      pass_max_tokens: true
       input_price: 0.3
       output_price: 1
 
@@ -382,26 +407,31 @@
     - name: ernie-4.0-8k-preview
       max_input_tokens: 5120
       max_output_tokens: 2048
+      pass_max_tokens: true
       input_price: 16.8
       output_price: 16.8
     - name: ernie-3.5-8k-preview
       max_input_tokens: 5120
       max_output_tokens: 2048
+      pass_max_tokens: true
       input_price: 1.68
       output_price: 1.68
     - name: ernie-speed-128k
       max_input_tokens: 124000
       max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.56
       output_price: 1.12
     - name: ernie-lite-8k
       max_input_tokens: 7168
       max_output_tokens: 2048
+      pass_max_tokens: true
       input_price: 0.42
       output_price: 0.84
     - name: ernie-tiny-8k
       max_input_tokens: 7168
       max_output_tokens: 2048
+      pass_max_tokens: true
       input_price: 0.14
       output_price: 0.14
 
@@ -414,22 +444,22 @@
   models:
     - name: qwen-turbo
       max_input_tokens: 6000
-      max_output_tokens?: 1500
+      max_output_tokens: 1500
       input_price: 1.12
       output_price: 1.12
     - name: qwen-plus
       max_input_tokens: 30000
-      max_output_tokens?: 2000
+      max_output_tokens: 2000
       input_price: 2.8
       output_price: 2.8
     - name: qwen-max
       max_input_tokens: 6000
-      max_output_tokens?: 2000
+      max_output_tokens: 2000
       input_price: 16.8
       output_price: 16.8
     - name: qwen-max-longcontext
       max_input_tokens: 28000
-      max_output_tokens?: 2000
+      max_output_tokens: 2000
     - name: qwen-vl-plus
       input_price: 1.12
       output_price: 1.12
@@ -686,16 +716,22 @@
       supports_vision: true
     - name: anthropic/claude-3-opus
       max_input_tokens: 200000
+      max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 15
       output_price: 75
       supports_vision: true
     - name: anthropic/claude-3-sonnet
       max_input_tokens: 200000
+      max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 3
       output_price: 15
       supports_vision: true
     - name: anthropic/claude-3-haiku
       max_input_tokens: 200000
+      max_output_tokens: 4096
+      pass_max_tokens: true
       input_price: 0.25
       output_price: 1.25
       supports_vision: true
diff --git a/src/client/bedrock.rs b/src/client/bedrock.rs
index 6abd939..b07152b 100644
--- a/src/client/bedrock.rs
+++ b/src/client/bedrock.rs
@@ -172,7 +172,7 @@ async fn send_message_streaming(
                 let data: Value = decode_chunk(message.payload()).ok_or_else(|| {
                     anyhow!("Invalid chunk data: {}", hex_encode(message.payload()))
                 })?;
-                debug!("bedrock chunk: {data}");
+                // debug!("bedrock chunk: {data}");
                 match model_category {
                     ModelCategory::Anthropic => {
                         if let Some(typ) = data["type"].as_str() {
@@ -235,7 +235,7 @@ fn meta_llama_build_body(data: SendData, model: &Model, pt: PromptFormat) -> Res
     let prompt = generate_prompt(&messages, pt)?;
 
     let mut body = json!({ "prompt": prompt });
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["max_gen_len"] = v.into();
     }
     if let Some(v) = temperature {
@@ -258,7 +258,7 @@ fn mistral_build_body(data: SendData, model: &Model) -> Result<Value> {
     let prompt = generate_prompt(&messages, MISTRAL_PROMPT_FORMAT)?;
 
     let mut body = json!({ "prompt": prompt });
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["max_tokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/claude.rs b/src/client/claude.rs
index 0a230e9..89742f3 100644
--- a/src/client/claude.rs
+++ b/src/client/claude.rs
@@ -142,7 +142,7 @@ pub fn claude_build_body(data: SendData, model: &Model) -> Result<Value> {
     if let Some(v) = system_message {
         body["system"] = v.into();
     }
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["max_tokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/cloudflare.rs b/src/client/cloudflare.rs
index 9758032..5a4bf8c 100644
--- a/src/client/cloudflare.rs
+++ b/src/client/cloudflare.rs
@@ -88,7 +88,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
         "messages": messages,
     });
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["max_tokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/cohere.rs b/src/client/cohere.rs
index e0ef6f0..b5d6647 100644
--- a/src/client/cohere.rs
+++ b/src/client/cohere.rs
@@ -135,7 +135,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
         body["chat_history"] = messages.into();
     }
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["max_tokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/ernie.rs b/src/client/ernie.rs
index d3002ff..28cb857 100644
--- a/src/client/ernie.rs
+++ b/src/client/ernie.rs
@@ -128,7 +128,7 @@ fn build_body(data: SendData, model: &Model) -> Value {
         "messages": messages,
     });
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["max_output_tokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/model.rs b/src/client/model.rs
index b24a546..d0d86e4 100644
--- a/src/client/model.rs
+++ b/src/client/model.rs
@@ -14,11 +14,11 @@ pub struct Model {
     pub name: String,
     pub max_input_tokens: Option<usize>,
     pub max_output_tokens: Option<isize>,
-    pub ref_max_output_tokens: Option<isize>,
+    pub pass_max_tokens: bool,
     pub input_price: Option<f64>,
     pub output_price: Option<f64>,
-    pub extra_fields: Option<IndexMap<String, Value>>,
     pub capabilities: ModelCapabilities,
+    pub extra_fields: Option<IndexMap<String, Value>>,
 }
 
 impl Default for Model {
@@ -32,13 +32,13 @@ impl Model {
         Self {
             client_name: client_name.into(),
             name: name.into(),
-            extra_fields: None,
             max_input_tokens: None,
             max_output_tokens: None,
-            ref_max_output_tokens: None,
+            pass_max_tokens: false,
             input_price: None,
             output_price: None,
             capabilities: ModelCapabilities::Text,
+            extra_fields: None,
         }
     }
 
@@ -49,8 +49,7 @@ impl Model {
                 let mut model = Model::new(client_name, &v.name);
                 model
                     .set_max_input_tokens(v.max_input_tokens)
-                    .set_max_output_tokens(v.max_output_tokens)
-                    .set_ref_max_output_tokens(v.ref_max_output_tokens)
+                    .set_max_tokens(v.max_output_tokens, v.pass_max_tokens)
                     .set_input_price(v.input_price)
                     .set_output_price(v.output_price)
                     .set_supports_vision(v.supports_vision)
@@ -97,7 +96,7 @@ impl Model {
 
     pub fn description(&self) -> String {
         let max_input_tokens = format_option_value(&self.max_input_tokens);
-        let max_output_tokens = format_option_value(&self.show_max_output_tokens());
+        let max_output_tokens = format_option_value(&self.max_output_tokens);
         let input_price = format_option_value(&self.input_price);
         let output_price = format_option_value(&self.output_price);
         let vision = if self.capabilities.contains(ModelCapabilities::Vision) {
@@ -115,8 +114,12 @@
         self.capabilities.contains(ModelCapabilities::Vision)
     }
 
-    pub fn show_max_output_tokens(&self) -> Option<isize> {
-        self.max_output_tokens.or(self.ref_max_output_tokens)
+    pub fn max_tokens_param(&self) -> Option<isize> {
+        if self.pass_max_tokens {
+            self.max_output_tokens
+        } else {
+            None
+        }
     }
 
     pub fn set_max_input_tokens(&mut self, max_input_tokens: Option<usize>) -> &mut Self {
@@ -127,19 +130,16 @@
         self
     }
 
-    pub fn set_max_output_tokens(&mut self, max_output_tokens: Option<isize>) -> &mut Self {
+    pub fn set_max_tokens(
+        &mut self,
+        max_output_tokens: Option<isize>,
+        pass_max_tokens: bool,
+    ) -> &mut Self {
         match max_output_tokens {
             None | Some(0) => self.max_output_tokens = None,
             _ => self.max_output_tokens = max_output_tokens,
         }
-        self
-    }
-
-    pub fn set_ref_max_output_tokens(&mut self, ref_max_output_tokens: Option<isize>) -> &mut Self {
-        match ref_max_output_tokens {
-            None | Some(0) => self.ref_max_output_tokens = None,
-            _ => self.ref_max_output_tokens = ref_max_output_tokens,
-        }
+        self.pass_max_tokens = pass_max_tokens;
         self
     }
 
@@ -237,12 +237,12 @@ pub struct ModelConfig {
     pub name: String,
     pub max_input_tokens: Option<usize>,
     pub max_output_tokens: Option<isize>,
-    #[serde(rename = "max_output_tokens?")]
-    pub ref_max_output_tokens: Option<isize>,
     pub input_price: Option<f64>,
     pub output_price: Option<f64>,
     #[serde(default)]
     pub supports_vision: bool,
+    #[serde(default)]
+    pub pass_max_tokens: bool,
     pub extra_fields: Option<IndexMap<String, Value>>,
 }
 
diff --git a/src/client/ollama.rs b/src/client/ollama.rs
index b61417a..6408d2e 100644
--- a/src/client/ollama.rs
+++ b/src/client/ollama.rs
@@ -159,7 +159,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
         "options": {},
     });
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["options"]["num_predict"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/openai.rs b/src/client/openai.rs
index 08bb94d..0b111db 100644
--- a/src/client/openai.rs
+++ b/src/client/openai.rs
@@ -90,7 +90,7 @@ pub fn openai_build_body(data: SendData, model: &Model) -> Value {
         "messages": messages,
     });
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["max_tokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/qianwen.rs b/src/client/qianwen.rs
index 3fa17e6..7391a38 100644
--- a/src/client/qianwen.rs
+++ b/src/client/qianwen.rs
@@ -173,7 +173,7 @@ fn build_body(data: SendData, model: &Model, is_vl: bool) -> Result<(Value, bool
         parameters["incremental_output"] = true.into();
     }
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         parameters["max_tokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/client/replicate.rs b/src/client/replicate.rs
index a20ce71..34cfd94 100644
--- a/src/client/replicate.rs
+++ b/src/client/replicate.rs
@@ -148,7 +148,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
         "prompt_template": "{prompt}"
     });
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         input["max_tokens"] = v.into();
         input["max_new_tokens"] = v.into();
     }
diff --git a/src/client/vertexai.rs b/src/client/vertexai.rs
index 9907a7f..4d06934 100644
--- a/src/client/vertexai.rs
+++ b/src/client/vertexai.rs
@@ -201,7 +201,7 @@ pub(crate) fn gemini_build_body(
         body["safetySettings"] = safety_settings;
     }
 
-    if let Some(v) = model.max_output_tokens {
+    if let Some(v) = model.max_tokens_param() {
         body["generationConfig"]["maxOutputTokens"] = v.into();
     }
     if let Some(v) = temperature {
diff --git a/src/config/mod.rs b/src/config/mod.rs
index f33ee73..5ac44a8 100644
--- a/src/config/mod.rs
+++ b/src/config/mod.rs
@@ -422,7 +422,7 @@ impl Config {
             (
                 "max_output_tokens",
                 self.model
-                    .max_output_tokens
+                    .max_tokens_param()
                     .map(|v| format!("{v} (current model)"))
                     .unwrap_or_else(|| "-".into()),
             ),
@@ -523,7 +523,7 @@ impl Config {
             (values, args[0])
         } else if args.len() == 2 {
             let values = match args[0] {
-                "max_output_tokens" => match self.model.show_max_output_tokens() {
+                "max_output_tokens" => match self.model.max_output_tokens {
                     Some(v) => vec![v.to_string()],
                     None => vec![],
                 },
@@ -564,7 +564,7 @@ impl Config {
         match key {
             "max_output_tokens" => {
                 let value = parse_value(value)?;
-                self.model.set_max_output_tokens(value);
+                self.model.set_max_tokens(value, true);
             }
             "temperature" => {
                 let value = parse_value(value)?;
diff --git a/src/serve.rs b/src/serve.rs
index 5f748e8..4413c89 100644
--- a/src/serve.rs
+++ b/src/serve.rs
@@ -93,7 +93,7 @@ impl Server {
                 "id": id,
                 "max_input_tokens": model.max_input_tokens,
                 "max_output_tokens": model.max_output_tokens,
-                "max_output_tokens?": model.ref_max_output_tokens,
+                "pass_max_tokens": model.pass_max_tokens,
                 "input_price": model.input_price,
                 "output_price": model.output_price,
                 "supports_vision": model.supports_vision(),
@@ -244,7 +244,7 @@
         let mut client = init_client(&config)?;
         if max_tokens.is_some() {
-            client.model_mut().set_max_output_tokens(max_tokens);
+            client.model_mut().set_max_tokens(max_tokens, true);
         }
         let abort = create_abort_signal();
         let http_client = client.build_client()?;