feat: abandon replicate client (#900)

pull/901/head
sigoden 2 weeks ago committed by GitHub
parent 419c626485
commit e009f2e241

@@ -274,43 +274,6 @@ chat-vertexai() {
        -d "$(_build_body vertexai "$@")"
}
# @cmd Chat with replicate api
# @env REPLICATE_API_KEY!
# @option -m --model=meta/meta-llama-3-8b-instruct $REPLICATE_MODEL
# @flag -S --no-stream
# @arg text~
chat-replicate() {
    url="https://api.replicate.com/v1/models/$argc_model/predictions"
    res="$(_wrapper curl -s "$url" \
        -X POST \
        -H "Authorization: Bearer $REPLICATE_API_KEY" \
        -H "Content-Type: application/json" \
        -d "$(_build_body replicate "$@")" \
    )"
    echo "$res"
    if [[ -n "$argc_no_stream" ]]; then
        prediction_url="$(echo "$res" | jq -r '.urls.get')"
        while true; do
            output="$(_wrapper curl -s -H "Authorization: Bearer $REPLICATE_API_KEY" "$prediction_url")"
            prediction_status=$(printf "%s" "$output" | jq -r .status)
            if [ "$prediction_status" == "succeeded" ]; then
                echo "$output"
                break
            fi
            if [ "$prediction_status" == "failed" ]; then
                exit 1
            fi
            sleep 2
        done
    else
        stream_url="$(echo "$res" | jq -r '.urls.stream')"
        _wrapper curl -i --no-buffer "$stream_url" \
            -H "Accept: text/event-stream"
    fi
}
# @cmd Chat with ernie api
# @meta require-tools jq
# @env ERNIE_API_KEY!
@@ -367,7 +330,7 @@ _choice_platform() {
}
_choice_client() {
    printf "%s\n" openai gemini claude cohere ollama azure-openai vertexai bedrock cloudflare replicate ernie qianwen moonshot
    printf "%s\n" openai gemini claude cohere ollama azure-openai vertexai bedrock cloudflare ernie qianwen moonshot
}
_choice_openai_compatible_platform() {
@@ -445,14 +408,6 @@ _build_body() {
                }
            ],
            "stream": '$stream'
        }'
        ;;
    replicate)
        echo '{
            "stream": '$stream',
            "input": {
                "prompt": "'"$*"'"
            }
        }'
        ;;
    *)

@@ -50,7 +50,6 @@ Effortlessly connect with over 20 leading LLM platforms through a unified interface
- **Perplexity:** Llama-3/Mixtral (paid, chat, online)
- **Cloudflare:** (free, chat, embedding)
- **OpenRouter:** (paid, chat, function-calling)
- **Replicate:** (paid, chat)
- **Ernie:** (paid, chat, embedding, reranker, function-calling)
- **Qianwen:** Qwen (paid, chat, embedding, vision, function-calling)
- **Moonshot:** (paid, chat, function-calling)

@@ -237,10 +237,6 @@ clients:
    api_base: https://api-inference.huggingface.co/v1
    api_key: xxx
  # See https://replicate.com/docs
  - type: replicate
    api_key: xxx
  # See https://cloud.baidu.com/doc/WENXINWORKSHOP/index.html
  - type: ernie
    api_key: xxx

@@ -642,30 +642,6 @@
      input_price: 0
      output_price: 0
# Links:
# - https://replicate.com/explore
# - https://replicate.com/pricing
# - https://replicate.com/docs/reference/http#create-a-prediction-using-an-official-model
- platform: replicate
  models:
    - name: meta/meta-llama-3.1-405b-instruct
      max_input_tokens: 128000
      max_output_tokens: 4096
      input_price: 9.5
      output_price: 9.5
    - name: meta/meta-llama-3-70b-instruct
      max_input_tokens: 8192
      max_output_tokens: 4096
      require_max_tokens: true
      input_price: 0.65
      output_price: 2.75
    - name: meta/meta-llama-3-8b-instruct
      max_input_tokens: 8192
      max_output_tokens: 4096
      require_max_tokens: true
      input_price: 0.05
      output_price: 0.25
# Links:
# - https://cloud.baidu.com/doc/WENXINWORKSHOP/s/Nlks5zkzu
# - https://cloud.baidu.com/doc/WENXINWORKSHOP/s/hlrk4akp7

@@ -4,7 +4,6 @@ mod message;
#[macro_use]
mod macros;
mod model;
mod prompt_format;
mod stream;
pub use crate::function::{ToolCall, ToolResults};
@@ -33,7 +32,6 @@ register_client!(
    ),
    (vertexai, "vertexai", VertexAIConfig, VertexAIClient),
    (bedrock, "bedrock", BedrockConfig, BedrockClient),
    (replicate, "replicate", ReplicateConfig, ReplicateClient),
    (ernie, "ernie", ErnieConfig, ErnieClient),
);

@@ -1,150 +0,0 @@
use super::message::*;

pub struct PromptFormat<'a> {
    pub begin: &'a str,
    pub system_pre_message: &'a str,
    pub system_post_message: &'a str,
    pub user_pre_message: &'a str,
    pub user_post_message: &'a str,
    pub assistant_pre_message: &'a str,
    pub assistant_post_message: &'a str,
    pub end: &'a str,
}

pub const GENERIC_PROMPT_FORMAT: PromptFormat<'static> = PromptFormat {
    begin: "",
    system_pre_message: "",
    system_post_message: "\n",
    user_pre_message: "### Instruction:\n",
    user_post_message: "\n",
    assistant_pre_message: "### Response:\n",
    assistant_post_message: "\n",
    end: "### Response:\n",
};

pub const MISTRAL_PROMPT_FORMAT: PromptFormat<'static> = PromptFormat {
    begin: "",
    system_pre_message: "[INST] <<SYS>>",
    system_post_message: "<</SYS>> [/INST]",
    user_pre_message: "[INST]",
    user_post_message: "[/INST]",
    assistant_pre_message: "",
    assistant_post_message: "",
    end: "",
};

pub const LLAMA3_PROMPT_FORMAT: PromptFormat<'static> = PromptFormat {
    begin: "<|begin_of_text|>",
    system_pre_message: "<|start_header_id|>system<|end_header_id|>\n\n",
    system_post_message: "<|eot_id|>",
    user_pre_message: "<|start_header_id|>user<|end_header_id|>\n\n",
    user_post_message: "<|eot_id|>",
    assistant_pre_message: "<|start_header_id|>assistant<|end_header_id|>\n\n",
    assistant_post_message: "<|eot_id|>",
    end: "<|start_header_id|>assistant<|end_header_id|>\n\n",
};

pub const PHI3_PROMPT_FORMAT: PromptFormat<'static> = PromptFormat {
    begin: "",
    system_pre_message: "<|system|>\n",
    system_post_message: "<|end|>\n",
    user_pre_message: "<|user|>\n",
    user_post_message: "<|end|>\n",
    assistant_pre_message: "<|assistant|>\n",
    assistant_post_message: "<|end|>\n",
    end: "<|assistant|>\n",
};

pub const COMMAND_R_PROMPT_FORMAT: PromptFormat<'static> = PromptFormat {
    begin: "",
    system_pre_message: "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
    system_post_message: "<|END_OF_TURN_TOKEN|>",
    user_pre_message: "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>",
    user_post_message: "<|END_OF_TURN_TOKEN|>",
    assistant_pre_message: "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
    assistant_post_message: "<|END_OF_TURN_TOKEN|>",
    end: "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
};

pub const QWEN_PROMPT_FORMAT: PromptFormat<'static> = PromptFormat {
    begin: "",
    system_pre_message: "<|im_start|>system\n",
    system_post_message: "<|im_end|>",
    user_pre_message: "<|im_start|>user\n",
    user_post_message: "<|im_end|>",
    assistant_pre_message: "<|im_start|>assistant\n",
    assistant_post_message: "<|im_end|>",
    end: "<|im_start|>assistant\n",
};

pub fn generate_prompt(messages: &[Message], format: PromptFormat) -> anyhow::Result<String> {
    let PromptFormat {
        begin,
        system_pre_message,
        system_post_message,
        user_pre_message,
        user_post_message,
        assistant_pre_message,
        assistant_post_message,
        end,
    } = format;
    let mut prompt = begin.to_string();
    let mut image_urls = vec![];
    for message in messages {
        let role = &message.role;
        let content = match &message.content {
            MessageContent::Text(text) => text.clone(),
            MessageContent::Array(list) => {
                let mut parts = vec![];
                for item in list {
                    match item {
                        MessageContentPart::Text { text } => parts.push(text.clone()),
                        MessageContentPart::ImageUrl {
                            image_url: ImageUrl { url },
                        } => {
                            image_urls.push(url.clone());
                        }
                    }
                }
                parts.join("\n\n")
            }
            MessageContent::ToolResults(_) => String::new(),
        };
        match role {
            MessageRole::System => prompt.push_str(&format!(
                "{system_pre_message}{content}{system_post_message}"
            )),
            MessageRole::Assistant => prompt.push_str(&format!(
                "{assistant_pre_message}{content}{assistant_post_message}"
            )),
            MessageRole::User => {
                prompt.push_str(&format!("{user_pre_message}{content}{user_post_message}"))
            }
        }
    }
    if !image_urls.is_empty() {
        anyhow::bail!("The model does not support images: {:?}", image_urls);
    }
    prompt.push_str(end);
    Ok(prompt)
}

pub fn smart_prompt_format(model_name: &str) -> PromptFormat<'static> {
    if model_name.contains("llama3") || model_name.contains("llama-3") {
        LLAMA3_PROMPT_FORMAT
    } else if model_name.contains("llama2")
        || model_name.contains("llama-2")
        || model_name.contains("mistral")
        || model_name.contains("mixtral")
    {
        MISTRAL_PROMPT_FORMAT
    } else if model_name.contains("phi3") || model_name.contains("phi-3") {
        PHI3_PROMPT_FORMAT
    } else if model_name.contains("command-r") {
        COMMAND_R_PROMPT_FORMAT
    } else if model_name.contains("qwen") {
        QWEN_PROMPT_FORMAT
    } else {
        GENERIC_PROMPT_FORMAT
    }
}

@@ -1,195 +0,0 @@
use super::prompt_format::*;
use super::*;

use anyhow::{anyhow, Result};
use reqwest::{Client as ReqwestClient, RequestBuilder};
use serde::Deserialize;
use serde_json::{json, Value};
use std::time::Duration;

const API_BASE: &str = "https://api.replicate.com/v1";

#[derive(Debug, Clone, Deserialize, Default)]
pub struct ReplicateConfig {
    pub name: Option<String>,
    pub api_key: Option<String>,
    #[serde(default)]
    pub models: Vec<ModelData>,
    pub patch: Option<RequestPatch>,
    pub extra: Option<ExtraConfig>,
}

impl ReplicateClient {
    config_get_fn!(api_key, get_api_key);

    pub const PROMPTS: [PromptAction<'static>; 1] =
        [("api_key", "API Key:", true, PromptKind::String)];
}

#[async_trait::async_trait]
impl Client for ReplicateClient {
    client_common_fns!();

    async fn chat_completions_inner(
        &self,
        client: &ReqwestClient,
        data: ChatCompletionsData,
    ) -> Result<ChatCompletionsOutput> {
        let request_data = prepare_chat_completions(self, data)?;
        let builder = self.request_builder(client, request_data, ApiType::ChatCompletions);
        chat_completions(builder, client, &self.get_api_key()?).await
    }

    async fn chat_completions_streaming_inner(
        &self,
        client: &ReqwestClient,
        handler: &mut SseHandler,
        data: ChatCompletionsData,
    ) -> Result<()> {
        let request_data = prepare_chat_completions(self, data)?;
        let builder = self.request_builder(client, request_data, ApiType::ChatCompletions);
        chat_completions_streaming(builder, handler, client).await
    }
}

fn prepare_chat_completions(
    self_: &ReplicateClient,
    data: ChatCompletionsData,
) -> Result<RequestData> {
    let api_key = self_.get_api_key()?;

    let url = format!("{API_BASE}/models/{}/predictions", self_.model.name());

    let body = build_chat_completions_body(data, &self_.model)?;

    let mut request_data = RequestData::new(url, body);

    request_data.bearer_auth(api_key);

    Ok(request_data)
}

async fn chat_completions(
    builder: RequestBuilder,
    client: &ReqwestClient,
    api_key: &str,
) -> Result<ChatCompletionsOutput> {
    let res = builder.send().await?;
    let status = res.status();
    let data: Value = res.json().await?;
    if !status.is_success() {
        catch_error(&data, status.as_u16())?;
    }
    let prediction_url = data["urls"]["get"]
        .as_str()
        .ok_or_else(|| anyhow!("Invalid response data: {data}"))?;
    loop {
        tokio::time::sleep(Duration::from_millis(500)).await;
        let prediction_data: Value = client
            .get(prediction_url)
            .bearer_auth(api_key)
            .send()
            .await?
            .json()
            .await?;
        debug!("non-stream-data: {prediction_data}");
        let err = || anyhow!("Invalid response data: {prediction_data}");
        let status = prediction_data["status"].as_str().ok_or_else(err)?;
        if status == "succeeded" {
            return extract_chat_completions(&prediction_data);
        } else if status == "failed" || status == "canceled" {
            return Err(err());
        }
    }
}

async fn chat_completions_streaming(
    builder: RequestBuilder,
    handler: &mut SseHandler,
    client: &ReqwestClient,
) -> Result<()> {
    let res = builder.send().await?;
    let status = res.status();
    let data: Value = res.json().await?;
    if !status.is_success() {
        catch_error(&data, status.as_u16())?;
    }
    let stream_url = data["urls"]["stream"]
        .as_str()
        .ok_or_else(|| anyhow!("Invalid response data: {data}"))?;
    let sse_builder = client.get(stream_url).header("accept", "text/event-stream");
    let handle = |message: SseMmessage| -> Result<bool> {
        if message.event == "done" {
            return Ok(true);
        }
        debug!("stream-data: {}", message.data);
        handler.text(&message.data)?;
        Ok(false)
    };
    sse_stream(sse_builder, handle).await
}

fn build_chat_completions_body(data: ChatCompletionsData, model: &Model) -> Result<Value> {
    let ChatCompletionsData {
        messages,
        temperature,
        top_p,
        functions: _,
        stream,
    } = data;

    let prompt = generate_prompt(&messages, smart_prompt_format(model.name()))?;

    let mut input = json!({
        "prompt": prompt,
        "prompt_template": "{prompt}"
    });

    if let Some(v) = model.max_tokens_param() {
        input["max_tokens"] = v.into();
        input["max_new_tokens"] = v.into();
    }
    if let Some(v) = temperature {
        input["temperature"] = v.into();
    }
    if let Some(v) = top_p {
        input["top_p"] = v.into();
    }

    let mut body = json!({
        "input": input,
    });

    if stream {
        body["stream"] = true.into();
    }

    Ok(body)
}

fn extract_chat_completions(data: &Value) -> Result<ChatCompletionsOutput> {
    let text = data["output"]
        .as_array()
        .map(|parts| {
            parts
                .iter()
                .filter_map(|v| v.as_str().map(|v| v.to_string()))
                .collect::<Vec<String>>()
                .join("")
        })
        .ok_or_else(|| anyhow!("Invalid response data: {data}"))?;

    let output = ChatCompletionsOutput {
        text: text.to_string(),
        tool_calls: vec![],
        id: data["id"].as_str().map(|v| v.to_string()),
        input_tokens: data["metrics"]["input_token_count"].as_u64(),
        output_tokens: data["metrics"]["output_token_count"].as_u64(),
    };

    Ok(output)
}

@@ -85,6 +85,7 @@ pub enum SseEvent {
#[derive(Debug)]
pub struct SseMmessage {
    #[allow(unused)]
    pub event: String,
    pub data: String,
}
