diff --git a/application/llm/anthropic.py b/application/llm/anthropic.py
index 6b0d646..70495f0 100644
--- a/application/llm/anthropic.py
+++ b/application/llm/anthropic.py
@@ -1,18 +1,22 @@
 from application.llm.base import BaseLLM
 from application.core.settings import settings
 
+
 class AnthropicLLM(BaseLLM):
 
     def __init__(self, api_key=None):
         from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
-        self.api_key = api_key or settings.ANTHROPIC_API_KEY  # If not provided, use a default from settings
+
+        self.api_key = (
+            api_key or settings.ANTHROPIC_API_KEY
+        )  # If not provided, use a default from settings
         self.anthropic = Anthropic(api_key=self.api_key)
         self.HUMAN_PROMPT = HUMAN_PROMPT
         self.AI_PROMPT = AI_PROMPT
 
-    def gen(self, model, messages, max_tokens=300, stream=False, **kwargs):
-        context = messages[0]['content']
-        user_question = messages[-1]['content']
+    def _raw_gen(self, model, messages, max_tokens=300, stream=False, **kwargs):
+        context = messages[0]["content"]
+        user_question = messages[-1]["content"]
         prompt = f"### Context \n {context} \n ### Question \n {user_question}"
         if stream:
             return self.gen_stream(model, prompt, max_tokens, **kwargs)
@@ -25,9 +29,9 @@ class AnthropicLLM(BaseLLM):
         )
         return completion.completion
 
-    def gen_stream(self, model, messages, max_tokens=300, **kwargs):
-        context = messages[0]['content']
-        user_question = messages[-1]['content']
+    def _raw_gen_stream(self, model, messages, max_tokens=300, **kwargs):
+        context = messages[0]["content"]
+        user_question = messages[-1]["content"]
         prompt = f"### Context \n {context} \n ### Question \n {user_question}"
         stream_response = self.anthropic.completions.create(
             model=model,
@@ -37,4 +41,4 @@ class AnthropicLLM(BaseLLM):
         )
 
         for completion in stream_response:
-            yield completion.completion
\ No newline at end of file
+            yield completion.completion
diff --git a/application/llm/huggingface.py b/application/llm/huggingface.py
index 554bee2..c9e500e 100644
--- a/application/llm/huggingface.py
+++ b/application/llm/huggingface.py
@@ -1,44 +1,57 @@
 from application.llm.base import BaseLLM
 
+
 class HuggingFaceLLM(BaseLLM):
 
-    def __init__(self, api_key, llm_name='Arc53/DocsGPT-7B',q=False):
+    def __init__(self, api_key, llm_name="Arc53/DocsGPT-7B", q=False):
         global hf
-        
+
         from langchain.llms import HuggingFacePipeline
+
         if q:
             import torch
-            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
+            from transformers import (
+                AutoModelForCausalLM,
+                AutoTokenizer,
+                pipeline,
+                BitsAndBytesConfig,
+            )
+
             tokenizer = AutoTokenizer.from_pretrained(llm_name)
             bnb_config = BitsAndBytesConfig(
-                            load_in_4bit=True,
-                            bnb_4bit_use_double_quant=True,
-                            bnb_4bit_quant_type="nf4",
-                            bnb_4bit_compute_dtype=torch.bfloat16
-                        )
-            model = AutoModelForCausalLM.from_pretrained(llm_name,quantization_config=bnb_config)
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
+            model = AutoModelForCausalLM.from_pretrained(
+                llm_name, quantization_config=bnb_config
+            )
         else:
             from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
             tokenizer = AutoTokenizer.from_pretrained(llm_name)
             model = AutoModelForCausalLM.from_pretrained(llm_name)
-        
+
         pipe = pipeline(
-            "text-generation", model=model,
-            tokenizer=tokenizer, max_new_tokens=2000,
-            device_map="auto", eos_token_id=tokenizer.eos_token_id
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=2000,
+            device_map="auto",
+            eos_token_id=tokenizer.eos_token_id,
         )
         hf = HuggingFacePipeline(pipeline=pipe)
 
-    def gen(self, model, messages, stream=False, **kwargs):
-        context = messages[0]['content']
-        user_question = messages[-1]['content']
+    def _raw_gen(self, model, messages, stream=False, **kwargs):
+        context = messages[0]["content"]
+        user_question = messages[-1]["content"]
         prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
 
         result = hf(prompt)
 
         return result.content
 
-    def gen_stream(self, model, messages, stream=True, **kwargs):
+    def _raw_gen_stream(self, model, messages, stream=True, **kwargs):
         raise NotImplementedError("HuggingFaceLLM Streaming is not implemented yet.")
-
diff --git a/application/llm/llama_cpp.py b/application/llm/llama_cpp.py
index be34d4f..1512cd7 100644
--- a/application/llm/llama_cpp.py
+++ b/application/llm/llama_cpp.py
@@ -1,6 +1,7 @@
 from application.llm.base import BaseLLM
 from application.core.settings import settings
 
+
 class LlamaCpp(BaseLLM):
 
     def __init__(self, api_key, llm_name=settings.MODEL_PATH, **kwargs):
@@ -8,25 +9,27 @@ class LlamaCpp(BaseLLM):
         try:
             from llama_cpp import Llama
         except ImportError:
-            raise ImportError("Please install llama_cpp using pip install llama-cpp-python")
+            raise ImportError(
+                "Please install llama_cpp using pip install llama-cpp-python"
+            )
 
         llama = Llama(model_path=llm_name, n_ctx=2048)
 
-    def gen(self, model, messages, stream=False, **kwargs):
-        context = messages[0]['content']
-        user_question = messages[-1]['content']
+    def _raw_gen(self, model, messages, stream=False, **kwargs):
+        context = messages[0]["content"]
+        user_question = messages[-1]["content"]
         prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
 
         result = llama(prompt, max_tokens=150, echo=False)
 
         # import sys
         # print(result['choices'][0]['text'].split('### Answer \n')[-1], file=sys.stderr)
-        
-        return result['choices'][0]['text'].split('### Answer \n')[-1]
 
-    def gen_stream(self, model, messages, stream=True, **kwargs):
-        context = messages[0]['content']
-        user_question = messages[-1]['content']
+        return result["choices"][0]["text"].split("### Answer \n")[-1]
+
+    def _raw_gen_stream(self, model, messages, stream=True, **kwargs):
+        context = messages[0]["content"]
+        user_question = messages[-1]["content"]
         prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
 
         result = llama(prompt, max_tokens=150, echo=False, stream=stream)
@@ -35,5 +38,5 @@ class LlamaCpp(BaseLLM):
         # print(list(result), file=sys.stderr)
 
         for item in result:
-            for choice in item['choices']:
-                yield choice['text']
+            for choice in item["choices"]:
+                yield choice["text"]
diff --git a/application/llm/openai.py b/application/llm/openai.py
index 4b0ed25..de29246 100644
--- a/application/llm/openai.py
+++ b/application/llm/openai.py
@@ -1,36 +1,49 @@
 from application.llm.base import BaseLLM
 from application.core.settings import settings
 
+
 class OpenAILLM(BaseLLM):
 
     def __init__(self, api_key):
         global openai
         from openai import OpenAI
-        
+
         self.client = OpenAI(
-            api_key=api_key,
-            )
+            api_key=api_key,
+        )
         self.api_key = api_key
 
     def _get_openai(self):
         # Import openai when needed
         import openai
-        
+
         return openai
 
-    def gen(self, model, messages, stream=False, engine=settings.AZURE_DEPLOYMENT_NAME, **kwargs):
-        response = self.client.chat.completions.create(model=model,
-            messages=messages,
-            stream=stream,
-            **kwargs)
+    def _raw_gen(
+        self,
+        model,
+        messages,
+        stream=False,
+        engine=settings.AZURE_DEPLOYMENT_NAME,
+        **kwargs
+    ):
+        response = self.client.chat.completions.create(
+            model=model, messages=messages, stream=stream, **kwargs
+        )
 
         return response.choices[0].message.content
 
-    def gen_stream(self, model, messages, stream=True, engine=settings.AZURE_DEPLOYMENT_NAME, **kwargs):
-        response = self.client.chat.completions.create(model=model,
-            messages=messages,
-            stream=stream,
-            **kwargs)
+    def _raw_gen_stream(
+        self,
+        model,
+        messages,
+        stream=True,
+        engine=settings.AZURE_DEPLOYMENT_NAME,
+        **kwargs
+    ):
+        response = self.client.chat.completions.create(
+            model=model, messages=messages, stream=stream, **kwargs
+        )
 
         for line in response:
             # import sys
@@ -41,14 +54,17 @@ class OpenAILLM(BaseLLM):
 
 class AzureOpenAILLM(OpenAILLM):
 
-    def __init__(self, openai_api_key, openai_api_base, openai_api_version, deployment_name):
+    def __init__(
+        self, openai_api_key, openai_api_base, openai_api_version, deployment_name
+    ):
         super().__init__(openai_api_key)
-        self.api_base = settings.OPENAI_API_BASE,
-        self.api_version = settings.OPENAI_API_VERSION,
-        self.deployment_name = settings.AZURE_DEPLOYMENT_NAME,
+        self.api_base = (settings.OPENAI_API_BASE,)
+        self.api_version = (settings.OPENAI_API_VERSION,)
+        self.deployment_name = (settings.AZURE_DEPLOYMENT_NAME,)
         from openai import AzureOpenAI
+
         self.client = AzureOpenAI(
-            api_key=openai_api_key, 
+            api_key=openai_api_key,
             api_version=settings.OPENAI_API_VERSION,
             api_base=settings.OPENAI_API_BASE,
             deployment_name=settings.AZURE_DEPLOYMENT_NAME,
diff --git a/application/llm/premai.py b/application/llm/premai.py
index 5faa5fe..c0552ea 100644
--- a/application/llm/premai.py
+++ b/application/llm/premai.py
@@ -1,32 +1,35 @@
 from application.llm.base import BaseLLM
 from application.core.settings import settings
 
+
 class PremAILLM(BaseLLM):
 
     def __init__(self, api_key):
         from premai import Prem
-        
-        self.client = Prem(
-            api_key=api_key
-        )
+
+        self.client = Prem(api_key=api_key)
         self.api_key = api_key
         self.project_id = settings.PREMAI_PROJECT_ID
 
-    def gen(self, model, messages, stream=False, **kwargs):
-        response = self.client.chat.completions.create(model=model,
+    def _raw_gen(self, model, messages, stream=False, **kwargs):
+        response = self.client.chat.completions.create(
+            model=model,
             project_id=self.project_id,
             messages=messages,
             stream=stream,
-            **kwargs)
+            **kwargs
+        )
 
         return response.choices[0].message["content"]
 
-    def gen_stream(self, model, messages, stream=True, **kwargs):
-        response = self.client.chat.completions.create(model=model,
+    def _raw_gen_stream(self, model, messages, stream=True, **kwargs):
+        response = self.client.chat.completions.create(
+            model=model,
             project_id=self.project_id,
             messages=messages,
             stream=stream,
-            **kwargs)
+            **kwargs
+        )
 
         for line in response:
             if line.choices[0].delta["content"] is not None:
diff --git a/application/llm/sagemaker.py b/application/llm/sagemaker.py
index b81f638..b531020 100644
--- a/application/llm/sagemaker.py
+++ b/application/llm/sagemaker.py
@@ -4,11 +4,10 @@ import json
 import io
 
 
-
 class LineIterator:
     """
-    A helper class for parsing the byte stream input. 
-    
+    A helper class for parsing the byte stream input.
+
     The output of the model will be in the following format:
     ```
     b'{"outputs": [" a"]}\n'
@@ -16,21 +15,21 @@ class LineIterator:
     b'{"outputs": [" problem"]}\n'
     ...
     ```
-    
-    While usually each PayloadPart event from the event stream will contain a byte array 
+
+    While usually each PayloadPart event from the event stream will contain a byte array
     with a full json, this is not guaranteed and some of the json objects may be split across
     PayloadPart events. For example:
     ```
     {'PayloadPart': {'Bytes': b'{"outputs": '}}
     {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
     ```
-    
+
     This class accounts for this by concatenating bytes written via the 'write' function
     and then exposing a method which will return lines (ending with a '\n' character) within
-    the buffer via the 'scan_lines' function. It maintains the position of the last read 
-    position to ensure that previous bytes are not exposed again. 
+    the buffer via the 'scan_lines' function. It maintains the position of the last read
+    position to ensure that previous bytes are not exposed again.
     """
-    
+
     def __init__(self, stream):
         self.byte_iterator = iter(stream)
         self.buffer = io.BytesIO()
@@ -43,7 +42,7 @@ class LineIterator:
         while True:
             self.buffer.seek(self.read_pos)
             line = self.buffer.readline()
-            if line and line[-1] == ord('\n'):
+            if line and line[-1] == ord("\n"):
                 self.read_pos += len(line)
                 return line[:-1]
             try:
@@ -52,33 +51,32 @@ class LineIterator:
                 if self.read_pos < self.buffer.getbuffer().nbytes:
                     continue
                 raise
-            if 'PayloadPart' not in chunk:
-                print('Unknown event type:' + chunk)
+            if "PayloadPart" not in chunk:
+                print("Unknown event type:" + chunk)
                 continue
             self.buffer.seek(0, io.SEEK_END)
-            self.buffer.write(chunk['PayloadPart']['Bytes'])
+            self.buffer.write(chunk["PayloadPart"]["Bytes"])
+
 
 class SagemakerAPILLM(BaseLLM):
 
     def __init__(self, *args, **kwargs):
         import boto3
+
         runtime = boto3.client(
-            'runtime.sagemaker',
-            aws_access_key_id='xxx',
-            aws_secret_access_key='xxx',
-            region_name='us-west-2'
+            "runtime.sagemaker",
+            aws_access_key_id="xxx",
+            aws_secret_access_key="xxx",
+            region_name="us-west-2",
         )
-        
-        self.endpoint = settings.SAGEMAKER_ENDPOINT 
+
+        self.endpoint = settings.SAGEMAKER_ENDPOINT
         self.runtime = runtime
-        
 
-    def gen(self, model, messages, stream=False, **kwargs):
-        context = messages[0]['content']
-        user_question = messages[-1]['content']
+    def _raw_gen(self, model, messages, stream=False, **kwargs):
+        context = messages[0]["content"]
+        user_question = messages[-1]["content"]
         prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
 
-
         # Construct payload for endpoint
         payload = {
@@ -89,25 +87,25 @@
                 "temperature": 0.1,
                 "max_new_tokens": 30,
                 "repetition_penalty": 1.03,
-                "stop": ["", "###"]
-                }
+                "stop": ["", "###"],
+            },
         }
-        body_bytes = json.dumps(payload).encode('utf-8')
+        body_bytes = json.dumps(payload).encode("utf-8")
 
         # Invoke the endpoint
-        response = self.runtime.invoke_endpoint(EndpointName=self.endpoint,
-                                                ContentType='application/json',
-                                                Body=body_bytes)
-        result = json.loads(response['Body'].read().decode())
+        response = self.runtime.invoke_endpoint(
+            EndpointName=self.endpoint, ContentType="application/json", Body=body_bytes
+        )
+        result = json.loads(response["Body"].read().decode())
         import sys
-        print(result[0]['generated_text'], file=sys.stderr)
-        return result[0]['generated_text'][len(prompt):]
 
-    def gen_stream(self, model, messages, stream=True, **kwargs):
-        context = messages[0]['content']
-        user_question = messages[-1]['content']
+        print(result[0]["generated_text"], file=sys.stderr)
+        return result[0]["generated_text"][len(prompt) :]
+
+    def _raw_gen_stream(self, model, messages, stream=True, **kwargs):
+        context = messages[0]["content"]
+        user_question = messages[-1]["content"]
         prompt = f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"
 
-
         # Construct payload for endpoint
         payload = {
@@ -118,22 +116,22 @@
                 "temperature": 0.1,
                 "max_new_tokens": 512,
                 "repetition_penalty": 1.03,
-                "stop": ["", "###"]
-                }
+                "stop": ["", "###"],
+            },
         }
-        body_bytes = json.dumps(payload).encode('utf-8')
+        body_bytes = json.dumps(payload).encode("utf-8")
 
         # Invoke the endpoint
-        response = self.runtime.invoke_endpoint_with_response_stream(EndpointName=self.endpoint,
-                                                                     ContentType='application/json',
-                                                                     Body=body_bytes)
-        #result = json.loads(response['Body'].read().decode())
-        event_stream = response['Body']
-        start_json = b'{'
+        response = self.runtime.invoke_endpoint_with_response_stream(
+            EndpointName=self.endpoint, ContentType="application/json", Body=body_bytes
+        )
+        # result = json.loads(response['Body'].read().decode())
+        event_stream = response["Body"]
+        start_json = b"{"
         for line in LineIterator(event_stream):
-            if line != b'' and start_json in line:
-                #print(line)
-                data = json.loads(line[line.find(start_json):].decode('utf-8'))
-                if data['token']['text'] not in ["", "###"]:
-                    print(data['token']['text'],end='')
-                    yield data['token']['text']
\ No newline at end of file
+            if line != b"" and start_json in line:
+                # print(line)
+                data = json.loads(line[line.find(start_json) :].decode("utf-8"))
+                if data["token"]["text"] not in ["", "###"]:
+                    print(data["token"]["text"], end="")
+                    yield data["token"]["text"]
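Note: every provider class in this patch renames `gen`/`gen_stream` to `_raw_gen`/`_raw_gen_stream`, which only makes sense alongside a matching change to `application/llm/base.py` that is not included in this diff. The snippet below is a hypothetical, minimal sketch of the kind of template-method wrapper such a rename implies (public `gen()`/`gen_stream()` on the base class delegating to the provider-specific `_raw_*` hooks); it is not the project's actual base class.

```python
# Hypothetical sketch only -- the real application/llm/base.py is not shown in this diff.
# It illustrates the template-method shape implied by the gen -> _raw_gen rename:
# the public API lives on the base class, providers implement the _raw_* hooks.
from abc import ABC, abstractmethod


class BaseLLM(ABC):
    def gen(self, model, messages, stream=False, **kwargs):
        # Cross-cutting behaviour (logging, caching, token accounting, ...)
        # would be layered here, around the provider-specific call.
        return self._raw_gen(model=model, messages=messages, stream=stream, **kwargs)

    def gen_stream(self, model, messages, stream=True, **kwargs):
        return self._raw_gen_stream(
            model=model, messages=messages, stream=stream, **kwargs
        )

    @abstractmethod
    def _raw_gen(self, model, messages, stream=False, **kwargs):
        """Provider-specific, non-streaming completion."""

    @abstractmethod
    def _raw_gen_stream(self, model, messages, stream=True, **kwargs):
        """Provider-specific streaming completion (generator)."""
```

With a wrapper of this shape, callers keep using `gen()`/`gen_stream()` unchanged while every subclass in this diff only has to provide the raw provider call.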