deploying new version with streaming

pull/629/head
Richard Guo 1 year ago
parent bce2b3025b
commit 057b9f51bc

@ -170,25 +170,25 @@ workflows:
branches:
only:
- main
# build-py-deploy:
# jobs:
# - build-py-linux:
# filters:
# branches:
# only:
# - build-py-macos:
# filters:
# branches:
# only:
# - build-py-windows:
# filters:
# branches:
# only:
# - store-and-upload-wheels:
# filters:
# branches:
# only:
# requires:
# - build-py-windows
# - build-py-linux
# - build-py-macos
build-py-deploy:
jobs:
- build-py-linux:
filters:
branches:
only:
- build-py-macos:
filters:
branches:
only:
- build-py-windows:
filters:
branches:
only:
- store-and-upload-wheels:
filters:
branches:
only:
requires:
- build-py-windows
- build-py-linux
- build-py-macos

@ -155,24 +155,26 @@ class GPT4All():
print("Model downloaded at: " + download_path)
return download_path
def generate(self, prompt: str, streaming: bool = False, **generate_kwargs) -> str:
    """
    Surfaced method of running generate without accessing model object.

    Args:
        prompt: Raw string to be passed to model.
        streaming: If True, stream the generated output to stdout as it
            is produced (default False, which only returns the full text).
        **generate_kwargs: Optional kwargs to pass to prompt context.

    Returns:
        Raw string of generated model response.
    """
    # Delegate to the wrapped low-level model; streaming is forwarded
    # explicitly so it is not swallowed into generate_kwargs.
    return self.model.generate(prompt, streaming=streaming, **generate_kwargs)
def chat_completion(self,
messages: List[Dict],
default_prompt_header: bool = True,
default_prompt_footer: bool = True,
verbose: bool = True,
streaming: bool = True,
**generate_kwargs) -> str:
"""
Format list of message dictionaries into a prompt and call model
@ -189,6 +191,7 @@ class GPT4All():
before user/assistant role messages.
default_prompt_footer: If True (default), add default footer at end of prompt.
verbose: If True (default), print full prompt and generated response.
streaming: True if want output streamed to stdout.
**generate_kwargs: Optional kwargs to pass to prompt context.
Returns:
@ -206,7 +209,7 @@ class GPT4All():
if verbose:
print(full_prompt)
response = self.model.generate(full_prompt, **generate_kwargs)
response = self.model.generate(full_prompt, streaming=streaming, **generate_kwargs)
if verbose:
print(response)

@ -1,25 +1,23 @@
from io import StringIO
import pkg_resources
import ctypes
import os
import platform
import re
import subprocess
import sys
class DualStreamProcessor:
    """stdout replacement that collects (and optionally echoes) model output.

    Everything written to it is cleaned — single newlines that llmodel
    emits mid-response are stripped, double newlines (blank lines) are
    kept — and accumulated in ``self.output``. When ``stream`` is set,
    the cleaned text is also echoed to that stream as it arrives, which
    is how streaming-to-stdout is implemented.
    """

    def __init__(self, stream=None):
        # Target stream to echo output to (e.g. sys.stdout); None disables echo.
        self.stream = stream
        # Accumulated cleaned response text, read back by the caller.
        self.output = ""

    def write(self, text):
        # Remove any "\n" not followed by another "\n": drops spurious
        # single line breaks while preserving intentional blank lines.
        cleaned_text = re.sub(r"\n(?!\n)", "", text)
        if self.stream is not None:
            self.stream.write(cleaned_text)
            # Flush immediately so streamed tokens appear in real time.
            self.stream.flush()
        self.output += cleaned_text
# TODO: provide a config file to make this more robust
LLMODEL_PATH = os.path.join("llmodel_DO_NOT_MODIFY", "build").replace("\\", "\\\\")
@ -175,7 +173,7 @@ class LLModel:
repeat_penalty: float = 1.2,
repeat_last_n: int = 10,
context_erase: float = .5,
std_passthrough: bool = False) -> str:
streaming: bool = False) -> str:
"""
Generate response from model from a prompt.
@ -183,12 +181,8 @@ class LLModel:
----------
prompt: str
Question, task, or conversation for model to respond to
add_default_header: bool, optional
Whether to add a prompt header (default is True)
add_default_footer: bool, optional
Whether to add a prompt footer (default is True)
verbose: bool, optional
Whether to print prompt and response
streaming: bool
Stream response to stdout
Returns
-------
@ -198,13 +192,14 @@ class LLModel:
prompt = prompt.encode('utf-8')
prompt = ctypes.c_char_p(prompt)
# Change stdout to StringIO so we can collect response
old_stdout = sys.stdout
collect_response = StringIO()
if std_passthrough:
sys.stdout = DualOutput(old_stdout, collect_response)
else:
sys.stdout = collect_response
stream_processor = DualStreamProcessor()
if streaming:
stream_processor.stream = sys.stdout
sys.stdout = stream_processor
context = LLModelPromptContext(
logits_size=logits_size,
@ -227,14 +222,11 @@ class LLModel:
ResponseCallback(self._response_callback),
RecalculateCallback(self._recalculate_callback),
context)
response = collect_response.getvalue()
# Revert to old stdout
sys.stdout = old_stdout
# Remove the unnecessary new lines from response
response = re.sub(r"\n(?!\n)", "", response).strip()
return response
return stream_processor.output
# Empty prompt callback
@staticmethod

@ -78,6 +78,8 @@ setup(
'dev': [
'pytest',
'twine',
'wheel',
'setuptools',
'mkdocs-material',
'mkautodoc',
'mkdocstrings[python]',

Loading…
Cancel
Save