[feature] added galactica models, improved response error handling (#41)

laurel/helm
Laurel Orr 2 years ago committed by GitHub
parent e86b8e81bf
commit a9d8bdd4dc

@@ -1,5 +1,6 @@
dev: deepspeed
dev:
pip install -e .[all]
pip install -e git+https://github.com/microsoft/DeepSpeed.git#egg=deepspeed
pre-commit install
test: dev check

@@ -1,15 +1,16 @@
"""Flask app."""
import argparse
import json
import logging
import os
import socket
from typing import Dict
import pkg_resources
from flask import Flask, request
from flask import Flask, Response, request
from manifest.api.models.huggingface import HuggingFaceModel
from manifest.api.response import Response
from manifest.api.response import ModelResponse
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -40,12 +41,6 @@ def parse_args() -> argparse.Namespace:
type=str,
help="Name of model or path to model. Used in initialize of model class.",
)
parser.add_argument(
"--model_config",
default=None,
type=str,
help="Model config. Used in initialize of model class.",
)
parser.add_argument(
"--cache_dir", default=None, type=str, help="Cache directory for models."
)
@@ -105,9 +100,8 @@ def main() -> None:
model_type = kwargs.model_type
model_name_or_path = kwargs.model_name_or_path
model_config = kwargs.model_config
if not model_name_or_path and not model_config:
raise ValueError("Must provide model_name_or_path or model_config.")
if not model_name_or_path:
raise ValueError("Must provide model_name_or_path.")
if kwargs.use_accelerate_multigpu:
logger.info("Using accelerate. Overridding --device argument.")
if (
@@ -134,7 +128,6 @@ def main() -> None:
global model
model = MODEL_CONSTRUCTORS[model_type](
model_name_or_path,
model_config=model_config,
cache_dir=kwargs.cache_dir,
device=kwargs.device,
use_accelerate=kwargs.use_accelerate_multigpu,
@@ -148,7 +141,7 @@ def main() -> None:
@app.route("/completions", methods=["POST"])
def completions() -> Dict:
def completions() -> Response:
"""Get completions for generation."""
prompt = request.json["prompt"]
del request.json["prompt"]
@@ -156,17 +149,28 @@ def completions() -> Dict:
if not isinstance(prompt, (str, list)):
raise ValueError("Prompt must be a str or list of str")
results_text = []
for generations in model.generate(prompt, **generation_args):
results_text.append(generations)
results = [{"text": r[0], "text_logprob": r[1]} for r in results_text]
# transform the result into the openai format
return Response(results, response_type="text_completion").__dict__()
try:
results_text = []
for generations in model.generate(prompt, **generation_args):
results_text.append(generations)
results = [{"text": r[0], "text_logprob": r[1]} for r in results_text]
# transform the result into the openai format
return Response(
json.dumps(
ModelResponse(results, response_type="text_completion").__dict__()
),
status=200,
)
except Exception as e:
logger.error(e)
return Response(
json.dumps({"message": str(e)}),
status=400,
)
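
For context, a minimal client-side sketch (not part of the commit) of what the new error handling looks like from the caller's side: a successful call returns the JSON-serialized ModelResponse payload, while a failure now comes back as HTTP 400 with a {"message": ...} body instead of an unhandled server error. The host and port are assumptions; the /completions route and request fields come from the hunk above.

    import requests

    resp = requests.post(
        "http://localhost:5000/completions",  # assumed local server address
        json={"prompt": "What is a galaxy?", "max_new_tokens": 16},
    )
    if resp.status_code == 200:
        print(resp.json())  # OpenAI-style text_completion payload
    else:
        # Errors are now JSON: {"message": "<exception text>"} with status 400
        print("generation failed:", resp.json()["message"])
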
@app.route("/choice_logits", methods=["POST"])
def choice_logits() -> Dict:
def choice_logits() -> Response:
"""Get maximal likely choice via max logits after generation."""
prompt = request.json["prompt"]
del request.json["prompt"]
@@ -179,11 +183,24 @@ def choice_logits() -> Dict:
if not isinstance(gold_choices, list):
raise ValueError("Gold choices must be a list of string choices")
choice_score_list = model.logits_scoring(prompt, gold_choices, **generation_args)
results = [{"text": r[0], "text_logprob": r[1]} for r in choice_score_list]
# transform the result into the openai format
return Response(results, response_type="choice_selection").__dict__()
try:
choice_score_list = model.logits_scoring(
prompt, gold_choices, **generation_args
)
results = [{"text": r[0], "text_logprob": r[1]} for r in choice_score_list]
# transform the result into the openai format
return Response(
json.dumps(
ModelResponse(results, response_type="choice_selection").__dict__()
),
status=200,
)
except Exception as e:
logger.error(e)
return Response(
json.dumps({"message": str(e)}),
status=400,
)
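
A companion sketch (again not part of the commit) for the /choice_logits route, which now follows the same pattern: the request carries a prompt plus a gold_choices list, and failures return the same {"message": ...} body. Host and port are assumptions.

    import requests

    resp = requests.post(
        "http://localhost:5000/choice_logits",  # assumed local server address
        json={"prompt": "The sky is", "gold_choices": ["blue", "green"]},
    )
    resp.raise_for_status()
    print(resp.json())  # choice_selection payload with the highest-scoring choice
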
@app.route("/params", methods=["POST"])

@@ -29,15 +29,20 @@ MODEL_REGISTRY = {
"EleutherAI/gpt-neo-2.7B": GPTNeoForCausalLM,
"EleutherAI/gpt-j-6B": GPTJForCausalLM,
"EleutherAI/gpt-neox-20b": GPTNeoXForCausalLM,
"facebook/opt-125m": OPTForCausalLM,
"facebook/opt-350m": OPTForCausalLM,
"Salesforce/codegen-2B-mono": AutoModelForCausalLM,
"Salesforce/codegen-6B-mono": AutoModelForCausalLM,
"facebook/opt-125m": OPTForCausalLM,
"facebook/opt-350m": OPTForCausalLM,
"facebook/opt-1.3b": OPTForCausalLM,
"facebook/opt-2.7b": OPTForCausalLM,
"facebook/opt-6.7b": OPTForCausalLM,
"facebook/opt-13b": OPTForCausalLM,
"facebook/opt-30b": OPTForCausalLM,
"facebook/galactica-125m": OPTForCausalLM,
"facebook/galactica-1.3b": OPTForCausalLM,
"facebook/galactica-6.7b": OPTForCausalLM,
"facebook/galactica-30b": OPTForCausalLM,
"facebook/galactica-120b": OPTForCausalLM,
"gpt2": GPT2LMHeadModel,
"bigscience/bloom-560m": BloomForCausalLM,
"bigscience/bloom-1b7": BloomForCausalLM,
@@ -108,9 +113,7 @@ class Pipeline:
print(f"Usings max_length: {self.max_length}")
self.tokenizer = tokenizer
# self.device = device
# With bits and bytes, do not want to place inputs on any device
# if self.device:
self.device = (
torch.device("cpu")
if (device == -1 or not torch.cuda.is_available())
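
The surrounding hunk only deletes stale bitsandbytes comments; the device-selection logic itself is unchanged. As a self-contained restatement (the CUDA branch is cut off in the diff, so it is an assumption here), it amounts to:

    import torch

    def pick_device(device: int) -> torch.device:
        # -1 or no available GPU falls back to CPU; otherwise use the given
        # GPU index (the cuda branch is inferred, not shown in this hunk).
        if device == -1 or not torch.cuda.is_available():
            return torch.device("cpu")
        return torch.device(f"cuda:{device}")
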
@@ -144,7 +147,8 @@ class Pipeline:
)
encoded_prompt = encoded_prompt.to(self.device)
output_dict = self.model.generate( # type: ignore
**encoded_prompt,
input_ids=encoded_prompt.input_ids,
attention_mask=encoded_prompt.attention_mask,
max_new_tokens=kwargs.get("max_new_tokens"),
temperature=kwargs.get("temperature", None),
top_k=kwargs.get("top_k", None),
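
The generate call now names input_ids and attention_mask explicitly instead of unpacking the whole tokenizer output. A plausible motivation (not stated in the commit) is that some tokenizers return extra keys, such as token_type_ids, that a model's generate() will reject. The standalone sketch below uses gpt2 as a stand-in to show the two call styles.

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    encoded_prompt = tokenizer("Hello world", return_tensors="pt")

    # Before: model.generate(**encoded_prompt, ...) forwards every tokenizer key.
    # After: only the two tensors generate() is guaranteed to accept.
    output = model.generate(
        input_ids=encoded_prompt.input_ids,
        attention_mask=encoded_prompt.attention_mask,
        max_new_tokens=8,
    )
    print(tokenizer.decode(output[0]))
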
@@ -182,7 +186,6 @@ class HuggingFaceModel(Model):
def __init__(
self,
model_name_or_path: str,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
use_accelerate: bool = False,
@@ -199,7 +202,6 @@ class HuggingFaceModel(Model):
Args:
model_name_or_path: model name string.
model_config: model config string.
cache_dir: cache directory for model.
device: device to use for model.
use_accelerate: whether to use accelerate for multi-gpu inference.
@@ -262,7 +264,13 @@ class HuggingFaceModel(Model):
print(f"Loaded Model DType {model.dtype}")
self.is_encdec = model.config.is_encoder_decoder
if not self.is_encdec:
# Set pad tokens for galactica
if self.model_name.startswith("facebook/galactic"):
# https://github.com/paperswithcode/galai/blob/main/galai/model.py
tokenizer.pad_token = "[PAD]"
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token_id = 1
elif not self.is_encdec:
tokenizer.pad_token = tokenizer.eos_token
if not use_bitsandbytes:
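
The Galactica tokenizers ship without a pad token, and the hunk pins it to "[PAD]" with id 1 following the linked galai code. The sketch below (checkpoint size illustrative) applies the same fix to a standalone tokenizer; without it, batched tokenization with padding=True raises an error.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")
    # Mirror the diff: Galactica ships no pad token, so pin it explicitly.
    tokenizer.pad_token = "[PAD]"
    tokenizer.pad_token_id = 1

    batch = tokenizer(
        ["What is a galaxy?", "Define entropy."],
        padding=True,
        return_tensors="pt",
    )
    print(batch.input_ids.shape, batch.attention_mask.shape)
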

@@ -10,7 +10,6 @@ class Model(ABC):
def __init__(
self,
model_name_or_path: str,
model_config: str,
cache_dir: str,
device: int,
use_accelerate: bool,
@@ -27,7 +26,6 @@ class Model(ABC):
Args:
model_name_or_path: model name string.
model_config: model config string.
cache_dir: cache directory for model.
device: device to use for model.
use_accelerate: whether to use accelerate for multi-gpu inference.

@@ -5,8 +5,8 @@ import uuid
from typing import Any, Dict, List
class Response:
"""Response."""
class ModelResponse:
"""ModelResponse."""
def __init__(self, results: List[Dict[str, Any]], response_type: str) -> None:
"""Initialize response."""

@@ -160,6 +160,7 @@ class Client(ABC):
)
raise e
except requests.exceptions.HTTPError as e:
logger.error(res.text)
raise e
return self.format_response(res.json())

@@ -111,6 +111,7 @@ class HuggingFaceClient(Client):
logger.error("HF request timed out. Increase client_timeout.")
raise e
except requests.exceptions.HTTPError as e:
logger.error(res.text)
raise e
return res.json()
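
Both client classes now log the response body before re-raising, so the server's new {"message": ...} payload is surfaced instead of a bare HTTPError. In isolation the pattern looks like the sketch below; the URL is a placeholder and the logged text is only an example.

    import logging

    import requests

    logger = logging.getLogger(__name__)

    res = requests.post(
        "http://localhost:5000/completions",  # placeholder server address
        json={"prompt": "hi", "max_new_tokens": 4},
    )
    try:
        res.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logger.error(res.text)  # e.g. {"message": "Prompt must be a str or list of str"}
        raise e
    print(res.json())
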
