Running is simple to get started. If using OpenAI, set `export OPENAI_API_KEY=<OPENAIKEY>` (or pass the key in through the `client_connection` variable), then run
```python
from manifest import Manifest

# Start a manifest session to OpenAI
manifest = Manifest(
    client_name = "openai",
)
manifest.run("Why is the grass green?")
```
Manifest is meant to be a very lightweight package to help with prompt design and iteration.
* Supports caching of model inputs/outputs for iteration, reproducibility, and cost saving
## Models
Manifest provides model clients for [OpenAI](https://openai.com/), [AI21](https://studio.ai21.com/), [Cohere](https://cohere.ai/), [Together](https://together.xyz/), and HuggingFace (see [below](#huggingface-models) for how to use locally hosted HuggingFace models). You can toggle between the models by changing `client_name` and `client_connection`. For example, if a HuggingFace model is loaded locally, run
```python
manifest = Manifest(
    client_name = "huggingface",
    client_connection = "http://127.0.0.1:5000",
)
```
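Before pointing Manifest at a local endpoint, it can help to confirm something is actually listening there. A minimal stdlib sketch (not part of Manifest; the host and port match the example above):

```python
import socket

def server_listening(host: str = "127.0.0.1", port: int = 5000, timeout: float = 2.0) -> bool:
    """Return True if a TCP server accepts connections at host:port."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False
```

If this returns `False`, start the model server first (see the HuggingFace section below).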
If you want to use Cohere, run
```python
manifest = Manifest(
    client_name = "cohere",
    client_connection = "<COHERE_API_KEY>",
)
```
You can also just set `export COHERE_API_KEY=<COHERE_API_KEY>` and not use `client_connection`.
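The fallback order — an explicitly passed `client_connection` wins, otherwise the environment variable is used — can be sketched as follows (a hypothetical helper for illustration, not Manifest's internal code):

```python
import os

def resolve_api_key(client_connection=None, env_var="COHERE_API_KEY"):
    """Prefer an explicitly passed key; fall back to the environment."""
    key = client_connection or os.environ.get(env_var)
    if key is None:
        raise ValueError(f"Pass client_connection or set {env_var}")
    return key
```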
You can see the model details and possible model inputs to `run()` via
```python
print(manifest.client.get_model_params())
print(manifest.client.get_model_inputs())
```

## HuggingFace Models
To use a HuggingFace generative model, in `manifest/api` we have a Flask application that hosts the models for you.
In a separate terminal or Tmux/Screen session, to load a 6B-parameter model, run
```bash
python3 -m manifest.api.app \
--model_type huggingface \
--model_name_or_path EleutherAI/gpt-j-6B \
--device 0
```
To help load larger models, we also support using `parallelize()` from HF and [accelerate](https://huggingface.co/docs/accelerate/index).
* T0pp
```bash
python3 -m manifest.api.app \
--model_type huggingface \
--model_name_or_path bigscience/T0pp \
--use_hf_parallelize
```
* NeoX 20B (requires at least 60GB of GPU memory)
```bash
python3 -m manifest.api.app \
--model_type huggingface \
--model_name_or_path EleutherAI/gpt-neox-20b \
--use_accelerate_multigpu
```
* Bloom 175B (requires at least 240GB of GPU memory)
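The GPU-memory figures above can be sanity-checked with a back-of-envelope, weights-only estimate (a rough lower bound, not Manifest code; real usage adds activations, buffers, and framework overhead):

```python
def weights_gib(n_params: float, bytes_per_param: int = 2) -> float:
    """Weights-only memory footprint in GiB (2 bytes/param for fp16/bf16)."""
    return n_params * bytes_per_param / 2**30

# GPT-NeoX-20B in fp16 is roughly 37 GiB of weights alone, so the
# "at least 60GB" guidance above leaves headroom for overhead.
print(round(weights_gib(20e9), 1))
```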