From 84c1bbe8833d8a5530923e70ed2787445167e8ba Mon Sep 17 00:00:00 2001
From: Atinoda <61033436+Atinoda@users.noreply.github.com>
Date: Fri, 23 Jun 2023 15:00:43 +0100
Subject: [PATCH] Integrate `llama-cublas` into base image

- Update README.md with deprecation warnings and improved descriptions.
- Fix the build error in the latest `llama-cpp-python` version by removing CMAKE directives.

---
 Dockerfile |  4 ++--
 README.md  | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index daa1149..caade7b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,6 +20,7 @@ RUN git clone https://github.com/oobabooga/text-generation-webui /src
 #ARG LCL_SRC_DIR="text-generation-webui"
 #COPY ${LCL_SRC_DIR} /src
 #################################
+ENV LLAMA_CUBLAS=1
 # Copy source to app
 RUN cp -ar /src /app
 # Install oobabooga/text-generation-webui
@@ -94,8 +95,7 @@ FROM base AS llama-cublas
 RUN echo "LLAMA-CUBLAS" >> /variant.txt
 RUN apt-get install --no-install-recommends -y git python3-dev build-essential python3-pip
 ENV LLAMA_CUBLAS=1
-RUN pip uninstall -y llama-cpp-python && \
-    CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN pip uninstall -y llama-cpp-python && pip install llama-cpp-python
 ENV EXTRA_LAUNCH_ARGS=""
 CMD ["python3", "/app/server.py"]
 
diff --git a/README.md b/README.md
index 3972722..c94f2d1 100644
--- a/README.md
+++ b/README.md
@@ -12,20 +12,20 @@ This project dockerises the deployment of [oobabooga/text-generation-webui](http
 - CUDA docker runtime
 
 ## Docker Compose
-This is the recommended deployment method.
+This is the recommended deployment method (it is the easiest and quickest way to manage folders and settings through updates and reinstalls). The recommended variant is `default` (an enhanced version of the vanilla application).
 
 ### Select variant
-Choose the desired variant by setting the image `:tag` in `docker-compose.yml` to one of the following options:
+Each variant includes the 'extras' of `default` but with the changes described in the table. Choose the desired variant by setting the image `:tag` in `docker-compose.yml` to one of the following options:
 
 | Variant | Description |
 |---|---|
-| `default` | Implementation of the vanilla deployment from source. Also includes pre-installed `ExLlAMA` library from `turboderp/exllama`. |
-| `triton` | Updated `GPTQ-for-llama` using the latest `triton` branch from `qwopqwop200/GPTQ-for-LLaMa`. Suitable for Linux only. |
-| `cuda` | Updated `GPTQ-for-llama` using the latest `cuda` branch from `qwopqwop200/GPTQ-for-LLaMa`. |
-| `monkey-patch` | Use LoRAs in 4-Bit `GPTQ-for-llama` mode. |
-| `llama-cublas` | CUDA GPU offloading enabled for `llama-cpp`. Use by setting option `n-gpu-layers` > 0. |
+| `default` | Implementation of the vanilla deployment from source, plus the pre-installed `ExLlama` library from `turboderp/exllama` and CUDA GPU offloading enabled for `llama-cpp`. *This version is recommended for most users.* |
+| `triton` | Updated `GPTQ-for-llama` using the latest `triton` branch from `qwopqwop200/GPTQ-for-LLaMa`. Suitable for Linux only. *This version is accurate but a little slow.* |
+| `cuda` | Updated `GPTQ-for-llama` using the latest `cuda` branch from `qwopqwop200/GPTQ-for-LLaMa`. *This version is very slow!* |
+| `monkey-patch` | Use LoRAs in 4-bit `GPTQ-for-llama` mode. ***DEPRECATION WARNING:** This version is outdated, but will remain for now.* |
+| `llama-cublas` | CUDA GPU offloading enabled for `llama-cpp`. Use by setting option `n-gpu-layers` > 0. ***DEPRECATION WARNING:** This capability has been rolled into the `default` variant. The variant will be removed if the upstream dependency does not conflict with `default`.* |
 
-*See: [oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md) and [obabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md) for more information on variants.*
+*See: [oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md), [oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md), and [oobabooga/text-generation-webui/blob/main/docs/ExLlama.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md) for more information on variants.*
 
 ### Deploy
 Deploy the service:
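
Below is a minimal, illustrative `docker-compose.yml` sketch of how the variant `:tag` and the `n-gpu-layers` option described in the README changes fit together. The service name, image name (`atinoda/text-generation-webui`), port, and volume path are assumptions for illustration, and it assumes the image's entrypoint appends `EXTRA_LAUNCH_ARGS` (defined in the Dockerfile above) to the `server.py` launch command; adapt it to the compose file shipped with the project.

```yml
# Illustrative sketch only — service/image names, port, and paths are assumptions.
version: "3.8"
services:
  text-generation-webui:
    image: atinoda/text-generation-webui:default   # select the variant via the image :tag
    environment:
      # Assumes the entrypoint appends these flags to the server.py launch command;
      # n-gpu-layers > 0 enables llama-cpp CUDA GPU offloading per the README.
      - EXTRA_LAUNCH_ARGS=--listen --n-gpu-layers 20
    ports:
      - "7860:7860"            # default web UI port (assumed)
    volumes:
      - ./models:/app/models   # persist models on the host (illustrative path)
    deploy:
      resources:
        reservations:
          devices:             # requires the CUDA docker runtime (see prerequisites)
            - driver: nvidia
              count: all
              capabilities: [gpu]
```

Bring the service up with `docker compose up`, then confirm from the container logs that layers are actually being offloaded to the GPU.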