From 84c1bbe8833d8a5530923e70ed2787445167e8ba Mon Sep 17 00:00:00 2001
From: Atinoda <61033436+Atinoda@users.noreply.github.com>
Date: Fri, 23 Jun 2023 15:00:43 +0100
Subject: [PATCH] Integrate `llama-cublas` into base image

- Update README.md with deprecation warnings and improved descriptions.
- Fix the build error in the latest `llama-cpp-python` version by removing CMAKE directives.

---
 Dockerfile |  4 ++--
 README.md  | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index daa1149..caade7b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,6 +20,7 @@ RUN git clone https://github.com/oobabooga/text-generation-webui /src
 #ARG LCL_SRC_DIR="text-generation-webui"
 #COPY ${LCL_SRC_DIR} /src
 #################################
+ENV LLAMA_CUBLAS=1
 # Copy source to app
 RUN cp -ar /src /app
 # Install oobabooga/text-generation-webui
@@ -94,8 +95,7 @@ FROM base AS llama-cublas
 RUN echo "LLAMA-CUBLAS" >> /variant.txt
 RUN apt-get install --no-install-recommends -y git python3-dev build-essential python3-pip
 ENV LLAMA_CUBLAS=1
-RUN pip uninstall -y llama-cpp-python && \
-    CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN pip uninstall -y llama-cpp-python && pip install llama-cpp-python
 ENV EXTRA_LAUNCH_ARGS=""
 CMD ["python3", "/app/server.py"]
 
diff --git a/README.md b/README.md
index 3972722..c94f2d1 100644
--- a/README.md
+++ b/README.md
@@ -12,20 +12,20 @@ This project dockerises the deployment of [oobabooga/text-generation-webui](http
 - CUDA docker runtime
 
 ## Docker Compose
-This is the recommended deployment method.
+This is the recommended deployment method (it is the easiest and quickest way to manage folders and settings through updates and reinstalls). The recommended variant is `default` (an enhanced version of the vanilla application).
 
 ### Select variant
-Choose the desired variant by setting the image `:tag` in `docker-compose.yml` to one of the following options:
+Each variant includes the 'extras' of `default` but with the changes described in the table. Choose the desired variant by setting the image `:tag` in `docker-compose.yml` to one of the following options:
 
 | Variant | Description |
 |---|---|
-| `default` | Implementation of the vanilla deployment from source. Also includes pre-installed `ExLlAMA` library from `turboderp/exllama`. |
-| `triton` | Updated `GPTQ-for-llama` using the latest `triton` branch from `qwopqwop200/GPTQ-for-LLaMa`. Suitable for Linux only. |
-| `cuda` | Updated `GPTQ-for-llama` using the latest `cuda` branch from `qwopqwop200/GPTQ-for-LLaMa`. |
-| `monkey-patch` | Use LoRAs in 4-Bit `GPTQ-for-llama` mode. |
-| `llama-cublas` | CUDA GPU offloading enabled for `llama-cpp`. Use by setting option `n-gpu-layers` > 0. |
+| `default` | Implementation of the vanilla deployment from source, plus the pre-installed `ExLlama` library from `turboderp/exllama` and CUDA GPU offloading enabled for `llama-cpp`. *This version is recommended for most users.* |
+| `triton` | Updated `GPTQ-for-llama` using the latest `triton` branch from `qwopqwop200/GPTQ-for-LLaMa`. Suitable for Linux only. *This version is accurate but a little slow.* |
+| `cuda` | Updated `GPTQ-for-llama` using the latest `cuda` branch from `qwopqwop200/GPTQ-for-LLaMa`. *This version is very slow!* |
+| `monkey-patch` | Use LoRAs in 4-bit `GPTQ-for-llama` mode. ***DEPRECATION WARNING:** This version is outdated, but will remain for now.* |
+| `llama-cublas` | CUDA GPU offloading enabled for `llama-cpp`. Use by setting option `n-gpu-layers` > 0. ***DEPRECATION WARNING:** This capability has been rolled into the `default` variant. The variant will be removed if the upstream dependency does not conflict with `default`.* |
 
-*See: [oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md) and [obabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md) for more information on variants.*
+*See: [oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md), [oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md), and [oobabooga/text-generation-webui/blob/main/docs/ExLlama.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md) for more information on variants.*
 
 ### Deploy
 Deploy the service:
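
Below is a minimal, illustrative `docker-compose.yml` sketch of how the variant `:tag` and the `n-gpu-layers` option described in the README changes fit together. The service name, image name (`atinoda/text-generation-webui`), port, and volume path are assumptions for illustration, and it assumes the image's entrypoint appends `EXTRA_LAUNCH_ARGS` (defined in the Dockerfile above) to the `server.py` launch command; adapt it to the compose file shipped with the project.

```yml
# Illustrative sketch only — service/image names, port, and paths are assumptions.
version: "3.8"
services:
  text-generation-webui:
    image: atinoda/text-generation-webui:default   # select the variant via the image :tag
    environment:
      # Assumes the entrypoint appends these flags to the server.py launch command;
      # n-gpu-layers > 0 enables llama-cpp CUDA GPU offloading per the README.
      - EXTRA_LAUNCH_ARGS=--listen --n-gpu-layers 20
    ports:
      - "7860:7860"            # default web UI port (assumed)
    volumes:
      - ./models:/app/models   # persist models on the host (illustrative path)
    deploy:
      resources:
        reservations:
          devices:             # requires the CUDA docker runtime (see prerequisites)
            - driver: nvidia
              count: all
              capabilities: [gpu]
```

Bring the service up with `docker compose up`, then confirm from the container logs that layers are actually being offloaded to the GPU.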