From faab710cbc3dee532b0eaa042b3f0122332e0e6a Mon Sep 17 00:00:00 2001
From: Atinoda <61033436+Atinoda@users.noreply.github.com>
Date: Mon, 18 Sep 2023 21:15:05 +0100
Subject: [PATCH] Add Exllamav2 to base image

---
 Dockerfile | 2 ++
 README.md  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 128b4d2..bd5e855 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -40,6 +40,8 @@ RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda /app/repos
 # Build and install default GPTQ ('quant_cuda')
 ARG TORCH_CUDA_ARCH_LIST="6.1;7.0;7.5;8.0;8.6+PTX"
 RUN cd /app/repositories/GPTQ-for-LLaMa/ && python3 setup_cuda.py install
+# Install exllamav2 and flash attention
+RUN pip install -U ninja exllamav2 && pip install flash-attn --no-build-isolation
 
 FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS base
 # Runtime pre-reqs
diff --git a/README.md b/README.md
index 06a0436..2437915 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ Each variant has the 'extras' incuded in `default` but has some changes made as
 
 | Variant | Description |
 |---|---|
-| `default` | Implementation of the vanilla deployment from source. Plus pre-installed `ExLlAMA` library from `turboderp/exllama`, and CUDA GPU offloading enabled for `llama-cpp`. *This version is recommended for most users.* |
+| `default` | Implementation of the vanilla deployment from source. Plus pre-installed `ExLlAMAV2` library from `turboderp/exllamav2`, and CUDA GPU offloading enabled for `llama-cpp`. *This version is recommended for most users.* |
 | `triton` | Updated `GPTQ-for-llama` using the latest `triton` branch from `qwopqwop200/GPTQ-for-LLaMa`. Suitable for Linux only. *This version is accurate but a little slow.* |
 | `cuda` | Updated `GPTQ-for-llama` using the latest `cuda` branch from `qwopqwop200/GPTQ-for-LLaMa`. *This version is very slow!* |
 | `llama-cpu` | GPU supported is REMOVED from `llama-cpp`. Suitable for systems without a CUDA-capable GPU. *This is only for when GPU acceleration is not available and is a slower way to run models!* |
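The new `RUN` layer above pulls `exllamav2` and `flash-attn` into the base image. A minimal, hypothetical smoke test (package names taken from the patch; this script is not part of the patch itself) that can be run inside the built container to confirm both libraries resolve:

```python
import importlib.util

# Package import names assumed from the pip install line in the patch:
# `exllamav2` installs as "exllamav2", `flash-attn` installs as "flash_attn".
for pkg in ("exllamav2", "flash_attn"):
    spec = importlib.util.find_spec(pkg)
    print(f"{pkg}: {'installed' if spec is not None else 'MISSING'}")
```

If either line reports `MISSING`, the wheel build likely failed during the image build (flash-attn in particular compiles CUDA extensions and needs `ninja`, which is why the patch installs it first).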