FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS env_base

# Pre-reqs
RUN apt-get update && apt-get install --no-install-recommends -y \
    git vim build-essential python3-dev python3-venv python3-pip

# Instantiate venv and pre-activate
RUN pip3 install virtualenv
RUN virtualenv /venv

# Credit, Itamar Turner-Trauring: https://pythonspeed.com/articles/activate-virtualenv-dockerfile/
ENV VIRTUAL_ENV=/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
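# With the venv's bin directory first on PATH, every later python3/pip3 call resolves inside /venv,
# which is effectively what "activating" a virtualenv does.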

RUN pip3 install --upgrade pip setuptools && \
    pip3 install torch torchvision torchaudio

FROM env_base AS app_base

### DEVELOPERS/ADVANCED USERS ###
# Clone oobabooga/text-generation-webui
RUN git clone https://github.com/oobabooga/text-generation-webui /src
# To build from local source instead: comment out the git clone command above, uncomment the two
# lines below, and point the build arg `LCL_SRC_DIR` at your checkout.
#ARG LCL_SRC_DIR="text-generation-webui"
#COPY ${LCL_SRC_DIR} /src
#################################
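# A local-source build might then look like this (hypothetical tag; adjust the path to your checkout):
#   docker build --build-arg LCL_SRC_DIR=./text-generation-webui --target default -t text-generation-webui:local .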

# Copy source to app
RUN cp -ar /src /app

# Install oobabooga/text-generation-webui
RUN --mount=type=cache,target=/root/.cache/pip pip3 install -r /app/requirements.txt

# Install extensions
COPY ./scripts/build_extensions.sh /scripts/build_extensions.sh
RUN --mount=type=cache,target=/root/.cache/pip \
    chmod +x /scripts/build_extensions.sh && . /scripts/build_extensions.sh
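# (build_extensions.sh ships in ./scripts alongside this Dockerfile; it is expected to pip-install
# each bundled extension's requirements so the extensions work out of the box.)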

# Clone default GPTQ
RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda /app/repositories/GPTQ-for-LLaMa

# Build and install default GPTQ ('quant_cuda')
ARG TORCH_CUDA_ARCH_LIST="6.1;7.0;7.5;8.0;8.6+PTX"
RUN cd /app/repositories/GPTQ-for-LLaMa/ && python3 setup_cuda.py install
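# The arch list above can be narrowed at build time to shorten the CUDA kernel compile, e.g.
# (hypothetical invocation, targeting only compute capability 8.6 cards):
#   docker build --build-arg TORCH_CUDA_ARCH_LIST="8.6" --target default .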

# Install auto-gptq
RUN cd /app/repositories/ && git clone https://github.com/PanQiWei/AutoGPTQ.git && \
    cd AutoGPTQ && pip3 install .

FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 AS base

# Runtime pre-reqs
RUN apt-get update && apt-get install --no-install-recommends -y \
    python3-venv python3-dev git

# Copy app and src
COPY --from=app_base /app /app
COPY --from=app_base /src /src

# Copy and activate venv
COPY --from=app_base /venv /venv
ENV VIRTUAL_ENV=/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Finalise app setup
WORKDIR /app
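# Ports used by text-generation-webui: 7860 web UI (Gradio), 5000 API extension, 5005 streaming API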
EXPOSE 7860
EXPOSE 5000
EXPOSE 5005

# Unbuffered stdout/stderr so Python print output shows up in the container logs immediately
ENV PYTHONUNBUFFERED=1

# Run
COPY ./scripts/docker-entrypoint.sh /scripts/docker-entrypoint.sh
RUN chmod +x /scripts/docker-entrypoint.sh
ENTRYPOINT ["/scripts/docker-entrypoint.sh"]
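# Example run, assuming the entrypoint forwards EXTRA_LAUNCH_ARGS to server.py
# (hypothetical tag and host paths; --gpus requires the NVIDIA Container Toolkit):
#   docker run --gpus all -p 7860:7860 -v "$(pwd)/models:/app/models" \
#       -e EXTRA_LAUNCH_ARGS="--listen --verbose" text-generation-webui:default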

# VARIANT BUILDS
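# Each variant below is selected at image build time with --target, e.g. (hypothetical tag):
#   docker build --target llama-cublas -t text-generation-webui:llama-cublas .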

FROM base AS cuda
RUN echo "CUDA" >> /variant.txt
RUN apt-get install --no-install-recommends -y git python3-dev python3-pip
RUN rm -rf /app/repositories/GPTQ-for-LLaMa && \
    git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b cuda /app/repositories/GPTQ-for-LLaMa
RUN pip3 uninstall -y quant-cuda && \
    pip3 install -r /app/repositories/GPTQ-for-LLaMa/requirements.txt
ENV EXTRA_LAUNCH_ARGS=""
CMD ["python3", "/app/server.py"]

FROM base AS triton
RUN echo "TRITON" >> /variant.txt
RUN apt-get install --no-install-recommends -y git python3-dev build-essential python3-pip
RUN rm -rf /app/repositories/GPTQ-for-LLaMa && \
    git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa -b triton /app/repositories/GPTQ-for-LLaMa
RUN pip3 uninstall -y quant-cuda && \
    pip3 install -r /app/repositories/GPTQ-for-LLaMa/requirements.txt
ENV EXTRA_LAUNCH_ARGS=""
CMD ["python3", "/app/server.py"]

FROM base AS llama-cublas
RUN echo "LLAMA-CUBLAS" >> /variant.txt
RUN apt-get install --no-install-recommends -y git python3-dev build-essential python3-pip
ENV LLAMA_CUBLAS=1
RUN pip uninstall -y llama-cpp-python && \
    CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
ENV EXTRA_LAUNCH_ARGS=""
CMD ["python3", "/app/server.py"]

FROM base AS monkey-patch
RUN echo "4-BIT MONKEY-PATCH" >> /variant.txt
RUN apt-get install --no-install-recommends -y git python3-dev build-essential python3-pip
RUN git clone https://github.com/johnsmith0031/alpaca_lora_4bit /app/repositories/alpaca_lora_4bit && \
    cd /app/repositories/alpaca_lora_4bit && git checkout 2f704b93c961bf202937b10aac9322b092afdce0
ARG TORCH_CUDA_ARCH_LIST="8.6"
RUN pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
ENV EXTRA_LAUNCH_ARGS=""
CMD ["python3", "/app/server.py", "--monkey-patch"]

FROM base AS default
RUN echo "DEFAULT" >> /variant.txt
ENV EXTRA_LAUNCH_ARGS=""
CMD ["python3", "/app/server.py"]