support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
commit d2a99d9bc6
parent a618ca5699

@ -97,7 +97,9 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk patchelf
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk patchelf cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Installing Qt
command: |
@ -121,6 +123,7 @@ jobs:
set -eo pipefail
export CMAKE_PREFIX_PATH=~/Qt/6.5.1/gcc_64/lib/cmake
export PATH=$PATH:$HOME/Qt/Tools/QtInstallerFramework/4.7/bin
export PATH=$PATH:/usr/local/cuda/bin
mkdir build
cd build
mkdir upload
@ -162,6 +165,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Build
command: |
@ -218,7 +226,9 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Installing Qt
command: |
@ -235,6 +245,7 @@ jobs:
name: Build
command: |
export CMAKE_PREFIX_PATH=~/Qt/6.5.1/gcc_64/lib/cmake
export PATH=$PATH:/usr/local/cuda/bin
~/Qt/Tools/CMake/bin/cmake -DCMAKE_BUILD_TYPE=Release -S gpt4all-chat -B build
~/Qt/Tools/CMake/bin/cmake --build build --target all
@ -269,6 +280,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Build
command: |
@ -394,12 +410,15 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
pip install setuptools wheel cmake
- run:
name: Build C library
command: |
export PATH=$PATH:/usr/local/cuda/bin
git submodule update --init --recursive
cd gpt4all-backend
cmake -B build
@ -459,6 +478,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command:
@ -530,11 +554,14 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Build Libraries
command: |
export PATH=$PATH:/usr/local/cuda/bin
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
@ -599,6 +626,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command: |
@ -642,6 +674,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command: |

@ -1,30 +0,0 @@
Software for Open Models License (SOM)
Version 1.0 dated August 30th, 2023
This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
1. Definitions
The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
A “Model” is the output of a machine learning algorithm, and excludes the Software.
“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
2. Grant of Rights. Subject to the conditions and limitations in section 3:
(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
3. Conditions and Limitations
(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms.
(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
(C) No Trademark License. This license does not grant you rights to use the Licensors name, logo, or trademarks.
(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.

@ -2,15 +2,23 @@ cmake_minimum_required(VERSION 3.16)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
if(BUILD_UNIVERSAL)
if (APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
else()
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
endif()
if (APPLE)
if (BUILD_UNIVERSAL)
# Build a Universal binary on macOS
# This requires that the found Qt library is compiled as Universal binaries.
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
else()
# Build for the host architecture on macOS
if(NOT CMAKE_OSX_ARCHITECTURES)
if (NOT CMAKE_OSX_ARCHITECTURES)
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
endif()
endif()
@ -39,11 +47,35 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()
set(DIRECTORY llama.cpp-mainline)
include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
set(BUILD_VARIANTS)
set(GPTJ_BUILD_VARIANT cpu)
if (APPLE)
list(APPEND BUILD_VARIANTS metal)
endif()
if (LLMODEL_KOMPUTE)
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
set(GPTJ_BUILD_VARIANT kompute)
else()
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
endif()
if (LLMODEL_VULKAN)
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
endif()
if (LLMODEL_CUDA)
include(CheckLanguage)
check_language(CUDA)
if (NOT CMAKE_CUDA_COMPILER)
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
endif()
enable_language(CUDA)
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
endif()
if (LLMODEL_ROCM)
enable_language(HIP)
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
@ -51,24 +83,34 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Determine flags
if (BUILD_VARIANT STREQUAL avxonly)
set(GPT4ALL_ALLOW_NON_AVX NO)
if (BUILD_VARIANT MATCHES avxonly)
set(GPT4ALL_ALLOW_NON_AVX OFF)
else()
set(GPT4ALL_ALLOW_NON_AVX YES)
set(GPT4ALL_ALLOW_NON_AVX ON)
endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
if (BUILD_VARIANT STREQUAL metal)
set(LLAMA_METAL YES)
else()
set(LLAMA_METAL NO)
set(LLAMA_METAL OFF)
set(LLAMA_KOMPUTE OFF)
set(LLAMA_VULKAN OFF)
set(LLAMA_CUDA OFF)
set(LLAMA_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(LLAMA_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(LLAMA_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(LLAMA_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(LLAMA_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(LLAMA_HIPBLAS ON)
endif()
# Include GGML
set(LLAMA_K_QUANTS YES)
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
include_ggml(-mainline-${BUILD_VARIANT})
# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@ -93,11 +135,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline)
if (NOT LLAMA_METAL)
if (BUILD_VARIANT MATCHES ${GPTJ_BUILD_VARIANT})
add_library(gptj-${BUILD_VARIANT} SHARED
gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(gptj llama-mainline)
endif()
if (BUILD_VARIANT STREQUAL cuda)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
endif()
endforeach()
add_library(llmodel

@ -1 +1 @@
Subproject commit a3f03b7e793ee611c4918235d4532ee535a9530d
Subproject commit 40bac11e427f2307305b86c322cb366bb95fcb8a

File diff suppressed because it is too large

@ -22,7 +22,11 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include <ggml-kompute.h>
# include <ggml-kompute.h>
#elif GGML_USE_VULKAN
# include <ggml-vulkan.h>
#elif GGML_USE_CUDA
# include <ggml-cuda.h>
#endif
using namespace std::string_literals;
@ -32,13 +36,44 @@ static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
// note: same order as LLM_ARCH_NAMES in llama.cpp
static const std::vector<const char *> KNOWN_ARCHES {
"baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion",
"persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
"llama",
"falcon",
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"phi2",
"phi3",
// "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669
"codeshell",
"orion",
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
"bert", "nomic-bert"
"bert", "nomic-bert",
};
static bool is_embedding_arch(const std::string &arch) {
@ -170,6 +205,7 @@ struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
@ -313,10 +349,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->backend_name = "cpu"; // default
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
}
#elif defined(GGML_USE_METAL)
(void)ngl;
@ -337,6 +374,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
if (!d_ptr->model) {
fflush(stdout);
d_ptr->device = -1;
d_ptr->deviceName.clear();
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
@ -379,19 +417,24 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
llama_free_model(d_ptr->model);
d_ptr->model = nullptr;
d_ptr->device = -1;
d_ptr->deviceName.clear();
return false;
}
d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
#ifdef GGML_USE_KOMPUTE
if (usingGPUDevice()) {
#ifdef GGML_USE_KOMPUTE
if (llama_verbose()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl;
}
d_ptr->backend_name = "kompute";
}
#elif defined(GGML_USE_VULKAN)
d_ptr->backend_name = "vulkan";
#elif defined(GGML_USE_CUDA)
d_ptr->backend_name = "cuda";
#endif
}
m_supportsEmbedding = isEmbedding;
m_supportsCompletion = !isEmbedding;
@ -452,7 +495,18 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
return llama_token_to_piece(d_ptr->ctx, id);
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
@ -517,34 +571,77 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
return get_arch_key_u32(modelPath, "block_count");
}
#ifdef GGML_USE_VULKAN
static const char *getVulkanVendorName(uint32_t vendorID) {
switch (vendorID) {
case 0x10DE: return "nvidia";
case 0x1002: return "amd";
case 0x8086: return "intel";
default: return "unknown";
}
}
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
{
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
size_t count = 0;
auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
if (vkDevices) {
#ifdef GGML_USE_KOMPUTE
auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count);
#elif defined(GGML_USE_VULKAN)
(void)memoryRequired; // hasn't been used since GGUF was added
auto *lcppDevices = ggml_vk_available_devices(&count);
#else // defined(GGML_USE_CUDA)
(void)memoryRequired;
auto *lcppDevices = ggml_cuda_available_devices(&count);
#endif
if (lcppDevices) {
std::vector<LLModel::GPUDevice> devices;
devices.reserve(count);
for (size_t i = 0; i < count; ++i) {
auto & dev = vkDevices[i];
auto & dev = lcppDevices[i];
devices.emplace_back(
#ifdef GGML_USE_KOMPUTE
/* backend = */ "kompute",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ dev.vendor
#elif defined(GGML_USE_VULKAN)
/* backend = */ "vulkan",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ getVulkanVendorName(dev.vendorID)
#else // defined(GGML_USE_CUDA)
/* backend = */ "cuda",
/* index = */ dev.index,
/* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ "nvidia"
#endif
);
#ifndef GGML_USE_CUDA
ggml_vk_device_destroy(&dev);
#else
ggml_cuda_device_destroy(&dev);
#endif
}
free(vkDevices);
free(lcppDevices);
return devices;
}
#else
(void)memoryRequired;
std::cerr << __func__ << ": built without Kompute\n";
std::cerr << __func__ << ": built without a GPU backend\n";
#endif
return {};
@ -552,11 +649,32 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
auto devices = availableGPUDevices(memoryRequired);
auto dev_it = devices.begin();
#ifndef GGML_USE_CUDA
if (name == "amd" || name == "nvidia" || name == "intel") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
} else
#endif
if (name != "gpu") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
}
if (dev_it < devices.end()) {
d_ptr->device = dev_it->index;
d_ptr->deviceName = dev_it->name;
return true;
}
return false;
#elif defined(GGML_USE_KOMPUTE)
ggml_vk_device device;
bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
if (ok) {
d_ptr->device = device.index;
d_ptr->deviceName = device.name;
ggml_vk_device_destroy(&device);
return true;
}
#else
@ -568,14 +686,17 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
(void)unavail_reason;
auto devices = availableGPUDevices();
auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; });
d_ptr->device = device;
d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)";
return true;
#else
(void)device;
if (unavail_reason) {
*unavail_reason = "built without Kompute";
*unavail_reason = "built without a GPU backend";
}
return false;
#endif
@ -583,7 +704,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
bool LLamaModel::hasGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->device != -1;
#else
return false;
@ -592,15 +713,20 @@ bool LLamaModel::hasGPUDevice() const
bool LLamaModel::usingGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
bool hasDevice;
#ifdef GGML_USE_KOMPUTE
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
assert(!hasDevice || ggml_vk_has_device());
return hasDevice;
#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
#elif defined(GGML_USE_METAL)
return true;
hasDevice = true;
#else
return false;
hasDevice = false;
#endif
return hasDevice;
}
const char *LLamaModel::backendName() const {
@ -608,11 +734,11 @@ const char *LLamaModel::backendName() const {
}
const char *LLamaModel::gpuDeviceName() const {
#if defined(GGML_USE_KOMPUTE)
if (usingGPUDevice()) {
return ggml_vk_current_device().name;
}
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->deviceName.c_str();
#endif
}
return nullptr;
}
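
The device-selection changes above can be exercised directly through the LLModel interface. A minimal sketch, not part of the diff: the model path is a placeholder, and it assumes the llmodel headers and library are available. initializeGPUDevice() accepts "gpu", a vendor name ("amd"/"nvidia"/"intel", Kompute only), or an exact device name.

```cpp
// Sketch only: backend-aware GPU selection as exposed by this change.
#include "llmodel.h"

#include <iostream>
#include <memory>

int main() {
    // "cuda" forces the new CUDA backend; "auto" would try the platform defaults.
    std::unique_ptr<LLModel> model(
        LLModel::Implementation::construct("model.gguf", "cuda")); // placeholder path

    // "gpu" picks the first device offered by the active backend.
    if (!model->initializeGPUDevice(/*memoryRequired*/ 0, "gpu")) {
        std::cerr << "no usable GPU for this backend\n";
        return 1;
    }

    if (model->loadModel("model.gguf", /*n_ctx*/ 2048, /*ngl*/ 100))
        std::cout << "loaded on " << model->gpuDeviceName()
                  << " via " << model->backendName() << '\n';
}
```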

@ -30,7 +30,7 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
bool hasGPUDevice() const override;

@ -12,12 +12,21 @@
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif
std::string s_implementations_search_path = ".";
#if !(defined(__x86_64__) || defined(_M_X64))
@ -86,11 +95,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
std::string impl_name_re = "(gptj|llamamodel-mainline)";
std::string impl_name_re = "(gptj|llamamodel-mainline)-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
} else {
impl_name_re += "-(default|metal)";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
@ -125,6 +132,13 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
return *libs;
}
static std::string applyCPUVariant(const std::string &buildVariant) {
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
bool buildVariantMatched = false;
std::optional<std::string> archName;
@ -142,110 +156,124 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
}
if (!buildVariantMatched)
throw MissingImplementationError("Could not find any implementations for build variant: " + buildVariant);
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
throw BadArchError(std::move(*archName));
}
LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
// Get correct implementation
const Implementation* impl = nullptr;
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
size_t total_mem = getSystemTotalRAMInBytes();
try {
impl = implementation(modelPath.c_str(), "metal");
} catch (const std::exception &e) {
// fall back to CPU
}
if(impl) {
LLModel* metalimpl = impl->m_construct();
metalimpl->m_implementation = impl;
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
float req_to_total = (float) req_mem / (float) total_mem;
LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
std::vector<std::string> desiredBackends;
if (backend != "auto") {
desiredBackends.push_back(backend);
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
if (impl) {
// Construct llmodel implementation
auto *fres = impl->m_construct();
fres->m_implementation = impl;
#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
if (backend == "auto" && desiredBackend == "metal") {
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
delete metalimpl;
impl = nullptr;
} else {
return metalimpl;
size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
delete fres;
continue;
}
}
#else
(void)n_ctx;
#endif
return fres;
}
#else
(void)n_ctx;
#endif
if (!impl) {
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (cpu_supports_avx2() == 0) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
impl = implementation(modelPath.c_str(), buildVariant);
}
// Construct and return llmodel implementation
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
LLModel *LLModel::Implementation::constructDefaultLlama() {
static std::unique_ptr<LLModel> llama([]() -> LLModel * {
const std::vector<LLModel::Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
const std::vector<Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
std::vector<std::string> desiredBackends;
if (backend) {
desiredBackends.push_back(backend.value());
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
const Implementation *impl = nullptr;
for (const auto &desiredBackend: desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached
const LLModel::Implementation *impl = nullptr;
for (const auto &i: *impls) {
if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
impl = &i;
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
impl = &i;
break;
}
}
if (!impl) {
std::cerr << __func__ << ": could not find llama.cpp implementation\n";
return nullptr;
if (impl) {
auto *fres = impl->m_construct();
fres->m_implementation = impl;
implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
return fres;
}
}
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
}());
return llama.get();
std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
return nullptr;
}
std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
auto *llama = constructDefaultLlama();
if (llama) { return llama->availableGPUDevices(memoryRequired); }
return {};
std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
static const std::string backends[] = {"kompute", "cuda"};
for (const auto &backend: backends) {
auto *llama = constructGlobalLlama(backend);
if (llama) {
auto backendDevs = llama->availableGPUDevices(memoryRequired);
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
}
}
#endif
return devices;
}
int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->maxContextLength(modelPath) : -1;
}
int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->layerCount(modelPath) : -1;
}
bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama && llama->isEmbeddingModel(modelPath);
}
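
As a usage sketch of the reworked construct() (illustrative, not from the diff): "auto" walks the platform's default backend list (Kompute then CPU on Linux/Windows, Metal then CPU on ARM64 macOS), while an explicit backend name fails fast when the matching implementation library is absent. The model path is a placeholder.

```cpp
#include "llmodel.h"

#include <iostream>
#include <memory>

int main() {
    try {
        // Accepted backends: "auto", "cpu", "metal", "kompute", or "cuda".
        std::unique_ptr<LLModel> model(
            LLModel::Implementation::construct("model.gguf", "cuda", /*n_ctx*/ 2048));
        std::cout << "build variant: "
                  << model->implementation().buildVariant() << '\n'; // e.g. "cuda" or "cuda-avxonly"
    } catch (const LLModel::MissingImplementationError &e) {
        // Raised when no library matches the requested backend, e.g. a build
        // configured with -DLLMODEL_CUDA=OFF.
        std::cerr << e.what() << '\n';
        return 1;
    }
}
```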

@ -1,6 +1,7 @@
#ifndef LLMODEL_H
#define LLMODEL_H
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
@ -8,8 +9,11 @@
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
using namespace std::string_literals;
#define LLMODEL_MAX_PROMPT_BATCH 128
class Dlhandle;
@ -41,14 +45,35 @@ public:
};
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;
GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}
std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != m_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}
private:
static inline const std::unordered_map<std::string, std::string> m_backendNames {
{"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};
class Implementation {
@ -60,7 +85,7 @@ public:
std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
@ -76,7 +101,7 @@ public:
static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static LLModel *constructDefaultLlama();
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);
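
A short sketch of the new GPUDevice naming helpers declared above (the device values are made up for illustration):

```cpp
#include "llmodel.h"

#include <iostream>

int main() {
    // Real entries come from LLModel::Implementation::availableGPUDevices(),
    // which now reports devices from both the Kompute and CUDA backends.
    LLModel::GPUDevice dev("cuda", /*index*/ 0, /*type*/ 2, /*heapSize*/ size_t(8) << 30,
                           "NVIDIA GeForce RTX 4090", "nvidia");

    std::cout << dev.selectionName() << '\n'; // "CUDA: NVIDIA GeForce RTX 4090"
    std::cout << dev.reportedName()  << '\n'; // "NVIDIA GeForce RTX 4090 (CUDA)"

    // Device names saved by older releases carried no backend prefix; they are
    // migrated on the assumption that they referred to Vulkan (Kompute) devices.
    std::cout << LLModel::GPUDevice::updateSelectionName("AMD Radeon RX 6800")
              << '\n'; // "Vulkan: AMD Radeon RX 6800"
}
```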

@ -31,10 +31,10 @@ static void llmodel_set_error(const char **errptr, const char *message) {
}
}
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error) {
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
LLModel *llModel;
try {
llModel = LLModel::Implementation::construct(model_path, build_variant);
llModel = LLModel::Implementation::construct(model_path, backend);
} catch (const std::exception& e) {
llmodel_set_error(error, e.what());
return nullptr;
@ -248,6 +248,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
for (unsigned i = 0; i < devices.size(); i++) {
const auto &dev = devices[i];
auto &cdev = c_devices[i];
cdev.backend = dev.backend;
cdev.index = dev.index;
cdev.type = dev.type;
cdev.heapSize = dev.heapSize;

@ -48,6 +48,7 @@ struct llmodel_prompt_context {
};
struct llmodel_gpu_device {
const char * backend;
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
@ -86,7 +87,7 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
* Embedding cancellation callback for use with llmodel_embed.
* @param batch_sizes The number of tokens in each batch that will be embedded.
* @param n_batch The number of batches that will be embedded.
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
* @return True to cancel llmodel_embed, false to continue.
*/
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
@ -103,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
* Create a llmodel instance.
* Recognises correct model type from file at model_path
* @param model_path A string representing the path to the model file; will only be used to detect model type.
* @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
* @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
* @param error A pointer to a string; will only be set on error.
* @return A pointer to the llmodel_model instance; NULL on error.
*/
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error);
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
/**
* Destroy a llmodel instance.
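
For the C API, the second argument of llmodel_model_create2() is now a backend name rather than a build variant. A hedged sketch (placeholder model path; the load and prompt calls are elided and shown only as a comment):

```cpp
#include "llmodel_c.h"

#include <cstdio>

int main() {
    const char *error = nullptr;
    // One of "auto", "cpu", "metal", "kompute", or "cuda".
    llmodel_model model = llmodel_model_create2("model.gguf", "cuda", &error);
    if (model == nullptr) {
        std::fprintf(stderr, "create failed: %s\n", error);
        return 1;
    }
    // ... load the model and prompt it as before ...
    llmodel_model_destroy(model);
    return 0;
}
```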

@ -23,9 +23,9 @@ As an alternative to downloading via pip, you may build the Python bindings from
### Prerequisites
On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
macOS users do not need Vulkan, as GPT4All will use Metal instead.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
### Building the python bindings

@ -71,6 +71,7 @@ class LLModelPromptContext(ctypes.Structure):
class LLModelGPUDevice(ctypes.Structure):
_fields_ = [
("backend", ctypes.c_char_p),
("index", ctypes.c_int32),
("type", ctypes.c_int32),
("heapSize", ctypes.c_size_t),
@ -200,9 +201,11 @@ class LLModel:
Maximum size of context window
ngl : int
Number of GPU layers to use (Vulkan)
backend : str
Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
"""
def __init__(self, model_path: str, n_ctx: int, ngl: int):
def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
self.model_path = model_path.encode()
self.n_ctx = n_ctx
self.ngl = ngl
@ -212,7 +215,7 @@ class LLModel:
# Construct a model implementation
err = ctypes.c_char_p()
model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
if model is None:
s = err.value
raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
@ -231,7 +234,7 @@ class LLModel:
raise ValueError("Attempted operation on a closed LLModel")
@property
def backend(self) -> Literal["cpu", "kompute", "metal"]:
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
if self.model is None:
self._raise_closed()
return llmodel.llmodel_model_backend_name(self.model).decode()
@ -258,7 +261,7 @@ class LLModel:
devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
if not devices_ptr:
raise ValueError("Unable to retrieve available GPU devices")
return [d.name.decode() for d in devices_ptr[:num_devices.value]]
return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
def init_gpu(self, device: str):
if self.model is None:

@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import os
import platform
import re
import sys
import time
@ -44,7 +45,7 @@ class Embed4All:
MIN_DIMENSIONALITY = 64
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
"""
Constructor
@ -172,7 +173,7 @@ class GPT4All:
model_type: str | None = None,
allow_download: bool = True,
n_threads: int | None = None,
device: str | None = "cpu",
device: str | None = None,
n_ctx: int = 2048,
ngl: int = 100,
verbose: bool = False,
@ -190,30 +191,56 @@ class GPT4All:
n_threads: number of CPU threads used by GPT4All. Default is None, in which case the number of threads is determined automatically.
device: The processing unit on which the GPT4All model will run. It can be set to:
- "cpu": Model will run on the central processing unit.
- "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
- "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
- "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute".
- "kompute": Use the best GPU provided by the Kompute backend.
- "cuda": Use the best GPU provided by the CUDA backend.
- "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
- A specific device name from the list returned by `GPT4All.list_gpus()`.
Default is "cpu".
Default is Metal on ARM64 macOS, "cpu" otherwise.
Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
n_ctx: Maximum size of context window
ngl: Number of GPU layers to use (Vulkan)
verbose: If True, print debug messages.
"""
self.model_type = model_type
self._history: list[MessageType] | None = None
self._current_prompt_template: str = "{0}"
device_init = None
if sys.platform == 'darwin':
if device is None:
backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback
elif device == 'cpu':
backend = 'cpu'
else:
if platform.machine() != 'arm64' or device != 'gpu':
raise ValueError(f'Unknown device for this platform: {device}')
backend = 'metal'
else:
backend = 'kompute'
if device is None or device == 'cpu':
pass # use kompute with no device
elif device in ('cuda', 'kompute'):
backend = device
device_init = 'gpu'
elif device.startswith('cuda:'):
backend = 'cuda'
device_init = device.removeprefix('cuda:')
else:
device_init = device.removeprefix('kompute:')
# Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
self.model = LLModel(self.config["path"], n_ctx, ngl)
if device is not None and device != "cpu":
self.model.init_gpu(device)
self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
if device_init is not None:
self.model.init_gpu(device_init)
self.model.load_model()
# Set n_threads
if n_threads is not None:
self.model.set_thread_count(n_threads)
self._history: list[MessageType] | None = None
self._current_prompt_template: str = "{0}"
def __enter__(self) -> Self:
return self
@ -227,13 +254,13 @@ class GPT4All:
self.model.close()
@property
def backend(self) -> Literal["cpu", "kompute", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
return self.model.backend
@property
def device(self) -> str | None:
"""The name of the GPU device currently in use, or None for backends other than Kompute."""
"""The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
return self.model.device
@property

@ -45,7 +45,7 @@ def copy_prebuilt_C_lib(src_dir, dest_dir, dest_build_dir):
d = os.path.join(dest_dir, item)
shutil.copy2(s, d)
files_copied += 1
if item.endswith(lib_ext) or item.endswith('.metal'):
if item.endswith(lib_ext) or item.endswith('.metallib'):
s = os.path.join(dirpath, item)
d = os.path.join(dest_build_dir, item)
shutil.copy2(s, d)
@ -68,7 +68,7 @@ def get_long_description():
setup(
name=package_name,
version="2.6.0",
version="2.7.0",
description="Python bindings for GPT4All",
long_description=get_long_description(),
long_description_content_type="text/markdown",

@ -17,8 +17,8 @@ if(APPLE)
endif()
set(APP_VERSION_MAJOR 2)
set(APP_VERSION_MINOR 7)
set(APP_VERSION_PATCH 6)
set(APP_VERSION_MINOR 8)
set(APP_VERSION_PATCH 0)
set(APP_VERSION "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
# Include the binary directory for the generated header file
@ -65,7 +65,7 @@ add_subdirectory(../gpt4all-backend llmodel)
set(METAL_SHADER_FILE)
if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
set(METAL_SHADER_FILE ../gpt4all-backend/llama.cpp-mainline/ggml-metal.metal)
set(METAL_SHADER_FILE ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
endif()
set(APP_ICON_RESOURCE)
@ -185,7 +185,6 @@ if(METAL_SHADER_FILE)
set_target_properties(chat PROPERTIES
RESOURCE ${METAL_SHADER_FILE}
)
configure_file(${METAL_SHADER_FILE} bin/ggml-metal.metal COPYONLY)
endif()
target_compile_definitions(chat
@ -207,18 +206,61 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
endif()
install(TARGETS chat DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llmodel DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(
TARGETS llmodel
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
# We should probably iterate through the list of the cmake for backend, but these need to be installed
# to this component's dir for the finicky qt installer to work
install(TARGETS gptj-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS gptj-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
if(APPLE)
install(TARGETS llamamodel-mainline-metal DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
if (LLMODEL_KOMPUTE)
set(MODEL_IMPL_TARGETS
llamamodel-mainline-kompute
llamamodel-mainline-kompute-avxonly
gptj-kompute
gptj-kompute-avxonly
)
else()
set(MODEL_IMPL_TARGETS
llamamodel-mainline-cpu
llamamodel-mainline-cpu-avxonly
gptj-cpu
gptj-cpu-avxonly
)
endif()
if (APPLE)
list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
endif()
install(
TARGETS ${MODEL_IMPL_TARGETS}
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
if (LLMODEL_CUDA)
set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
install(
TARGETS llamamodel-mainline-cuda
llamamodel-mainline-cuda-avxonly
RUNTIME_DEPENDENCY_SET llama-cuda-deps
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
if (WIN32)
install(
RUNTIME_DEPENDENCY_SET llama-cuda-deps
PRE_EXCLUDE_REGEXES "^(nvcuda|api-ms-.*)\\.dll$"
POST_INCLUDE_REGEXES "(^|[/\\\\])(lib)?(cuda|cublas)" POST_EXCLUDE_REGEXES .
DIRECTORIES "${CUDAToolkit_BIN_DIR}"
DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}
)
endif()
endif()
set(CPACK_GENERATOR "IFW")

@ -6,9 +6,9 @@ gpt4all-chat from source.
## Prerequisites
On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
macOS users do not need Vulkan, as GPT4All will use Metal instead.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
## Note for Linux users

@ -143,7 +143,7 @@ void ChatLLM::handleThreadStarted()
void ChatLLM::handleForceMetalChanged(bool forceMetal)
{
#if defined(Q_OS_MAC) && defined(__arm__)
#if defined(Q_OS_MAC) && defined(__aarch64__)
m_forceMetal = forceMetal;
if (isModelLoaded() && m_shouldBeLoaded) {
m_reloadingToChangeVariant = true;
@ -324,19 +324,29 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
QElapsedTimer modelLoadTimer;
modelLoadTimer.start();
auto requestedDevice = MySettings::globalInstance()->device();
auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
m_ctx.n_ctx = n_ctx;
auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);
std::string buildVariant = "auto";
#if defined(Q_OS_MAC) && defined(__arm__)
if (m_forceMetal)
buildVariant = "metal";
std::string backend = "auto";
#ifdef Q_OS_MAC
if (requestedDevice == "CPU") {
backend = "cpu";
} else if (m_forceMetal) {
#ifdef __aarch64__
backend = "metal";
#endif
}
#else // !defined(Q_OS_MAC)
if (requestedDevice.startsWith("CUDA: "))
backend = "cuda";
#endif
QString constructError;
m_llModelInfo.model.reset();
try {
auto *model = LLModel::Implementation::construct(filePath.toStdString(), buildVariant, n_ctx);
auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
m_llModelInfo.model.reset(model);
} catch (const LLModel::MissingImplementationError &e) {
modelLoadProps.insert("error", "missing_model_impl");
@ -378,6 +388,8 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
{
const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
// Pick the best device
// NB: relies on the fact that Kompute devices are listed first
if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) {
defaultDevice = &availableDevices.front();
float memGB = defaultDevice->heapSize / float(1024 * 1024 * 1024);
@ -387,16 +399,18 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
}
}
const QString requestedDevice = MySettings::globalInstance()->device();
bool isMetal = m_llModelInfo.model->implementation().buildVariant() == "metal";
QString actualDevice("CPU");
// Pick the best match for the device
QString actualDevice = isMetal ? "Metal" : "CPU";
if (!isMetal && requestedDevice != "CPU") {
#if defined(Q_OS_MAC) && defined(__aarch64__)
if (m_llModelInfo.model->implementation().buildVariant() == "metal")
actualDevice = "Metal";
#else
if (requestedDevice != "CPU") {
const auto *device = defaultDevice;
if (requestedDevice != "Auto") {
// Use the selected device
for (const LLModel::GPUDevice &d : availableDevices) {
if (QString::fromStdString(d.name) == requestedDevice) {
if (QString::fromStdString(d.selectionName()) == requestedDevice) {
device = &d;
break;
}
@ -409,14 +423,14 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
} else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
} else {
actualDevice = QString::fromStdString(device->name);
actualDevice = QString::fromStdString(device->reportedName());
modelLoadProps.insert("requested_device_mem", approxDeviceMemGB(device));
}
}
#endif
// Report which device we're actually using
emit reportDevice(actualDevice);
bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
if (!m_shouldBeLoaded) {
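
Condensed into a standalone sketch (not the exact chat code): the device string stored in settings now decides which backend is passed to LLModel::Implementation::construct(), with a specific GPU still selected afterwards via initializeGPUDevice().

```cpp
#include <string>

// "CUDA: " is the prefix GPUDevice::selectionName() produces for CUDA devices;
// Vulkan (Kompute) devices fall through to the "auto" backend.
std::string backendForRequestedDevice(const std::string &requestedDevice,
                                      bool isMac, bool isAppleSilicon, bool forceMetal)
{
    std::string backend = "auto";
    if (isMac) {
        if (requestedDevice == "CPU")
            backend = "cpu";
        else if (forceMetal && isAppleSilicon)
            backend = "metal";
    } else if (requestedDevice.rfind("CUDA: ", 0) == 0) { // starts_with
        backend = "cuda";
    }
    return backend;
}
```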

@ -5,10 +5,7 @@ set(DATA_DIR ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN
set(BIN_DIR ${DATA_DIR}/bin)
set(Qt6_ROOT_DIR "@Qt6_ROOT_DIR@")
set(ENV{LD_LIBRARY_PATH} "${BIN_DIR}:${Qt6_ROOT_DIR}/../lib/")
execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2 -exclude-libs=libcuda.so.1)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
DESTINATION ${DATA_DIR})
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"

@ -4,14 +4,11 @@ set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2)
file(GLOB MYGPTJLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libgptj*)
file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*)
file(GLOB MYBERTLLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libbert*)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*)
file(COPY ${MYGPTJLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLAMALIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYBERTLLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"

@ -2,9 +2,6 @@ set(WINDEPLOYQT "@WINDEPLOYQT@")
set(COMPONENT_NAME_MAIN "@COMPONENT_NAME_MAIN@")
set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
execute_process(COMMAND ${WINDEPLOYQT} --qmldir ${CMAKE_CURRENT_SOURCE_DIR} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"

@ -65,10 +65,14 @@ MySettings::MySettings()
{
QSettings::setDefaultFormat(QSettings::IniFormat);
std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
QVector<QString> deviceList{ "Auto" };
#if defined(Q_OS_MAC) && defined(__aarch64__)
deviceList << "Metal";
#else
std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
for (LLModel::GPUDevice &d : devices)
deviceList << QString::fromStdString(d.name);
deviceList << QString::fromStdString(d.selectionName());
#endif
deviceList << "CPU";
setDeviceList(deviceList);
}
@ -786,7 +790,23 @@ QString MySettings::device() const
{
QSettings setting;
setting.sync();
return setting.value("device", default_device).toString();
auto value = setting.value("device");
if (!value.isValid())
return default_device;
auto device = value.toString();
if (!device.isEmpty()) {
auto deviceStr = device.toStdString();
auto newNameStr = LLModel::GPUDevice::updateSelectionName(deviceStr);
if (newNameStr != deviceStr) {
auto newName = QString::fromStdString(newNameStr);
qWarning() << "updating device name:" << device << "->" << newName;
device = newName;
setting.setValue("device", device);
setting.sync();
}
}
return device;
}
void MySettings::setDevice(const QString &u)
