support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
commit d2a99d9bc6
parent a618ca5699

@ -97,7 +97,9 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk patchelf
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk patchelf cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Installing Qt
command: |
@ -121,6 +123,7 @@ jobs:
set -eo pipefail
export CMAKE_PREFIX_PATH=~/Qt/6.5.1/gcc_64/lib/cmake
export PATH=$PATH:$HOME/Qt/Tools/QtInstallerFramework/4.7/bin
export PATH=$PATH:/usr/local/cuda/bin
mkdir build
cd build
mkdir upload
@ -162,6 +165,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Build
command: |
@ -218,7 +226,9 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Installing Qt
command: |
@ -235,6 +245,7 @@ jobs:
name: Build
command: |
export CMAKE_PREFIX_PATH=~/Qt/6.5.1/gcc_64/lib/cmake
export PATH=$PATH:/usr/local/cuda/bin
~/Qt/Tools/CMake/bin/cmake -DCMAKE_BUILD_TYPE=Release -S gpt4all-chat -B build
~/Qt/Tools/CMake/bin/cmake --build build --target all
@ -269,6 +280,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Build
command: |
@ -394,12 +410,15 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
pip install setuptools wheel cmake
- run:
name: Build C library
command: |
export PATH=$PATH:/usr/local/cuda/bin
git submodule update --init --recursive
cd gpt4all-backend
cmake -B build
@ -459,6 +478,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command:
@ -530,11 +554,14 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Build Libraries
command: |
export PATH=$PATH:/usr/local/cuda/bin
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
@ -599,6 +626,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command: |
@ -642,6 +674,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command: |

@ -1,30 +0,0 @@
Software for Open Models License (SOM)
Version 1.0 dated August 30th, 2023
This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
1. Definitions
The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
A “Model” is the output of a machine learning algorithm, and excludes the Software.
“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
2. Grant of Rights. Subject to the conditions and limitations in section 3:
(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
3. Conditions and Limitations
(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms.
(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
(C) No Trademark License. This license does not grant you rights to use the Licensors name, logo, or trademarks.
(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.

@ -2,15 +2,23 @@ cmake_minimum_required(VERSION 3.16)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
if(BUILD_UNIVERSAL)
if (APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
else()
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
endif()
if (APPLE)
if (BUILD_UNIVERSAL)
# Build a Universal binary on macOS
# This requires that the found Qt library is compiled as Universal binaries.
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
else()
# Build for the host architecture on macOS
if(NOT CMAKE_OSX_ARCHITECTURES)
if (NOT CMAKE_OSX_ARCHITECTURES)
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
endif()
endif()
@ -39,11 +47,35 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()
set(DIRECTORY llama.cpp-mainline)
include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
set(BUILD_VARIANTS)
set(GPTJ_BUILD_VARIANT cpu)
if (APPLE)
list(APPEND BUILD_VARIANTS metal)
endif()
if (LLMODEL_KOMPUTE)
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
set(GPTJ_BUILD_VARIANT kompute)
else()
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
endif()
if (LLMODEL_VULKAN)
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
endif()
if (LLMODEL_CUDA)
include(CheckLanguage)
check_language(CUDA)
if (NOT CMAKE_CUDA_COMPILER)
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
endif()
enable_language(CUDA)
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
endif()
if (LLMODEL_ROCM)
enable_language(HIP)
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
@ -51,24 +83,34 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Determine flags
if (BUILD_VARIANT STREQUAL avxonly)
set(GPT4ALL_ALLOW_NON_AVX NO)
if (BUILD_VARIANT MATCHES avxonly)
set(GPT4ALL_ALLOW_NON_AVX OFF)
else()
set(GPT4ALL_ALLOW_NON_AVX YES)
set(GPT4ALL_ALLOW_NON_AVX ON)
endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
if (BUILD_VARIANT STREQUAL metal)
set(LLAMA_METAL YES)
else()
set(LLAMA_METAL NO)
set(LLAMA_METAL OFF)
set(LLAMA_KOMPUTE OFF)
set(LLAMA_VULKAN OFF)
set(LLAMA_CUDA OFF)
set(LLAMA_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(LLAMA_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(LLAMA_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(LLAMA_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(LLAMA_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(LLAMA_HIPBLAS ON)
endif()
# Include GGML
set(LLAMA_K_QUANTS YES)
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
include_ggml(-mainline-${BUILD_VARIANT})
# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@ -93,11 +135,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline)
if (NOT LLAMA_METAL)
if (BUILD_VARIANT MATCHES ${GPTJ_BUILD_VARIANT})
add_library(gptj-${BUILD_VARIANT} SHARED
gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(gptj llama-mainline)
endif()
if (BUILD_VARIANT STREQUAL cuda)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
endif()
endforeach()
add_library(llmodel

@ -1 +1 @@
Subproject commit a3f03b7e793ee611c4918235d4532ee535a9530d
Subproject commit 40bac11e427f2307305b86c322cb366bb95fcb8a

File diff suppressed because it is too large

@ -22,7 +22,11 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include <ggml-kompute.h>
# include <ggml-kompute.h>
#elif GGML_USE_VULKAN
# include <ggml-vulkan.h>
#elif GGML_USE_CUDA
# include <ggml-cuda.h>
#endif
using namespace std::string_literals;
@ -32,13 +36,44 @@ static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
// note: same order as LLM_ARCH_NAMES in llama.cpp
static const std::vector<const char *> KNOWN_ARCHES {
"baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion",
"persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
"llama",
"falcon",
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"phi2",
"phi3",
// "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669
"codeshell",
"orion",
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
"bert", "nomic-bert"
"bert", "nomic-bert",
};
static bool is_embedding_arch(const std::string &arch) {
@ -170,6 +205,7 @@ struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
@ -313,10 +349,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->backend_name = "cpu"; // default
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
}
#elif defined(GGML_USE_METAL)
(void)ngl;
@ -337,6 +374,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
if (!d_ptr->model) {
fflush(stdout);
d_ptr->device = -1;
d_ptr->deviceName.clear();
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
@ -379,19 +417,24 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
llama_free_model(d_ptr->model);
d_ptr->model = nullptr;
d_ptr->device = -1;
d_ptr->deviceName.clear();
return false;
}
d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
#ifdef GGML_USE_KOMPUTE
if (usingGPUDevice()) {
#ifdef GGML_USE_KOMPUTE
if (llama_verbose()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl;
}
d_ptr->backend_name = "kompute";
}
#elif defined(GGML_USE_VULKAN)
d_ptr->backend_name = "vulkan";
#elif defined(GGML_USE_CUDA)
d_ptr->backend_name = "cuda";
#endif
}
m_supportsEmbedding = isEmbedding;
m_supportsCompletion = !isEmbedding;
@ -452,7 +495,18 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
return llama_token_to_piece(d_ptr->ctx, id);
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
@ -517,34 +571,77 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
return get_arch_key_u32(modelPath, "block_count");
}
#ifdef GGML_USE_VULKAN
static const char *getVulkanVendorName(uint32_t vendorID) {
switch (vendorID) {
case 0x10DE: return "nvidia";
case 0x1002: return "amd";
case 0x8086: return "intel";
default: return "unknown";
}
}
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
{
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
size_t count = 0;
auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
if (vkDevices) {
#ifdef GGML_USE_KOMPUTE
auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count);
#elif defined(GGML_USE_VULKAN)
(void)memoryRequired; // hasn't been used since GGUF was added
auto *lcppDevices = ggml_vk_available_devices(&count);
#else // defined(GGML_USE_CUDA)
(void)memoryRequired;
auto *lcppDevices = ggml_cuda_available_devices(&count);
#endif
if (lcppDevices) {
std::vector<LLModel::GPUDevice> devices;
devices.reserve(count);
for (size_t i = 0; i < count; ++i) {
auto & dev = vkDevices[i];
auto & dev = lcppDevices[i];
devices.emplace_back(
#ifdef GGML_USE_KOMPUTE
/* backend = */ "kompute",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ dev.vendor
#elif defined(GGML_USE_VULKAN)
/* backend = */ "vulkan",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ getVulkanVendorName(dev.vendorID)
#else // defined(GGML_USE_CUDA)
/* backend = */ "cuda",
/* index = */ dev.index,
/* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ "nvidia"
#endif
);
#ifndef GGML_USE_CUDA
ggml_vk_device_destroy(&dev);
#else
ggml_cuda_device_destroy(&dev);
#endif
}
free(vkDevices);
free(lcppDevices);
return devices;
}
#else
(void)memoryRequired;
std::cerr << __func__ << ": built without Kompute\n";
std::cerr << __func__ << ": built without a GPU backend\n";
#endif
return {};
@ -552,11 +649,32 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
auto devices = availableGPUDevices(memoryRequired);
auto dev_it = devices.begin();
#ifndef GGML_USE_CUDA
if (name == "amd" || name == "nvidia" || name == "intel") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
} else
#endif
if (name != "gpu") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
}
if (dev_it < devices.end()) {
d_ptr->device = dev_it->index;
d_ptr->deviceName = dev_it->name;
return true;
}
return false;
#elif defined(GGML_USE_KOMPUTE)
ggml_vk_device device;
bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
if (ok) {
d_ptr->device = device.index;
d_ptr->deviceName = device.name;
ggml_vk_device_destroy(&device);
return true;
}
#else
@ -568,14 +686,17 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
(void)unavail_reason;
auto devices = availableGPUDevices();
auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; });
d_ptr->device = device;
d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)";
return true;
#else
(void)device;
if (unavail_reason) {
*unavail_reason = "built without Kompute";
*unavail_reason = "built without a GPU backend";
}
return false;
#endif
@ -583,7 +704,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
bool LLamaModel::hasGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->device != -1;
#else
return false;
@ -592,15 +713,20 @@ bool LLamaModel::hasGPUDevice() const
bool LLamaModel::usingGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
bool hasDevice;
#ifdef GGML_USE_KOMPUTE
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
assert(!hasDevice || ggml_vk_has_device());
return hasDevice;
#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
#elif defined(GGML_USE_METAL)
return true;
hasDevice = true;
#else
return false;
hasDevice = false;
#endif
return hasDevice;
}
const char *LLamaModel::backendName() const {
@ -608,11 +734,11 @@ const char *LLamaModel::backendName() const {
}
const char *LLamaModel::gpuDeviceName() const {
#if defined(GGML_USE_KOMPUTE)
if (usingGPUDevice()) {
return ggml_vk_current_device().name;
}
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->deviceName.c_str();
#endif
}
return nullptr;
}
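
The device-selection changes above can be exercised directly through the LLModel interface. A minimal sketch, not part of the diff: the model path is a placeholder, and it assumes the llmodel headers and library are available. initializeGPUDevice() accepts "gpu", a vendor name ("amd"/"nvidia"/"intel", Kompute only), or an exact device name.

```cpp
// Sketch only: backend-aware GPU selection as exposed by this change.
#include "llmodel.h"

#include <iostream>
#include <memory>

int main() {
    // "cuda" forces the new CUDA backend; "auto" would try the platform defaults.
    std::unique_ptr<LLModel> model(
        LLModel::Implementation::construct("model.gguf", "cuda")); // placeholder path

    // "gpu" picks the first device offered by the active backend.
    if (!model->initializeGPUDevice(/*memoryRequired*/ 0, "gpu")) {
        std::cerr << "no usable GPU for this backend\n";
        return 1;
    }

    if (model->loadModel("model.gguf", /*n_ctx*/ 2048, /*ngl*/ 100))
        std::cout << "loaded on " << model->gpuDeviceName()
                  << " via " << model->backendName() << '\n';
}
```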

@ -30,7 +30,7 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
bool hasGPUDevice() const override;

@ -12,12 +12,21 @@
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif
std::string s_implementations_search_path = ".";
#if !(defined(__x86_64__) || defined(_M_X64))
@ -86,11 +95,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
std::string impl_name_re = "(gptj|llamamodel-mainline)";
std::string impl_name_re = "(gptj|llamamodel-mainline)-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
} else {
impl_name_re += "-(default|metal)";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
@ -125,6 +132,13 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
return *libs;
}
static std::string applyCPUVariant(const std::string &buildVariant) {
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
bool buildVariantMatched = false;
std::optional<std::string> archName;
@ -142,110 +156,124 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
}
if (!buildVariantMatched)
throw MissingImplementationError("Could not find any implementations for build variant: " + buildVariant);
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
throw BadArchError(std::move(*archName));
}
LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
// Get correct implementation
const Implementation* impl = nullptr;
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
size_t total_mem = getSystemTotalRAMInBytes();
try {
impl = implementation(modelPath.c_str(), "metal");
} catch (const std::exception &e) {
// fall back to CPU
}
if(impl) {
LLModel* metalimpl = impl->m_construct();
metalimpl->m_implementation = impl;
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
float req_to_total = (float) req_mem / (float) total_mem;
LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
std::vector<std::string> desiredBackends;
if (backend != "auto") {
desiredBackends.push_back(backend);
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
if (impl) {
// Construct llmodel implementation
auto *fres = impl->m_construct();
fres->m_implementation = impl;
#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
if (backend == "auto" && desiredBackend == "metal") {
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
delete metalimpl;
impl = nullptr;
} else {
return metalimpl;
size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
delete fres;
continue;
}
}
#else
(void)n_ctx;
#endif
return fres;
}
#else
(void)n_ctx;
#endif
if (!impl) {
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (cpu_supports_avx2() == 0) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
impl = implementation(modelPath.c_str(), buildVariant);
}
// Construct and return llmodel implementation
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
LLModel *LLModel::Implementation::constructDefaultLlama() {
static std::unique_ptr<LLModel> llama([]() -> LLModel * {
const std::vector<LLModel::Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
const std::vector<Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
std::vector<std::string> desiredBackends;
if (backend) {
desiredBackends.push_back(backend.value());
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
const Implementation *impl = nullptr;
for (const auto &desiredBackend: desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached
const LLModel::Implementation *impl = nullptr;
for (const auto &i: *impls) {
if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
impl = &i;
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
impl = &i;
break;
}
}
if (!impl) {
std::cerr << __func__ << ": could not find llama.cpp implementation\n";
return nullptr;
if (impl) {
auto *fres = impl->m_construct();
fres->m_implementation = impl;
implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
return fres;
}
}
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
}());
return llama.get();
std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
return nullptr;
}
std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
auto *llama = constructDefaultLlama();
if (llama) { return llama->availableGPUDevices(memoryRequired); }
return {};
std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
static const std::string backends[] = {"kompute", "cuda"};
for (const auto &backend: backends) {
auto *llama = constructGlobalLlama(backend);
if (llama) {
auto backendDevs = llama->availableGPUDevices(memoryRequired);
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
}
}
#endif
return devices;
}
int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->maxContextLength(modelPath) : -1;
}
int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->layerCount(modelPath) : -1;
}
bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama && llama->isEmbeddingModel(modelPath);
}
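
As a usage sketch of the reworked construct() (illustrative, not from the diff): "auto" walks the platform's default backend list (Kompute then CPU on Linux/Windows, Metal then CPU on ARM64 macOS), while an explicit backend name fails fast when the matching implementation library is absent. The model path is a placeholder.

```cpp
#include "llmodel.h"

#include <iostream>
#include <memory>

int main() {
    try {
        // Accepted backends: "auto", "cpu", "metal", "kompute", or "cuda".
        std::unique_ptr<LLModel> model(
            LLModel::Implementation::construct("model.gguf", "cuda", /*n_ctx*/ 2048));
        std::cout << "build variant: "
                  << model->implementation().buildVariant() << '\n'; // e.g. "cuda" or "cuda-avxonly"
    } catch (const LLModel::MissingImplementationError &e) {
        // Raised when no library matches the requested backend, e.g. a build
        // configured with -DLLMODEL_CUDA=OFF.
        std::cerr << e.what() << '\n';
        return 1;
    }
}
```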

@ -1,6 +1,7 @@
#ifndef LLMODEL_H
#define LLMODEL_H
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
@ -8,8 +9,11 @@
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
using namespace std::string_literals;
#define LLMODEL_MAX_PROMPT_BATCH 128
class Dlhandle;
@ -41,14 +45,35 @@ public:
};
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;
GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}
std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != m_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}
private:
static inline const std::unordered_map<std::string, std::string> m_backendNames {
{"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};
class Implementation {
@ -60,7 +85,7 @@ public:
std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
@ -76,7 +101,7 @@ public:
static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static LLModel *constructDefaultLlama();
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);
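
A short sketch of the new GPUDevice naming helpers declared above (the device values are made up for illustration):

```cpp
#include "llmodel.h"

#include <iostream>

int main() {
    // Real entries come from LLModel::Implementation::availableGPUDevices(),
    // which now reports devices from both the Kompute and CUDA backends.
    LLModel::GPUDevice dev("cuda", /*index*/ 0, /*type*/ 2, /*heapSize*/ size_t(8) << 30,
                           "NVIDIA GeForce RTX 4090", "nvidia");

    std::cout << dev.selectionName() << '\n'; // "CUDA: NVIDIA GeForce RTX 4090"
    std::cout << dev.reportedName()  << '\n'; // "NVIDIA GeForce RTX 4090 (CUDA)"

    // Device names saved by older releases carried no backend prefix; they are
    // migrated on the assumption that they referred to Vulkan (Kompute) devices.
    std::cout << LLModel::GPUDevice::updateSelectionName("AMD Radeon RX 6800")
              << '\n'; // "Vulkan: AMD Radeon RX 6800"
}
```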

@ -31,10 +31,10 @@ static void llmodel_set_error(const char **errptr, const char *message) {
}
}
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error) {
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
LLModel *llModel;
try {
llModel = LLModel::Implementation::construct(model_path, build_variant);
llModel = LLModel::Implementation::construct(model_path, backend);
} catch (const std::exception& e) {
llmodel_set_error(error, e.what());
return nullptr;
@ -248,6 +248,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
for (unsigned i = 0; i < devices.size(); i++) {
const auto &dev = devices[i];
auto &cdev = c_devices[i];
cdev.backend = dev.backend;
cdev.index = dev.index;
cdev.type = dev.type;
cdev.heapSize = dev.heapSize;

@ -48,6 +48,7 @@ struct llmodel_prompt_context {
};
struct llmodel_gpu_device {
const char * backend;
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
@ -86,7 +87,7 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
* Embedding cancellation callback for use with llmodel_embed.
* @param batch_sizes The number of tokens in each batch that will be embedded.
* @param n_batch The number of batches that will be embedded.
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
* @return True to cancel llmodel_embed, false to continue.
*/
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
@ -103,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
* Create a llmodel instance.
* Recognises correct model type from file at model_path
* @param model_path A string representing the path to the model file; will only be used to detect model type.
* @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
* @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
* @param error A pointer to a string; will only be set on error.
* @return A pointer to the llmodel_model instance; NULL on error.
*/
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error);
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
/**
* Destroy a llmodel instance.
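
For the C API, the second argument of llmodel_model_create2() is now a backend name rather than a build variant. A hedged sketch (placeholder model path; the load and prompt calls are elided and shown only as a comment):

```cpp
#include "llmodel_c.h"

#include <cstdio>

int main() {
    const char *error = nullptr;
    // One of "auto", "cpu", "metal", "kompute", or "cuda".
    llmodel_model model = llmodel_model_create2("model.gguf", "cuda", &error);
    if (model == nullptr) {
        std::fprintf(stderr, "create failed: %s\n", error);
        return 1;
    }
    // ... load the model and prompt it as before ...
    llmodel_model_destroy(model);
    return 0;
}
```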

@ -23,9 +23,9 @@ As an alternative to downloading via pip, you may build the Python bindings from
### Prerequisites
On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
macOS users do not need Vulkan, as GPT4All will use Metal instead.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
### Building the python bindings

@ -71,6 +71,7 @@ class LLModelPromptContext(ctypes.Structure):
class LLModelGPUDevice(ctypes.Structure):
_fields_ = [
("backend", ctypes.c_char_p),
("index", ctypes.c_int32),
("type", ctypes.c_int32),
("heapSize", ctypes.c_size_t),
@ -200,9 +201,11 @@ class LLModel:
Maximum size of context window
ngl : int
Number of GPU layers to use (Vulkan)
backend : str
Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
"""
def __init__(self, model_path: str, n_ctx: int, ngl: int):
def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
self.model_path = model_path.encode()
self.n_ctx = n_ctx
self.ngl = ngl
@ -212,7 +215,7 @@ class LLModel:
# Construct a model implementation
err = ctypes.c_char_p()
model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
if model is None:
s = err.value
raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
@ -231,7 +234,7 @@ class LLModel:
raise ValueError("Attempted operation on a closed LLModel")
@property
def backend(self) -> Literal["cpu", "kompute", "metal"]:
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
if self.model is None:
self._raise_closed()
return llmodel.llmodel_model_backend_name(self.model).decode()
@ -258,7 +261,7 @@ class LLModel:
devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
if not devices_ptr:
raise ValueError("Unable to retrieve available GPU devices")
return [d.name.decode() for d in devices_ptr[:num_devices.value]]
return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
def init_gpu(self, device: str):
if self.model is None:

@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import os
import platform
import re
import sys
import time
@ -44,7 +45,7 @@ class Embed4All:
MIN_DIMENSIONALITY = 64
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
"""
Constructor
@ -172,7 +173,7 @@ class GPT4All:
model_type: str | None = None,
allow_download: bool = True,
n_threads: int | None = None,
device: str | None = "cpu",
device: str | None = None,
n_ctx: int = 2048,
ngl: int = 100,
verbose: bool = False,
@ -190,30 +191,56 @@ class GPT4All:
n_threads: number of CPU threads used by GPT4All. Default is None, in which case the number of threads is determined automatically.
device: The processing unit on which the GPT4All model will run. It can be set to:
- "cpu": Model will run on the central processing unit.
- "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
- "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
- "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute".
- "kompute": Use the best GPU provided by the Kompute backend.
- "cuda": Use the best GPU provided by the CUDA backend.
- "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
- A specific device name from the list returned by `GPT4All.list_gpus()`.
Default is "cpu".
Default is Metal on ARM64 macOS, "cpu" otherwise.
Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
n_ctx: Maximum size of context window
ngl: Number of GPU layers to use (Vulkan)
verbose: If True, print debug messages.
"""
self.model_type = model_type
self._history: list[MessageType] | None = None
self._current_prompt_template: str = "{0}"
device_init = None
if sys.platform == 'darwin':
if device is None:
backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback
elif device == 'cpu':
backend = 'cpu'
else:
if platform.machine() != 'arm64' or device != 'gpu':
raise ValueError(f'Unknown device for this platform: {device}')
backend = 'metal'
else:
backend = 'kompute'
if device is None or device == 'cpu':
pass # use kompute with no device
elif device in ('cuda', 'kompute'):
backend = device
device_init = 'gpu'
elif device.startswith('cuda:'):
backend = 'cuda'
device_init = device.removeprefix('cuda:')
else:
device_init = device.removeprefix('kompute:')
# Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
self.model = LLModel(self.config["path"], n_ctx, ngl)
if device is not None and device != "cpu":
self.model.init_gpu(device)
self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
if device_init is not None:
self.model.init_gpu(device_init)
self.model.load_model()
# Set n_threads
if n_threads is not None:
self.model.set_thread_count(n_threads)
self._history: list[MessageType] | None = None
self._current_prompt_template: str = "{0}"
def __enter__(self) -> Self:
return self
@ -227,13 +254,13 @@ class GPT4All:
self.model.close()
@property
def backend(self) -> Literal["cpu", "kompute", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
return self.model.backend
@property
def device(self) -> str | None:
"""The name of the GPU device currently in use, or None for backends other than Kompute."""
"""The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
return self.model.device
@property

@ -45,7 +45,7 @@ def copy_prebuilt_C_lib(src_dir, dest_dir, dest_build_dir):
d = os.path.join(dest_dir, item)
shutil.copy2(s, d)
files_copied += 1
if item.endswith(lib_ext) or item.endswith('.metal'):
if item.endswith(lib_ext) or item.endswith('.metallib'):
s = os.path.join(dirpath, item)
d = os.path.join(dest_build_dir, item)
shutil.copy2(s, d)
@ -68,7 +68,7 @@ def get_long_description():
setup(
name=package_name,
version="2.6.0",
version="2.7.0",
description="Python bindings for GPT4All",
long_description=get_long_description(),
long_description_content_type="text/markdown",

@ -17,8 +17,8 @@ if(APPLE)
endif()
set(APP_VERSION_MAJOR 2)
set(APP_VERSION_MINOR 7)
set(APP_VERSION_PATCH 6)
set(APP_VERSION_MINOR 8)
set(APP_VERSION_PATCH 0)
set(APP_VERSION "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
# Include the binary directory for the generated header file
@ -65,7 +65,7 @@ add_subdirectory(../gpt4all-backend llmodel)
set(METAL_SHADER_FILE)
if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
set(METAL_SHADER_FILE ../gpt4all-backend/llama.cpp-mainline/ggml-metal.metal)
set(METAL_SHADER_FILE ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
endif()
set(APP_ICON_RESOURCE)
@ -185,7 +185,6 @@ if(METAL_SHADER_FILE)
set_target_properties(chat PROPERTIES
RESOURCE ${METAL_SHADER_FILE}
)
configure_file(${METAL_SHADER_FILE} bin/ggml-metal.metal COPYONLY)
endif()
target_compile_definitions(chat
@ -207,18 +206,61 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
endif()
install(TARGETS chat DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llmodel DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(
TARGETS llmodel
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
# We should probably iterate through the list of the cmake for backend, but these need to be installed
# to this component's dir for the finicky qt installer to work
install(TARGETS gptj-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS gptj-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
if(APPLE)
install(TARGETS llamamodel-mainline-metal DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
if (LLMODEL_KOMPUTE)
set(MODEL_IMPL_TARGETS
llamamodel-mainline-kompute
llamamodel-mainline-kompute-avxonly
gptj-kompute
gptj-kompute-avxonly
)
else()
set(MODEL_IMPL_TARGETS
llamamodel-mainline-cpu
llamamodel-mainline-cpu-avxonly
gptj-cpu
gptj-cpu-avxonly
)
endif()
if (APPLE)
list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
endif()
install(
TARGETS ${MODEL_IMPL_TARGETS}
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
if (LLMODEL_CUDA)
set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
install(
TARGETS llamamodel-mainline-cuda
llamamodel-mainline-cuda-avxonly
RUNTIME_DEPENDENCY_SET llama-cuda-deps
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
if (WIN32)
install(
RUNTIME_DEPENDENCY_SET llama-cuda-deps
PRE_EXCLUDE_REGEXES "^(nvcuda|api-ms-.*)\\.dll$"
POST_INCLUDE_REGEXES "(^|[/\\\\])(lib)?(cuda|cublas)" POST_EXCLUDE_REGEXES .
DIRECTORIES "${CUDAToolkit_BIN_DIR}"
DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}
)
endif()
endif()
set(CPACK_GENERATOR "IFW")

@ -6,9 +6,9 @@ gpt4all-chat from source.
## Prerequisites
On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
macOS users do not need Vulkan, as GPT4All will use Metal instead.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
## Note for Linux users

@ -143,7 +143,7 @@ void ChatLLM::handleThreadStarted()
void ChatLLM::handleForceMetalChanged(bool forceMetal)
{
#if defined(Q_OS_MAC) && defined(__arm__)
#if defined(Q_OS_MAC) && defined(__aarch64__)
m_forceMetal = forceMetal;
if (isModelLoaded() && m_shouldBeLoaded) {
m_reloadingToChangeVariant = true;
@ -324,19 +324,29 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
QElapsedTimer modelLoadTimer;
modelLoadTimer.start();
auto requestedDevice = MySettings::globalInstance()->device();
auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
m_ctx.n_ctx = n_ctx;
auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);
std::string buildVariant = "auto";
#if defined(Q_OS_MAC) && defined(__arm__)
if (m_forceMetal)
buildVariant = "metal";
std::string backend = "auto";
#ifdef Q_OS_MAC
if (requestedDevice == "CPU") {
backend = "cpu";
} else if (m_forceMetal) {
#ifdef __aarch64__
backend = "metal";
#endif
}
#else // !defined(Q_OS_MAC)
if (requestedDevice.startsWith("CUDA: "))
backend = "cuda";
#endif
QString constructError;
m_llModelInfo.model.reset();
try {
auto *model = LLModel::Implementation::construct(filePath.toStdString(), buildVariant, n_ctx);
auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
m_llModelInfo.model.reset(model);
} catch (const LLModel::MissingImplementationError &e) {
modelLoadProps.insert("error", "missing_model_impl");
@ -378,6 +388,8 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
{
const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
// Pick the best device
// NB: relies on the fact that Kompute devices are listed first
if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) {
defaultDevice = &availableDevices.front();
float memGB = defaultDevice->heapSize / float(1024 * 1024 * 1024);
@ -387,16 +399,18 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
}
}
const QString requestedDevice = MySettings::globalInstance()->device();
bool isMetal = m_llModelInfo.model->implementation().buildVariant() == "metal";
QString actualDevice("CPU");
// Pick the best match for the device
QString actualDevice = isMetal ? "Metal" : "CPU";
if (!isMetal && requestedDevice != "CPU") {
#if defined(Q_OS_MAC) && defined(__aarch64__)
if (m_llModelInfo.model->implementation().buildVariant() == "metal")
actualDevice = "Metal";
#else
if (requestedDevice != "CPU") {
const auto *device = defaultDevice;
if (requestedDevice != "Auto") {
// Use the selected device
for (const LLModel::GPUDevice &d : availableDevices) {
if (QString::fromStdString(d.name) == requestedDevice) {
if (QString::fromStdString(d.selectionName()) == requestedDevice) {
device = &d;
break;
}
@ -409,14 +423,14 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
} else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
} else {
actualDevice = QString::fromStdString(device->name);
actualDevice = QString::fromStdString(device->reportedName());
modelLoadProps.insert("requested_device_mem", approxDeviceMemGB(device));
}
}
#endif
// Report which device we're actually using
emit reportDevice(actualDevice);
bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
if (!m_shouldBeLoaded) {
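
Condensed into a standalone sketch (not the exact chat code): the device string stored in settings now decides which backend is passed to LLModel::Implementation::construct(), with a specific GPU still selected afterwards via initializeGPUDevice().

```cpp
#include <string>

// "CUDA: " is the prefix GPUDevice::selectionName() produces for CUDA devices;
// Vulkan (Kompute) devices fall through to the "auto" backend.
std::string backendForRequestedDevice(const std::string &requestedDevice,
                                      bool isMac, bool isAppleSilicon, bool forceMetal)
{
    std::string backend = "auto";
    if (isMac) {
        if (requestedDevice == "CPU")
            backend = "cpu";
        else if (forceMetal && isAppleSilicon)
            backend = "metal";
    } else if (requestedDevice.rfind("CUDA: ", 0) == 0) { // starts_with
        backend = "cuda";
    }
    return backend;
}
```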

@ -5,10 +5,7 @@ set(DATA_DIR ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN
set(BIN_DIR ${DATA_DIR}/bin)
set(Qt6_ROOT_DIR "@Qt6_ROOT_DIR@")
set(ENV{LD_LIBRARY_PATH} "${BIN_DIR}:${Qt6_ROOT_DIR}/../lib/")
execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2 -exclude-libs=libcuda.so.1)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
DESTINATION ${DATA_DIR})
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"

@ -4,14 +4,11 @@ set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2)
file(GLOB MYGPTJLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libgptj*)
file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*)
file(GLOB MYBERTLLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libbert*)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*)
file(COPY ${MYGPTJLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLAMALIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYBERTLLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"

@ -2,9 +2,6 @@ set(WINDEPLOYQT "@WINDEPLOYQT@")
set(COMPONENT_NAME_MAIN "@COMPONENT_NAME_MAIN@")
set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
execute_process(COMMAND ${WINDEPLOYQT} --qmldir ${CMAKE_CURRENT_SOURCE_DIR} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"

@ -65,10 +65,14 @@ MySettings::MySettings()
{
QSettings::setDefaultFormat(QSettings::IniFormat);
std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
QVector<QString> deviceList{ "Auto" };
#if defined(Q_OS_MAC) && defined(__aarch64__)
deviceList << "Metal";
#else
std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
for (LLModel::GPUDevice &d : devices)
deviceList << QString::fromStdString(d.name);
deviceList << QString::fromStdString(d.selectionName());
#endif
deviceList << "CPU";
setDeviceList(deviceList);
}
@ -786,7 +790,23 @@ QString MySettings::device() const
{
QSettings setting;
setting.sync();
return setting.value("device", default_device).toString();
auto value = setting.value("device");
if (!value.isValid())
return default_device;
auto device = value.toString();
if (!device.isEmpty()) {
auto deviceStr = device.toStdString();
auto newNameStr = LLModel::GPUDevice::updateSelectionName(deviceStr);
if (newNameStr != deviceStr) {
auto newName = QString::fromStdString(newNameStr);
qWarning() << "updating device name:" << device << "->" << newName;
device = newName;
setting.setValue("device", device);
setting.sync();
}
}
return device;
}
void MySettings::setDevice(const QString &u)
