Compare commits

...

15 Commits

Author SHA1 Message Date
Jared Van Bortel a92d266cea
cmake: fix Metal build after #2310 (#2350)
I don't understand why this is needed, but it works.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2 weeks ago
Jared Van Bortel d2a99d9bc6
support the llama.cpp CUDA backend (#2310)
* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
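For a sense of what the backend choice added in #2310 means for users of the Python bindings, here is a minimal sketch, assuming a build of the bindings where the `device` argument accepts "cuda" in addition to "cpu", "gpu", and explicit device names (the model filename is only an example):

```python
# Hedged sketch: select the llama.cpp CUDA backend from the Python bindings.
# device="cuda" and the model filename are illustrative assumptions, not
# confirmed by this commit log.
from gpt4all import GPT4All

model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf", device="cuda")
print(model.generate("Why is the sky blue?", max_tokens=64))
```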
Jared Van Bortel a618ca5699
readme: document difference between installers (#2336)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel fbbf810020
chat: fix issues with the initial "New Chat" (#2330)
* select the existing new chat if there already is one when "New Chat" is clicked
* scroll to the new chat when "New Chat" is clicked
* fix the "New Chat" being scrolled past the top of the chat list

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel 7e1e00f331
chat: fix issues with quickly switching between multiple chats (#2343)
* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel 7f1c3d4275
chatllm: fix model loading progress showing "Reload" sometimes (#2337)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel 9f9d8e636f
backend: do not crash if GGUF lacks general.architecture (#2346)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel 6d8888b267
llamamodel: free the batch in embedInternal (#2348)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
AT 61cefcfd8a
Fix destruction and tear down of the embedding thread. (#2328)
* Fix destruction and tear down of the embedding thread.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

* Fix order of deletion to prevent use after free.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

---------

Signed-off-by: Adam Treat <treat.adam@gmail.com>
3 weeks ago
Jared Van Bortel 1427ef7195
chat: fix window icon on Windows (#2321)
* chat: fix window icon on Windows

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* chat: remove redundant copy of macOS app icon

This has been redundant since PR #2180.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Tim453 69720fedaa
Update appdata.xml (#2307) 3 weeks ago
Jared Van Bortel 86560f3952
maint: remove Docker API server and related references (#2314)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel 5fb9d17c00
chatllm: use a better prompt for the generated chat name (#2322)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel f26e8d0d87
chat: do not allow sending a message while the LLM is responding (#2323)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago
Jared Van Bortel d54e644d05
ChatView: make context menus more intuitive (#2324)
* ChatView: fix deprecation warning

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* ChatView: make context menus more intuitive

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
3 weeks ago

@ -97,7 +97,9 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk patchelf
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk patchelf cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Installing Qt
command: |
@ -121,6 +123,7 @@ jobs:
set -eo pipefail
export CMAKE_PREFIX_PATH=~/Qt/6.5.1/gcc_64/lib/cmake
export PATH=$PATH:$HOME/Qt/Tools/QtInstallerFramework/4.7/bin
export PATH=$PATH:/usr/local/cuda/bin
mkdir build
cd build
mkdir upload
@ -162,6 +165,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Build
command: |
@ -218,7 +226,9 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Installing Qt
command: |
@ -235,6 +245,7 @@ jobs:
name: Build
command: |
export CMAKE_PREFIX_PATH=~/Qt/6.5.1/gcc_64/lib/cmake
export PATH=$PATH:/usr/local/cuda/bin
~/Qt/Tools/CMake/bin/cmake -DCMAKE_BUILD_TYPE=Release -S gpt4all-chat -B build
~/Qt/Tools/CMake/bin/cmake --build build --target all
@ -269,6 +280,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Build
command: |
@ -394,12 +410,15 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
pip install setuptools wheel cmake
- run:
name: Build C library
command: |
export PATH=$PATH:/usr/local/cuda/bin
git submodule update --init --recursive
cd gpt4all-backend
cmake -B build
@ -459,6 +478,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command:
@ -530,11 +554,14 @@ jobs:
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Build Libraries
command: |
export PATH=$PATH:/usr/local/cuda/bin
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
@ -599,6 +626,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command: |
@ -642,6 +674,11 @@ jobs:
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe -OutFile cuda_12.4.1_windows_network.exe
.\cuda_12.4.1_windows_network.exe -s cudart_12.4 nvcc_12.4 cublas_12.4 cublas_dev_12.4
- run:
name: Install dependencies
command: |

@ -1,30 +0,0 @@
Software for Open Models License (SOM)
Version 1.0 dated August 30th, 2023
This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
1. Definitions
The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
A “Model” is the output of a machine learning algorithm, and excludes the Software.
“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
2. Grant of Rights. Subject to the conditions and limitations in section 3:
(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
3. Conditions and Limitations
(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms.
(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks.
(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.

@ -33,6 +33,13 @@ Learn more in the [documentation](https://docs.gpt4all.io).
A GPT4All model is a 3GB - 8GB file that you can download and plug into the GPT4All software. **Nomic AI** supports and maintains this software ecosystem to enforce quality and security alongside spearheading the effort to allow any person or enterprise to easily deploy their own on-edge large language models.
### Installation
The recommended way to install GPT4All is to use one of the online installers linked above in this README, which are also available at the [GPT4All website](https://gpt4all.io/). These require an internet connection at install time, are slightly easier to use on macOS due to code signing, and provide a version of GPT4All that can check for updates.
An alternative way to install GPT4All is to use one of the offline installers available on the [Releases page](https://github.com/nomic-ai/gpt4all/releases). These do not require an internet connection at install time, and can be used to install an older version of GPT4All if so desired. But using these requires acknowledging a security warning on macOS, and they provide a version of GPT4All that is unable to notify you of updates, so you should enable notifications for Releases on this repository (Watch > Custom > Releases) or sign up for announcements in our [Discord server](https://discord.gg/mGZE39AS3e).
### What's New
- **October 19th, 2023**: GGUF Support Launches with Support for:
- Mistral 7b base model, an updated model gallery on [gpt4all.io](https://gpt4all.io), several new local code models including Rift Coder v1.5
@ -40,7 +47,9 @@ A GPT4All model is a 3GB - 8GB file that you can download and plug into the GPT4
- Offline build support for running old versions of the GPT4All Local LLM Chat Client.
- **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs.
- **July 2023**: Stable support for LocalDocs, a feature that allows you to privately and locally chat with your data.
- **June 28th, 2023**: Docker-based API server launches allowing inference of local LLMs from an OpenAI-compatible HTTP endpoint.
- **June 28th, 2023**: [Docker-based API server] launches allowing inference of local LLMs from an OpenAI-compatible HTTP endpoint.
[Docker-based API server]: https://github.com/nomic-ai/gpt4all/tree/cef74c2be20f5b697055d5b8b506861c7b997fab/gpt4all-api
### Building From Source

@ -1,112 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
app/__pycache__/
gpt4all_api/__pycache__/
gpt4all_api/app/api_v1/__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# VS Code
.vscode/
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
*.lock
*.cache

@ -1,7 +0,0 @@
[settings]
known_third_party=geopy,nltk,np,numpy,pandas,pysbd,fire,torch
line_length=120
include_trailing_comma=True
multi_line_output=3
use_parentheses=True

@ -1,13 +0,0 @@
Copyright 2023 Nomic, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -1,90 +0,0 @@
# GPT4All REST API
NOTICE: We are considering deprecating this API, as it has become challenging to maintain and test. If you are interested in maintaining it, would like to take it over, or want to discuss the future of this API, please speak up in the Discord channel.
This directory contains the source code to run and build docker images that run a FastAPI app
for serving inference from GPT4All models. The API matches the OpenAI API spec.
## Tutorial
The following tutorial assumes that you have checked out this repo and cd'd into it.
### Starting the app
First change your working directory to `gpt4all/gpt4all-api`.
Now you can build the FastAPI docker image. You only have to do this on initial build or when you add new dependencies to the requirements.txt file:
```bash
DOCKER_BUILDKIT=1 docker build -t gpt4all_api --progress plain -f gpt4all_api/Dockerfile.buildkit .
```
Then, start the backend with:
```bash
docker compose up --build
```
This will run both the API and locally hosted GPU inference server. If you want to run the API without the GPU inference server, you can run:
```bash
docker compose up --build gpt4all_api
```
To run the API with the GPU inference server, you will need to include environment variables (like the `MODEL_ID`). Edit the `.env` file and run
```bash
docker compose --env-file .env up --build
```
#### Spinning up your app
Run `docker compose up` to spin up the backend. Monitor the logs for errors in case you forgot to set an environment variable above.
#### Development
Run
```bash
docker compose up --build
```
and edit files in the `app` directory. The API will hot-reload on changes.
You can run the unit tests with
```bash
make test
```
#### Viewing API documentation
Once the FastAPI app is started, you can access its documentation and test the search endpoint by going to:
```
localhost:80/docs
```
This documentation should match the OpenAI OpenAPI spec located at https://github.com/openai/openai-openapi/blob/master/openapi.yaml
#### Running inference
```python
import openai

openai.api_base = "http://localhost:4891/v1"
openai.api_key = "not needed for a local LLM"

def test_completion():
    model = "gpt4all-j-v1.3-groovy"
    prompt = "Who is Michael Jordan?"
    response = openai.Completion.create(
        model=model,
        prompt=prompt,
        max_tokens=50,
        temperature=0.28,
        top_p=0.95,
        n=1,
        echo=True,
        stream=False
    )
    assert len(response['choices'][0]['text']) > len(prompt)
    print(response)
```

@ -1,24 +0,0 @@
version: "3.8"
services:
gpt4all_gpu:
image: ghcr.io/huggingface/text-generation-inference:0.9.3
container_name: gpt4all_gpu
restart: always # restart on error (usually a bad state caused by recompiling on save)
environment:
- HUGGING_FACE_HUB_TOKEN=token
- USE_FLASH_ATTENTION=false
- MODEL_ID=''
- NUM_SHARD=1
command: --model-id $MODEL_ID --num-shard $NUM_SHARD
volumes:
- ./:/data
ports:
- "8080:80"
shm_size: 1g
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]

@ -1,22 +0,0 @@
version: "3.8"
services:
gpt4all_api:
image: gpt4all_api
container_name: gpt4all_api
restart: always # restart on error (usually a bad state caused by recompiling on save)
ports:
- "4891:4891"
env_file:
- .env
environment:
- APP_ENVIRONMENT=dev
- WEB_CONCURRENCY=2
- LOGLEVEL=debug
- PORT=4891
- model=${MODEL_BIN} # using variable from .env file
- inference_mode=cpu
volumes:
- './gpt4all_api/app:/app'
- './gpt4all_api/models:/models' # models are mounted in the container
command: ["/start-reload.sh"]

@ -1,17 +0,0 @@
# syntax=docker/dockerfile:1.0.0-experimental
FROM tiangolo/uvicorn-gunicorn:python3.11
# Put first so anytime this file changes other cached layers are invalidated.
COPY gpt4all_api/requirements.txt /requirements.txt
RUN pip install --upgrade pip
# Run various pip install commands with ssh keys from host machine.
RUN --mount=type=ssh pip install -r /requirements.txt && \
rm -Rf /root/.cache && rm -Rf /tmp/pip-install*
# Finally, copy app and client.
COPY gpt4all_api/app /app
RUN mkdir -p /models

@ -1 +0,0 @@
# FastAPI app for serving GPT4All models

@ -1,9 +0,0 @@
from api_v1.routes import chat, completions, engines, health
from fastapi import APIRouter
router = APIRouter()
router.include_router(chat.router)
router.include_router(completions.router)
router.include_router(engines.router)
router.include_router(health.router)

@ -1,29 +0,0 @@
import logging
from api_v1.settings import settings
from fastapi import HTTPException
from fastapi.responses import JSONResponse
from starlette.requests import Request
log = logging.getLogger(__name__)
startup_msg_fmt = """
Starting up GPT4All API
"""
async def on_http_error(request: Request, exc: HTTPException):
return JSONResponse({'detail': exc.detail}, status_code=exc.status_code)
async def on_startup(app):
startup_msg = startup_msg_fmt.format(settings=settings)
log.info(startup_msg)
def startup_event_handler(app):
async def start_app() -> None:
await on_startup(app)
return start_app

@ -1,103 +0,0 @@
import logging
import time
from typing import List
from uuid import uuid4
from fastapi import APIRouter, HTTPException
from gpt4all import GPT4All
from pydantic import BaseModel, Field
from api_v1.settings import settings
from fastapi.responses import StreamingResponse
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
class ChatCompletionMessage(BaseModel):
role: str
content: str
class ChatCompletionRequest(BaseModel):
model: str = Field(settings.model, description='The model to generate a completion from.')
messages: List[ChatCompletionMessage] = Field(..., description='Messages for the chat completion.')
temperature: float = Field(settings.temp, description='Model temperature')
class ChatCompletionChoice(BaseModel):
message: ChatCompletionMessage
index: int
logprobs: float
finish_reason: str
class ChatCompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ChatCompletionResponse(BaseModel):
id: str
object: str = 'text_completion'
created: int
model: str
choices: List[ChatCompletionChoice]
usage: ChatCompletionUsage
router = APIRouter(prefix="/chat", tags=["Completions Endpoints"])
@router.post("/completions", response_model=ChatCompletionResponse)
async def chat_completion(request: ChatCompletionRequest):
'''
Completes a GPT4All model response based on the last message in the chat.
'''
# GPU is not implemented yet
if settings.inference_mode == "gpu":
raise HTTPException(status_code=400,
detail=f"Not implemented yet: Can only infer in CPU mode.")
# we only support the configured model
if request.model != settings.model:
raise HTTPException(status_code=400,
detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")
# run only if we have a message
if request.messages:
model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
# format system message and conversation history correctly
formatted_messages = ""
for message in request.messages:
formatted_messages += f"<|im_start|>{message.role}\n{message.content}<|im_end|>\n"
# the LLM will complete the response of the assistant
formatted_messages += "<|im_start|>assistant\n"
response = model.generate(
prompt=formatted_messages,
temp=request.temperature
)
# the LLM may continue to hallucinate the conversation, but we want only the first response
# so, cut off everything after first <|im_end|>
index = response.find("<|im_end|>")
response_content = response[:index].strip()
else:
response_content = "No messages received."
# Create a chat message for the response
response_message = ChatCompletionMessage(role="assistant", content=response_content)
# Create a choice object with the response message
response_choice = ChatCompletionChoice(
message=response_message,
index=0,
logprobs=-1.0, # Placeholder value
finish_reason="length" # Placeholder value
)
# Create the response object
chat_response = ChatCompletionResponse(
id=str(uuid4()),
created=int(time.time()),
model=request.model,
choices=[response_choice],
usage=ChatCompletionUsage(prompt_tokens=0, completion_tokens=0, total_tokens=0), # Placeholder values
)
return chat_response
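A sketch of exercising the route above with the `openai` 0.28 client (the version pinned in requirements.txt); the base URL and model name follow the defaults used elsewhere in this repo and may need adjusting:

```python
# Sketch: call /v1/chat/completions served by the handler above.
import openai

openai.api_base = "http://localhost:4891/v1"
openai.api_key = "not needed for a local LLM"

response = openai.ChatCompletion.create(
    model="ggml-mpt-7b-chat.bin",  # must match the `model` in api_v1.settings
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who is Michael Jordan?"},
    ],
)
print(response.choices[0].message.content)
```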

@ -1,215 +0,0 @@
import json
from typing import List, Dict, Iterable, AsyncIterable
import logging
import time
from typing import Dict, List, Union, Optional
from uuid import uuid4
import aiohttp
import asyncio
from api_v1.settings import settings
from fastapi import APIRouter, Depends, Response, Security, status, HTTPException
from fastapi.responses import StreamingResponse
from gpt4all import GPT4All
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
class CompletionRequest(BaseModel):
model: str = Field(settings.model, description='The model to generate a completion from.')
prompt: Union[List[str], str] = Field(..., description='The prompt to begin completing from.')
max_tokens: int = Field(None, description='Max tokens to generate')
temperature: float = Field(settings.temp, description='Model temperature')
top_p: Optional[float] = Field(settings.top_p, description='top_p')
top_k: Optional[int] = Field(settings.top_k, description='top_k')
n: int = Field(1, description='How many completions to generate for each prompt')
stream: bool = Field(False, description='Stream responses')
repeat_penalty: float = Field(settings.repeat_penalty, description='Repeat penalty')
class CompletionChoice(BaseModel):
text: str
index: int
logprobs: float
finish_reason: str
class CompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class CompletionResponse(BaseModel):
id: str
object: str = 'text_completion'
created: int
model: str
choices: List[CompletionChoice]
usage: CompletionUsage
class CompletionStreamResponse(BaseModel):
id: str
object: str = 'text_completion'
created: int
model: str
choices: List[CompletionChoice]
router = APIRouter(prefix="/completions", tags=["Completion Endpoints"])
def stream_completion(output: Iterable, base_response: CompletionStreamResponse):
"""
Streams a GPT4All output to the client.
Args:
output: The output of GPT4All.generate(), which is an iterable of tokens.
base_response: The base response object, which is cloned and modified for each token.
Returns:
A Generator of CompletionStreamResponse objects, which are serialized to JSON Event Stream format.
"""
for token in output:
chunk = base_response.copy()
chunk.choices = [dict(CompletionChoice(
text=token,
index=0,
logprobs=-1,
finish_reason=''
))]
yield f"data: {json.dumps(dict(chunk))}\n\n"
async def gpu_infer(payload, header):
async with aiohttp.ClientSession() as session:
try:
async with session.post(
settings.hf_inference_server_host, headers=header, data=json.dumps(payload)
) as response:
resp = await response.json()
return resp
except aiohttp.ClientError as e:
# Handle client-side errors (e.g., connection error, invalid URL)
logger.error(f"Client error: {e}")
except aiohttp.ServerError as e:
# Handle server-side errors (e.g., internal server error)
logger.error(f"Server error: {e}")
except json.JSONDecodeError as e:
# Handle JSON decoding errors
logger.error(f"JSON decoding error: {e}")
except Exception as e:
# Handle other unexpected exceptions
logger.error(f"Unexpected error: {e}")
@router.post("/", response_model=CompletionResponse)
async def completions(request: CompletionRequest):
'''
Completes a GPT4All model response.
'''
if settings.inference_mode == "gpu":
params = request.dict(exclude={'model', 'prompt', 'max_tokens', 'n'})
params["max_new_tokens"] = request.max_tokens
params["num_return_sequences"] = request.n
header = {"Content-Type": "application/json"}
if isinstance(request.prompt, list):
tasks = []
for prompt in request.prompt:
payload = {"parameters": params}
payload["inputs"] = prompt
task = gpu_infer(payload, header)
tasks.append(task)
results = await asyncio.gather(*tasks)
choices = []
for response in results:
scores = response["scores"] if "scores" in response else -1.0
choices.append(
dict(
CompletionChoice(
text=response["generated_text"], index=0, logprobs=scores, finish_reason='stop'
)
)
)
return CompletionResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=choices,
usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
)
else:
payload = {"parameters": params}
# If streaming, we need to return a StreamingResponse
payload["inputs"] = request.prompt
resp = await gpu_infer(payload, header)
output = resp["generated_text"]
# this returns all logprobs
scores = resp["scores"] if "scores" in resp else -1.0
return CompletionResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=[dict(CompletionChoice(text=output, index=0, logprobs=scores, finish_reason='stop'))],
usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
)
else:
if request.model != settings.model:
raise HTTPException(status_code=400,
detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")
if isinstance(request.prompt, list):
if len(request.prompt) > 1:
raise HTTPException(status_code=400, detail="Can only infer one inference per request in CPU mode.")
else:
request.prompt = request.prompt[0]
model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
output = model.generate(prompt=request.prompt,
max_tokens=request.max_tokens,
streaming=request.stream,
top_k=request.top_k,
top_p=request.top_p,
temp=request.temperature,
)
# If streaming, we need to return a StreamingResponse
if request.stream:
base_chunk = CompletionStreamResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=[]
)
return StreamingResponse((response for response in stream_completion(output, base_chunk)),
media_type="text/event-stream")
else:
return CompletionResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=[dict(CompletionChoice(
text=output,
index=0,
logprobs=-1,
finish_reason='stop'
))],
usage={
'prompt_tokens': 0, # TODO how to compute this?
'completion_tokens': 0,
'total_tokens': 0
}
)
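The `stream=True` path above emits Server-Sent Events, one `data: {...}` line per token. A sketch of consuming that stream with plain `requests` (URL and model name assumed from the compose file and settings defaults):

```python
# Sketch: consume the event stream produced by stream_completion above.
import json
import requests

resp = requests.post(
    "http://localhost:4891/v1/completions",
    json={"model": "ggml-mpt-7b-chat.bin", "prompt": "Hello", "stream": True},
    stream=True,
)
for line in resp.iter_lines():
    if line.startswith(b"data: "):
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["choices"][0]["text"], end="", flush=True)
```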

@ -1,65 +0,0 @@
from typing import List, Union
from fastapi import APIRouter
from api_v1.settings import settings
from gpt4all import Embed4All
from pydantic import BaseModel, Field
### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
class EmbeddingRequest(BaseModel):
model: str = Field(
settings.model, description="The model to generate an embedding from."
)
input: Union[str, List[str], List[int], List[List[int]]] = Field(
..., description="Input text to embed, encoded as a string or array of tokens."
)
class EmbeddingUsage(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
class Embedding(BaseModel):
index: int = 0
object: str = "embedding"
embedding: List[float]
class EmbeddingResponse(BaseModel):
object: str = "list"
model: str
data: List[Embedding]
usage: EmbeddingUsage
router = APIRouter(prefix="/embeddings", tags=["Embedding Endpoints"])
embedder = Embed4All()
def get_embedding(data: EmbeddingRequest) -> EmbeddingResponse:
"""
Calculates the embedding for the given input using a specified model.
Args:
data (EmbeddingRequest): An EmbeddingRequest object containing the input data
and model name.
Returns:
EmbeddingResponse: An EmbeddingResponse object encapsulating the calculated embedding,
usage info, and the model name.
"""
embedding = embedder.embed(data.input)
return EmbeddingResponse(
data=[Embedding(embedding=embedding)], usage=EmbeddingUsage(), model=data.model
)
@router.post("/", response_model=EmbeddingResponse)
def embeddings(data: EmbeddingRequest):
"""
Creates a GPT4All embedding
"""
return get_embedding(data)

@ -1,39 +0,0 @@
import logging
import requests
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from typing import List, Dict
logger = logging.getLogger(__name__)  # module logger for the error handlers below
# Define the router for the engines module
router = APIRouter(prefix="/engines", tags=["Search Endpoints"])
# Define the models for the engines module
class ListEnginesResponse(BaseModel):
data: List[Dict] = Field(..., description="All available models.")
class EngineResponse(BaseModel):
data: List[Dict] = Field(..., description="All available models.")
# Define the routes for the engines module
@router.get("/", response_model=ListEnginesResponse)
async def list_engines():
try:
response = requests.get('https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models2.json')
response.raise_for_status() # This will raise an HTTPError if the HTTP request returned an unsuccessful status code
engines = response.json()
return ListEnginesResponse(data=engines)
except requests.RequestException as e:
logger.error(f"Error fetching engine list: {e}")
raise HTTPException(status_code=500, detail="Error fetching engine list")
# Define the routes for the engines module
@router.get("/{engine_id}", response_model=EngineResponse)
async def retrieve_engine(engine_id: str):
try:
# Implement logic to fetch a specific engine's details
# This is a placeholder, replace with your actual data retrieval logic
engine_details = {"id": engine_id, "name": "Engine Name", "description": "Engine Description"}
return EngineResponse(data=[engine_details])
except Exception as e:
logger.error(f"Error fetching engine details: {e}")
raise HTTPException(status_code=500, detail=f"Error fetching details for engine {engine_id}")

@ -1,13 +0,0 @@
import logging
from fastapi import APIRouter
from fastapi.responses import JSONResponse
log = logging.getLogger(__name__)
router = APIRouter(prefix="/health", tags=["Health"])
@router.get('/', response_class=JSONResponse)
async def health_check():
"""Runs a health check on this instance of the API."""
return JSONResponse({'status': 'ok'}, headers={'Access-Control-Allow-Origin': '*'})
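This route makes a convenient readiness probe; a one-line sketch:

```python
# Sketch: poll the health route, e.g. from a container health check.
import requests

print(requests.get("http://localhost:4891/v1/health/").json())  # {'status': 'ok'}
```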

@ -1,19 +0,0 @@
from pydantic import BaseSettings
class Settings(BaseSettings):
app_environment = 'dev'
model: str = 'ggml-mpt-7b-chat.bin'
gpt4all_path: str = '/models'
inference_mode: str = "cpu"
hf_inference_server_host: str = "http://gpt4all_gpu:80/generate"
sentry_dns: str = None
temp: float = 0.18
top_p: float = 1.0
top_k: int = 50
repeat_penalty: float = 1.18
settings = Settings()
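Because `Settings` extends pydantic's `BaseSettings`, each field can be overridden from the environment, which is how the `environment:` entries in docker-compose.yaml (`model=${MODEL_BIN}`, `inference_mode=cpu`) take effect. A sketch:

```python
# Sketch: override settings via environment variables before import.
import os

os.environ["model"] = "mistral-7b-instruct-v0.1.Q4_0.gguf"  # example filename
os.environ["inference_mode"] = "cpu"

from api_v1.settings import Settings

print(Settings().model)  # -> mistral-7b-instruct-v0.1.Q4_0.gguf
```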

@ -1,3 +0,0 @@
desc = 'GPT4All API'
endpoint_paths = {'health': '/health'}

@ -1,84 +0,0 @@
import logging
import os
import docs
from api_v1 import events
from api_v1.api import router as v1_router
from api_v1.settings import settings
from fastapi import FastAPI, HTTPException, Request
from fastapi.logger import logger as fastapi_logger
from starlette.middleware.cors import CORSMiddleware
logger = logging.getLogger(__name__)
app = FastAPI(title='GPT4All API', description=docs.desc)
# CORS Configuration (in case you want to deploy)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["GET", "POST", "OPTIONS"],
allow_headers=["*"],
)
logger.info('Adding v1 endpoints..')
# add v1
app.include_router(v1_router, prefix='/v1')
app.add_event_handler('startup', events.startup_event_handler(app))
app.add_exception_handler(HTTPException, events.on_http_error)
@app.on_event("startup")
async def startup():
global model
if settings.inference_mode == "cpu":
logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
from gpt4all import GPT4All
model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
else:
# is it possible to do this once the server is up?
## TODO block until HF inference server is up.
logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
@app.on_event("shutdown")
async def shutdown():
logger.info("Shutting down API")
if settings.sentry_dns is not None:
import sentry_sdk
def traces_sampler(sampling_context):
if 'health' in sampling_context['transaction_context']['name']:
return False
sentry_sdk.init(
dsn=settings.sentry_dns, traces_sample_rate=0.1, traces_sampler=traces_sampler, send_default_pii=False
)
# This is needed to get logs to show up in the app
if "gunicorn" in os.environ.get("SERVER_SOFTWARE", ""):
gunicorn_error_logger = logging.getLogger("gunicorn.error")
gunicorn_logger = logging.getLogger("gunicorn")
root_logger = logging.getLogger()
fastapi_logger.setLevel(gunicorn_logger.level)
fastapi_logger.handlers = gunicorn_error_logger.handlers
root_logger.setLevel(gunicorn_logger.level)
uvicorn_logger = logging.getLogger("uvicorn.access")
uvicorn_logger.handlers = gunicorn_error_logger.handlers
else:
# https://github.com/tiangolo/fastapi/issues/2019
LOG_FORMAT2 = (
"[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
)
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT2)

@ -1,93 +0,0 @@
"""
Use the OpenAI python API to test gpt4all models.
"""
from typing import List, get_args
import os
from dotenv import load_dotenv
import openai
openai.api_base = "http://localhost:4891/v1"
openai.api_key = "not needed for a local LLM"
# Load the .env file
env_path = 'gpt4all-api/gpt4all_api/.env'
load_dotenv(dotenv_path=env_path)
# Fetch MODEL_BIN from the .env file
model_id = os.getenv('MODEL_BIN', 'default_model_id')
embedding = os.getenv('EMBEDDING', 'default_embedding_model_id')
print(model_id)
print(embedding)
def test_completion():
model = model_id
prompt = "Who is Michael Jordan?"
response = openai.Completion.create(
model=model, prompt=prompt, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
)
assert len(response['choices'][0]['text']) > len(prompt)
def test_streaming_completion():
model = model_id
prompt = "Who is Michael Jordan?"
tokens = []
for resp in openai.Completion.create(
model=model,
prompt=prompt,
max_tokens=50,
temperature=0.28,
top_p=0.95,
n=1,
echo=True,
stream=True):
tokens.append(resp.choices[0].text)
assert (len(tokens) > 0)
assert (len("".join(tokens)) > len(prompt))
# Modified batched test; the original batched request hit a KeyError in the response
def test_batched_completion():
model = model_id # replace with your specific model ID
prompt = "Who is Michael Jordan?"
responses = []
# Loop to create completions one at a time
for _ in range(3):
response = openai.Completion.create(
model=model, prompt=prompt, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
)
responses.append(response)
# Assertions to check the responses
for response in responses:
assert len(response['choices'][0]['text']) > len(prompt)
assert len(responses) == 3
def test_embedding():
model = embedding
prompt = "Who is Michael Jordan?"
response = openai.Embedding.create(model=model, input=prompt)
output = response["data"][0]["embedding"]
args = get_args(List[float])
assert response["model"] == model
assert isinstance(output, list)
assert all(isinstance(x, args) for x in output)
def test_chat_completion():
model = model_id
response = openai.ChatCompletion.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Knock knock."},
{"role": "assistant", "content": "Who's there?"},
{"role": "user", "content": "Orange."},
]
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.content) > 0

@ -1,3 +0,0 @@
# Add your GGUF-compatible LLM here, e.g. MODEL_BIN="mistral-7b-instruct-v0.1.Q4_0", then rename this file to ".env"
# Make sure this LLM matches the model you placed inside the models folder
MODEL_BIN=""

@ -1 +0,0 @@
### Drop GGUF-compatible models here; make sure the filename matches MODEL_BIN in your .env file

@ -1,13 +0,0 @@
aiohttp>=3.6.2
aiofiles
pydantic>=1.4.0,<2.0.0
requests>=2.24.0
ujson>=2.0.2
fastapi>=0.95.0
Jinja2>=3.0
gpt4all>=1.0.0
pytest
openai==0.28.0
black
isort
python-dotenv

@ -1,46 +0,0 @@
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
APP_NAME:=gpt4all_api
PYTHON:=python3.8
SHELL := /bin/bash
all: dependencies
fresh: clean dependencies
testenv: clean_testenv test_build
docker compose -f docker-compose.yaml up --build
testenv_gpu: clean_testenv test_build
docker compose -f docker-compose.yaml -f docker-compose.gpu.yaml up --build
testenv_d: clean_testenv test_build
docker compose -f docker-compose.yaml up --build -d
test:
docker compose exec $(APP_NAME) pytest -svv --disable-warnings -p no:cacheprovider /app/tests
test_build:
DOCKER_BUILDKIT=1 docker build -t $(APP_NAME) --progress plain -f $(APP_NAME)/Dockerfile.buildkit .
clean_testenv:
docker compose down -v
fresh_testenv: clean_testenv testenv
venv:
if [ ! -d $(ROOT_DIR)/venv ]; then $(PYTHON) -m venv $(ROOT_DIR)/venv; fi
dependencies: venv
source $(ROOT_DIR)/venv/bin/activate; $(PYTHON) -m pip install -r $(ROOT_DIR)/$(APP_NAME)/requirements.txt
clean: clean_testenv
# Remove existing environment
rm -rf $(ROOT_DIR)/venv;
rm -rf $(ROOT_DIR)/$(APP_NAME)/*.pyc;
black:
source $(ROOT_DIR)/venv/bin/activate; black -l 120 -S --target-version py38 $(APP_NAME)
isort:
source $(ROOT_DIR)/venv/bin/activate; isort --ignore-whitespace --atomic -w 120 $(APP_NAME)

@ -2,15 +2,23 @@ cmake_minimum_required(VERSION 3.16)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
if(BUILD_UNIVERSAL)
if (APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
else()
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
endif()
if (APPLE)
if (BUILD_UNIVERSAL)
# Build a Universal binary on macOS
# This requires that the found Qt library is compiled as Universal binaries.
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
else()
# Build for the host architecture on macOS
if(NOT CMAKE_OSX_ARCHITECTURES)
if (NOT CMAKE_OSX_ARCHITECTURES)
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
endif()
endif()
@ -39,11 +47,35 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()
set(DIRECTORY llama.cpp-mainline)
include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
set(BUILD_VARIANTS)
set(GPTJ_BUILD_VARIANT cpu)
if (APPLE)
list(APPEND BUILD_VARIANTS metal)
endif()
if (LLMODEL_KOMPUTE)
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
set(GPTJ_BUILD_VARIANT kompute)
else()
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
endif()
if (LLMODEL_VULKAN)
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
endif()
if (LLMODEL_CUDA)
include(CheckLanguage)
check_language(CUDA)
if (NOT CMAKE_CUDA_COMPILER)
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
endif()
enable_language(CUDA)
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
endif()
if (LLMODEL_ROCM)
enable_language(HIP)
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
@ -51,24 +83,34 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Determine flags
if (BUILD_VARIANT STREQUAL avxonly)
set(GPT4ALL_ALLOW_NON_AVX NO)
if (BUILD_VARIANT MATCHES avxonly)
set(GPT4ALL_ALLOW_NON_AVX OFF)
else()
set(GPT4ALL_ALLOW_NON_AVX YES)
set(GPT4ALL_ALLOW_NON_AVX ON)
endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
if (BUILD_VARIANT STREQUAL metal)
set(LLAMA_METAL YES)
else()
set(LLAMA_METAL NO)
set(LLAMA_METAL OFF)
set(LLAMA_KOMPUTE OFF)
set(LLAMA_VULKAN OFF)
set(LLAMA_CUDA OFF)
set(LLAMA_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(LLAMA_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(LLAMA_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(LLAMA_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(LLAMA_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(LLAMA_HIPBLAS ON)
endif()
# Include GGML
set(LLAMA_K_QUANTS YES)
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
include_ggml(-mainline-${BUILD_VARIANT})
# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@ -93,11 +135,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline)
if (NOT LLAMA_METAL)
if (BUILD_VARIANT MATCHES ${GPTJ_BUILD_VARIANT})
add_library(gptj-${BUILD_VARIANT} SHARED
gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(gptj llama-mainline)
endif()
if (BUILD_VARIANT STREQUAL cuda)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
endif()
endforeach()
add_library(llmodel

@ -786,12 +786,14 @@ const std::vector<LLModel::Token> &GPTJ::endTokens() const
}
const char *get_arch_name(gguf_context *ctx_gguf) {
std::string arch_name;
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
if (kid == -1)
throw std::runtime_error("key not found in model: general.architecture");
enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
if (ktype != GGUF_TYPE_STRING) {
throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
}
if (ktype != GGUF_TYPE_STRING)
throw std::runtime_error("key general.architecture has wrong type");
return gguf_get_val_str(ctx_gguf, kid);
}
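The same lookup can be written outside C++; a minimal pure-Python sketch of reading `general.architecture` from a GGUF header, assuming a little-endian GGUF v2/v3 file in which it is the first key-value pair (true for files written by the llama.cpp converters, though not guaranteed by the format):

```python
# Sketch: read general.architecture straight from a GGUF header.
import struct

GGUF_TYPE_STRING = 8

def read_arch_name(path: str) -> str:
    with open(path, "rb") as f:
        if f.read(4) != b"GGUF":
            raise ValueError("not a GGUF file")
        _version, = struct.unpack("<I", f.read(4))       # 2 or 3 assumed
        _tensors, kv_count = struct.unpack("<QQ", f.read(16))
        if kv_count < 1:
            raise ValueError("no metadata in file")
        key_len, = struct.unpack("<Q", f.read(8))
        key = f.read(key_len).decode()
        vtype, = struct.unpack("<I", f.read(4))
        if key != "general.architecture":
            raise ValueError("key not found in model: general.architecture")
        if vtype != GGUF_TYPE_STRING:
            raise ValueError("key general.architecture has wrong type")
        val_len, = struct.unpack("<Q", f.read(8))
        return f.read(val_len).decode()
```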
@ -824,7 +826,11 @@ DLL_EXPORT char *get_file_arch(const char *fname) {
char *arch = nullptr;
if (ctx_gguf && gguf_get_version(ctx_gguf) <= 3) {
arch = strdup(get_arch_name(ctx_gguf));
try {
arch = strdup(get_arch_name(ctx_gguf));
} catch (const std::runtime_error &) {
// cannot read key -> return null
}
}
gguf_free(ctx_gguf);

@ -1 +1 @@
Subproject commit a3f03b7e793ee611c4918235d4532ee535a9530d
Subproject commit 40bac11e427f2307305b86c322cb366bb95fcb8a

File diff suppressed because it is too large

@ -22,7 +22,11 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include <ggml-kompute.h>
# include <ggml-kompute.h>
#elif GGML_USE_VULKAN
# include <ggml-vulkan.h>
#elif GGML_USE_CUDA
# include <ggml-cuda.h>
#endif
using namespace std::string_literals;
@ -32,13 +36,44 @@ static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
// note: same order as LLM_ARCH_NAMES in llama.cpp
static const std::vector<const char *> KNOWN_ARCHES {
"baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion",
"persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
"llama",
"falcon",
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"phi2",
"phi3",
// "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669
"codeshell",
"orion",
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
"bert", "nomic-bert"
"bert", "nomic-bert",
};
static bool is_embedding_arch(const std::string &arch) {
@ -105,12 +140,14 @@ static int llama_sample_top_p_top_k(
}
const char *get_arch_name(gguf_context *ctx_gguf) {
std::string arch_name;
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
if (kid == -1)
throw std::runtime_error("key not found in model: general.architecture");
enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
if (ktype != (GGUF_TYPE_STRING)) {
throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
}
if (ktype != GGUF_TYPE_STRING)
throw std::runtime_error("key general.architecture has wrong type");
return gguf_get_val_str(ctx_gguf, kid);
}
@ -136,13 +173,20 @@ static gguf_context *load_gguf(const char *fname) {
}
static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
int32_t value = -1;
std::string arch;
auto * ctx = load_gguf(modelPath.c_str());
if (!ctx)
return -1;
std::string arch = get_arch_name(ctx);
goto cleanup;
int32_t value = -1;
if (ctx) {
try {
arch = get_arch_name(ctx);
} catch (const std::runtime_error &) {
goto cleanup; // cannot read key
}
{
auto key = arch + "." + archKey;
int keyidx = gguf_find_key(ctx, key.c_str());
if (keyidx != -1) {
@ -152,6 +196,7 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
}
}
cleanup:
gguf_free(ctx);
return value;
}
@ -160,6 +205,7 @@ struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
@ -244,15 +290,26 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const {
}
bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const {
bool result = false;
std::string arch;
auto *ctx_gguf = load_gguf(modelPath.c_str());
if (!ctx_gguf) {
std::cerr << __func__ << ": failed to load GGUF from " << modelPath << "\n";
return false;
goto cleanup;
}
try {
arch = get_arch_name(ctx_gguf);
} catch (const std::runtime_error &) {
goto cleanup; // cannot read key
}
std::string arch = get_arch_name(ctx_gguf);
result = is_embedding_arch(arch);
cleanup:
gguf_free(ctx_gguf);
return is_embedding_arch(arch);
return result;
}
bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
@ -292,10 +349,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->backend_name = "cpu"; // default
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
}
#elif defined(GGML_USE_METAL)
(void)ngl;
@ -316,6 +374,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
if (!d_ptr->model) {
fflush(stdout);
d_ptr->device = -1;
d_ptr->deviceName.clear();
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
@ -358,19 +417,24 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
llama_free_model(d_ptr->model);
d_ptr->model = nullptr;
d_ptr->device = -1;
d_ptr->deviceName.clear();
return false;
}
d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
#ifdef GGML_USE_KOMPUTE
if (usingGPUDevice()) {
#ifdef GGML_USE_KOMPUTE
if (llama_verbose()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl;
}
d_ptr->backend_name = "kompute";
}
#elif defined(GGML_USE_VULKAN)
d_ptr->backend_name = "vulkan";
#elif defined(GGML_USE_CUDA)
d_ptr->backend_name = "cuda";
#endif
}
m_supportsEmbedding = isEmbedding;
m_supportsCompletion = !isEmbedding;
@ -431,7 +495,18 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
return llama_token_to_piece(d_ptr->ctx, id);
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
@ -496,34 +571,77 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
return get_arch_key_u32(modelPath, "block_count");
}
#ifdef GGML_USE_VULKAN
static const char *getVulkanVendorName(uint32_t vendorID) {
switch (vendorID) {
case 0x10DE: return "nvidia";
case 0x1002: return "amd";
case 0x8086: return "intel";
default: return "unknown";
}
}
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
{
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
size_t count = 0;
#ifdef GGML_USE_KOMPUTE
auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count);
#elif defined(GGML_USE_VULKAN)
(void)memoryRequired; // hasn't been used since GGUF was added
auto *lcppDevices = ggml_vk_available_devices(&count);
#else // defined(GGML_USE_CUDA)
(void)memoryRequired;
auto *lcppDevices = ggml_cuda_available_devices(&count);
#endif
if (lcppDevices) {
std::vector<LLModel::GPUDevice> devices;
devices.reserve(count);
for (size_t i = 0; i < count; ++i) {
auto & dev = lcppDevices[i];
devices.emplace_back(
#ifdef GGML_USE_KOMPUTE
/* backend = */ "kompute",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ dev.vendor
#elif defined(GGML_USE_VULKAN)
/* backend = */ "vulkan",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ getVulkanVendorName(dev.vendorID)
#else // defined(GGML_USE_CUDA)
/* backend = */ "cuda",
/* index = */ dev.index,
/* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ "nvidia"
#endif
);
#ifndef GGML_USE_CUDA
ggml_vk_device_destroy(&dev);
#else
ggml_cuda_device_destroy(&dev);
#endif
}
free(lcppDevices);
return devices;
}
#else
(void)memoryRequired;
std::cerr << __func__ << ": built without Kompute\n";
std::cerr << __func__ << ": built without a GPU backend\n";
#endif
return {};
@ -531,11 +649,32 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
auto devices = availableGPUDevices(memoryRequired);
auto dev_it = devices.begin();
#ifndef GGML_USE_CUDA
if (name == "amd" || name == "nvidia" || name == "intel") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
} else
#endif
if (name != "gpu") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
}
if (dev_it < devices.end()) {
d_ptr->device = dev_it->index;
d_ptr->deviceName = dev_it->name;
return true;
}
return false;
#elif defined(GGML_USE_KOMPUTE)
ggml_vk_device device;
bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
if (ok) {
d_ptr->device = device.index;
d_ptr->deviceName = device.name;
ggml_vk_device_destroy(&device);
return true;
}
#else
@ -547,14 +686,17 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
{
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
(void)unavail_reason;
auto devices = availableGPUDevices();
auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; });
d_ptr->device = device;
d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)";
return true;
#else
(void)device;
if (unavail_reason) {
*unavail_reason = "built without Kompute";
*unavail_reason = "built without a GPU backend";
}
return false;
#endif
@ -562,7 +704,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
bool LLamaModel::hasGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->device != -1;
#else
return false;
@ -571,15 +713,20 @@ bool LLamaModel::hasGPUDevice() const
bool LLamaModel::usingGPUDevice() const
{
bool hasDevice;
#ifdef GGML_USE_KOMPUTE
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
assert(!hasDevice || ggml_vk_has_device());
#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
#elif defined(GGML_USE_METAL)
hasDevice = true;
#else
hasDevice = false;
#endif
return hasDevice;
}
const char *LLamaModel::backendName() const {
@ -587,11 +734,11 @@ const char *LLamaModel::backendName() const {
}
const char *LLamaModel::gpuDeviceName() const {
if (usingGPUDevice()) {
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->deviceName.c_str();
#endif
}
return nullptr;
}
@ -940,6 +1087,8 @@ void LLamaModel::embedInternal(
}
if (tokenCount) { *tokenCount = totalTokens; }
llama_batch_free(batch);
}
#if defined(_WIN32)
@ -962,16 +1111,26 @@ DLL_EXPORT const char *get_build_variant() {
}
DLL_EXPORT char *get_file_arch(const char *fname) {
char *arch = nullptr;
std::string archStr;
auto *ctx = load_gguf(fname);
if (!ctx)
goto cleanup;
try {
archStr = get_arch_name(ctx);
} catch (const std::runtime_error &) {
goto cleanup; // cannot read key
}
if (is_embedding_arch(archStr) && gguf_find_key(ctx, (archStr + ".pooling_type").c_str()) < 0) {
// old bert.cpp embedding model
} else {
arch = strdup(archStr.c_str());
}
cleanup:
gguf_free(ctx);
return arch;
}

@ -30,7 +30,7 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
bool hasGPUDevice() const override;

@ -12,12 +12,21 @@
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif
std::string s_implementations_search_path = ".";
#if !(defined(__x86_64__) || defined(_M_X64))
@ -86,11 +95,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
std::string impl_name_re = "(gptj|llamamodel-mainline)";
std::string impl_name_re = "(gptj|llamamodel-mainline)-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
} else {
impl_name_re += "-(default|metal)";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
@ -125,6 +132,13 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
return *libs;
}
static std::string applyCPUVariant(const std::string &buildVariant) {
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
bool buildVariantMatched = false;
std::optional<std::string> archName;
@ -142,110 +156,124 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
}
if (!buildVariantMatched)
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
throw BadArchError(std::move(*archName));
}
LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
// Get correct implementation
const Implementation* impl = nullptr;
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
size_t total_mem = getSystemTotalRAMInBytes();
try {
impl = implementation(modelPath.c_str(), "metal");
} catch (const std::exception &e) {
// fall back to CPU
}
if(impl) {
LLModel* metalimpl = impl->m_construct();
metalimpl->m_implementation = impl;
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
float req_to_total = (float) req_mem / (float) total_mem;
LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
std::vector<std::string> desiredBackends;
if (backend != "auto") {
desiredBackends.push_back(backend);
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
if (impl) {
// Construct llmodel implementation
auto *fres = impl->m_construct();
fres->m_implementation = impl;
#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
if (backend == "auto" && desiredBackend == "metal") {
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
delete metalimpl;
impl = nullptr;
} else {
return metalimpl;
size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
delete fres;
continue;
}
}
#else
(void)n_ctx;
#endif
return fres;
}
#else
(void)n_ctx;
#endif
if (!impl) {
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (cpu_supports_avx2() == 0) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
impl = implementation(modelPath.c_str(), buildVariant);
}
// Construct and return llmodel implementation
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
const std::vector<Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
std::vector<std::string> desiredBackends;
if (backend) {
desiredBackends.push_back(backend.value());
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
const Implementation *impl = nullptr;
for (const auto &desiredBackend: desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached
for (const auto &i: *impls) {
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
impl = &i;
break;
}
}
if (impl) {
auto *fres = impl->m_construct();
fres->m_implementation = impl;
implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
return fres;
}
}
std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
return nullptr;
}
std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
static const std::string backends[] = {"kompute", "cuda"};
for (const auto &backend: backends) {
auto *llama = constructGlobalLlama(backend);
if (llama) {
auto backendDevs = llama->availableGPUDevices(memoryRequired);
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
}
}
#endif
return devices;
}
int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
auto *llama = constructGlobalLlama();
return llama ? llama->maxContextLength(modelPath) : -1;
}
int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
auto *llama = constructGlobalLlama();
return llama ? llama->layerCount(modelPath) : -1;
}
bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
auto *llama = constructGlobalLlama();
return llama && llama->isEmbeddingModel(modelPath);
}
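
Taken together, DEFAULT_BACKENDS, applyCPUVariant(), and the loops in construct()/constructGlobalLlama() above implement a simple fallback chain: an explicit backend is tried alone, while "auto" walks the platform defaults in order. Below is a minimal Python sketch of that selection logic; `available` is a hypothetical stand-in for the implementation libraries found on disk (the real code matches shared-library filenames against the regex shown earlier):

```python
def apply_cpu_variant(build_variant: str, have_avx2: bool) -> str:
    # Mirrors applyCPUVariant(): non-Metal variants get an "-avxonly" suffix
    # on CPUs without AVX2 support.
    if build_variant != "metal" and not have_avx2:
        return build_variant + "-avxonly"
    return build_variant

def pick_backend(requested: str, default_backends: list[str],
                 available: set[str], have_avx2: bool) -> str:
    # Mirrors construct(): an explicit backend is tried alone; "auto" walks
    # the platform's DEFAULT_BACKENDS in order.
    desired = [requested] if requested != "auto" else default_backends
    for backend in desired:
        if apply_cpu_variant(backend, have_avx2) in available:
            return backend
    raise RuntimeError(f"Could not find any implementations for backend: {requested}")

# e.g. a Linux machine without AVX2 where both libraries were built:
print(pick_backend("auto", ["kompute", "cpu"],
                   {"kompute-avxonly", "cpu-avxonly"}, have_avx2=False))  # kompute
```

Note that an explicitly requested backend never falls back: asking for "cuda" on a machine without the CUDA libraries raises rather than silently using the CPU.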

@ -1,6 +1,7 @@
#ifndef LLMODEL_H
#define LLMODEL_H
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
@ -8,8 +9,11 @@
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
using namespace std::string_literals;
#define LLMODEL_MAX_PROMPT_BATCH 128
class Dlhandle;
@ -41,14 +45,35 @@ public:
};
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}
std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != m_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}
private:
static inline const std::unordered_map<std::string, std::string> m_backendNames {
{"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};
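
For illustration, a Python sketch of what selectionName() and updateSelectionName() produce, using the same backend display-name table; the device names are made up:

```python
# Mirrors GPUDevice::selectionName()/updateSelectionName() from llmodel.h.
BACKEND_NAMES = {"cuda": "CUDA", "kompute": "Vulkan"}

def selection_name(backend: str, name: str) -> str:
    return f"{BACKEND_NAMES[backend]}: {name}"

def update_selection_name(name: str) -> str:
    # Upgrade a device setting saved by an older release: previously there
    # were only Vulkan devices, so bare names get the "Vulkan: " prefix.
    if name in ("Auto", "CPU", "Metal"):
        return name
    if any(name.startswith(display + ": ") for display in BACKEND_NAMES.values()):
        return name
    return "Vulkan: " + name

print(selection_name("cuda", "NVIDIA GeForce RTX 3060"))  # CUDA: NVIDIA GeForce RTX 3060
print(update_selection_name("AMD Radeon RX 6800"))        # Vulkan: AMD Radeon RX 6800
```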
class Implementation {
@ -60,7 +85,7 @@ public:
std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
@ -76,7 +101,7 @@ public:
static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);

@ -31,10 +31,10 @@ static void llmodel_set_error(const char **errptr, const char *message) {
}
}
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
LLModel *llModel;
try {
llModel = LLModel::Implementation::construct(model_path, build_variant);
llModel = LLModel::Implementation::construct(model_path, backend);
} catch (const std::exception& e) {
llmodel_set_error(error, e.what());
return nullptr;
@ -248,6 +248,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
for (unsigned i = 0; i < devices.size(); i++) {
const auto &dev = devices[i];
auto &cdev = c_devices[i];
cdev.backend = dev.backend;
cdev.index = dev.index;
cdev.type = dev.type;
cdev.heapSize = dev.heapSize;

@ -48,6 +48,7 @@ struct llmodel_prompt_context {
};
struct llmodel_gpu_device {
const char * backend;
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
@ -86,7 +87,7 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
* Embedding cancellation callback for use with llmodel_embed.
* @param batch_sizes The number of tokens in each batch that will be embedded.
* @param n_batch The number of batches that will be embedded.
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
* @return True to cancel llmodel_embed, false to continue.
*/
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
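
From Python's ctypes, a matching callback can be declared as below; this is a sketch whose CFUNCTYPE signature simply mirrors the typedef above (how the callback is passed to llmodel_embed is assumed to follow the binding's existing conventions):

```python
import ctypes

# bool (*)(unsigned *batch_sizes, unsigned n_batch, const char *backend)
EmbCancelCallback = ctypes.CFUNCTYPE(
    ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)

@EmbCancelCallback
def cancel_cpu_fallback(batch_sizes, n_batch, backend):
    # Cancel the embedding if it is about to run on the CPU backend.
    total = sum(batch_sizes[i] for i in range(n_batch))
    print(f"about to embed {total} tokens on {backend.decode()}")
    return backend == b"cpu"
```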
@ -103,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
* Create a llmodel instance.
* Recognises correct model type from file at model_path
* @param model_path A string representing the path to the model file; will only be used to detect model type.
* @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
* @param error A pointer to a string; will only be set on error.
* @return A pointer to the llmodel_model instance; NULL on error.
*/
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
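
A minimal ctypes sketch of the updated call, modeled on what the Python binding does; `llmodel` is assumed to be the already-loaded shared library with argtypes/restype configured (c_char_p, c_char_p, POINTER(c_char_p) -> c_void_p):

```python
import ctypes

def create_model(llmodel, model_path: str, backend: str = "auto"):
    # llmodel_model_create2 returns NULL on error and sets *error to a message.
    err = ctypes.c_char_p()
    model = llmodel.llmodel_model_create2(
        model_path.encode(), backend.encode(), ctypes.byref(err))
    if not model:
        msg = err.value
        raise RuntimeError("null" if msg is None else msg.decode())
    return model
```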
/**
* Destroy a llmodel instance.

@ -23,9 +23,9 @@ As an alternative to downloading via pip, you may build the Python bindings from
### Prerequisites
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
### Building the python bindings

@ -26,7 +26,6 @@ is organized as a monorepo with the following structure:
- **gpt4all-backend**: The GPT4All backend maintains and exposes a universal, performance optimized C API for running inference with multi-billion parameter Transformer Decoders.
This C API is then bound to any higher level programming language such as C++, Python, Go, etc.
- **gpt4all-bindings**: GPT4All bindings contain a variety of high-level programming languages that implement the C API. Each directory is a bound programming language. The [CLI](gpt4all_cli.md) is included here, as well.
- **gpt4all-api**: The GPT4All API (under initial development) exposes REST API endpoints for gathering completions and embeddings from large language models.
- **gpt4all-chat**: GPT4All Chat is an OS native chat application that runs on macOS, Windows and Linux. It is the easiest way to run local, privacy aware chat assistants on everyday hardware. You can download it on the [GPT4All Website](https://gpt4all.io) and read its source code in the monorepo.
Explore detailed documentation for the backend, bindings and chat client in the sidebar.

@ -71,6 +71,7 @@ class LLModelPromptContext(ctypes.Structure):
class LLModelGPUDevice(ctypes.Structure):
_fields_ = [
("backend", ctypes.c_char_p),
("index", ctypes.c_int32),
("type", ctypes.c_int32),
("heapSize", ctypes.c_size_t),
@ -200,9 +201,11 @@ class LLModel:
Maximum size of context window
ngl : int
Number of GPU layers to use (Vulkan)
backend : str
Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
"""
def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
self.model_path = model_path.encode()
self.n_ctx = n_ctx
self.ngl = ngl
@ -212,7 +215,7 @@ class LLModel:
# Construct a model implementation
err = ctypes.c_char_p()
model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
if model is None:
s = err.value
raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
@ -231,7 +234,7 @@ class LLModel:
raise ValueError("Attempted operation on a closed LLModel")
@property
def backend(self) -> Literal["cpu", "kompute", "metal"]:
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
if self.model is None:
self._raise_closed()
return llmodel.llmodel_model_backend_name(self.model).decode()
@ -258,7 +261,7 @@ class LLModel:
devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
if not devices_ptr:
raise ValueError("Unable to retrieve available GPU devices")
return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
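
With the backend prefix included, a listing on a machine whose discrete GPU is visible to both backends might look like this (output is illustrative):

```python
from gpt4all import GPT4All

for device in GPT4All.list_gpus():
    print(device)
# kompute:NVIDIA GeForce RTX 3060
# cuda:NVIDIA GeForce RTX 3060
```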
def init_gpu(self, device: str):
if self.model is None:

@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import os
import platform
import re
import sys
import time
@ -44,7 +45,7 @@ class Embed4All:
MIN_DIMENSIONALITY = 64
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
"""
Constructor
@ -172,7 +173,7 @@ class GPT4All:
model_type: str | None = None,
allow_download: bool = True,
n_threads: int | None = None,
device: str | None = "cpu",
device: str | None = None,
n_ctx: int = 2048,
ngl: int = 100,
verbose: bool = False,
@ -190,30 +191,56 @@ class GPT4All:
n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
device: The processing unit on which the GPT4All model will run. It can be set to:
- "cpu": Model will run on the central processing unit.
- "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
- "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
- "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute".
- "kompute": Use the best GPU provided by the Kompute backend.
- "cuda": Use the best GPU provided by the CUDA backend.
- "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
- A specific device name from the list returned by `GPT4All.list_gpus()`.
Default is "cpu".
Default is Metal on ARM64 macOS, "cpu" otherwise.
Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
n_ctx: Maximum size of context window
ngl: Number of GPU layers to use (Vulkan)
verbose: If True, print debug messages.
"""
self.model_type = model_type
device_init = None
if sys.platform == 'darwin':
if device is None:
backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback
elif device == 'cpu':
backend = 'cpu'
else:
if platform.machine() != 'arm64' or device != 'gpu':
raise ValueError(f'Unknown device for this platform: {device}')
backend = 'metal'
else:
backend = 'kompute'
if device is None or device == 'cpu':
pass # use kompute with no device
elif device in ('cuda', 'kompute'):
backend = device
device_init = 'gpu'
elif device.startswith('cuda:'):
backend = 'cuda'
device_init = device.removeprefix('cuda:')
else:
device_init = device.removeprefix('kompute:')
# Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
self.model = LLModel(self.config["path"], n_ctx, ngl)
if device is not None and device != "cpu":
self.model.init_gpu(device)
self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
if device_init is not None:
self.model.init_gpu(device_init)
self.model.load_model()
# Set n_threads
if n_threads is not None:
self.model.set_thread_count(n_threads)
self._history: list[MessageType] | None = None
self._current_prompt_template: str = "{0}"
def __enter__(self) -> Self:
return self
@ -227,13 +254,13 @@ class GPT4All:
self.model.close()
@property
def backend(self) -> Literal["cpu", "kompute", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
return self.model.backend
@property
def device(self) -> str | None:
"""The name of the GPU device currently in use, or None for backends other than Kompute."""
"""The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
return self.model.device
@property
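
Putting the new device values together, typical use of the updated binding looks like this; the model filename is illustrative:

```python
from gpt4all import GPT4All

# "cuda" picks the best CUDA device; "kompute:<name>" or a bare name from
# GPT4All.list_gpus() pins a specific GPU. Omitting device uses Metal on
# ARM64 macOS and the CPU elsewhere.
model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf", device="cuda")
print(model.backend)  # "cuda"
print(model.device)   # e.g. "NVIDIA GeForce RTX 3060"
with model.chat_session():
    print(model.generate("Why is the sky blue?", max_tokens=64))
```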

@ -45,7 +45,7 @@ def copy_prebuilt_C_lib(src_dir, dest_dir, dest_build_dir):
d = os.path.join(dest_dir, item)
shutil.copy2(s, d)
files_copied += 1
if item.endswith(lib_ext) or item.endswith('.metallib'):
s = os.path.join(dirpath, item)
d = os.path.join(dest_build_dir, item)
shutil.copy2(s, d)
@ -68,7 +68,7 @@ def get_long_description():
setup(
name=package_name,
version="2.6.0",
version="2.7.0",
description="Python bindings for GPT4All",
long_description=get_long_description(),
long_description_content_type="text/markdown",

@ -17,8 +17,8 @@ if(APPLE)
endif()
set(APP_VERSION_MAJOR 2)
set(APP_VERSION_MINOR 8)
set(APP_VERSION_PATCH 0)
set(APP_VERSION "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
# Include the binary directory for the generated header file
@ -65,12 +65,24 @@ add_subdirectory(../gpt4all-backend llmodel)
set(METAL_SHADER_FILE)
if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
set(METAL_SHADER_FILE ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
endif()
set(APP_ICON_RESOURCE)
if (WIN32)
set(APP_ICON_RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.rc")
elseif (APPLE)
# The MACOSX_BUNDLE_ICON_FILE variable is added to the Info.plist
# generated by CMake. This variable contains the .icns file name,
# without the path.
set(MACOSX_BUNDLE_ICON_FILE gpt4all.icns)
# And the following tells CMake where to find and install the file itself.
set(APP_ICON_RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
set_source_files_properties(${APP_ICON_RESOURCE} PROPERTIES
MACOSX_PACKAGE_LOCATION "Resources")
endif()
qt_add_executable(chat
main.cpp
@ -91,7 +103,7 @@ qt_add_executable(chat
logger.h logger.cpp
responsetext.h responsetext.cpp
${METAL_SHADER_FILE}
${APP_ICON_RESOURCE}
)
qt_add_qml_module(chat
@ -153,8 +165,6 @@ qt_add_qml_module(chat
icons/logo.svg
icons/logo-32.png
icons/logo-48.png
icons/favicon.ico
icons/favicon.icns
)
set_target_properties(chat PROPERTIES
@ -163,7 +173,6 @@ set_target_properties(chat PROPERTIES
MACOSX_BUNDLE_SHORT_VERSION_STRING ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}
MACOSX_BUNDLE TRUE
WIN32_EXECUTABLE TRUE
MACOSX_BUNDLE_ICON_FILE "favicon.icns"
)
if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
@ -176,7 +185,7 @@ if(METAL_SHADER_FILE)
set_target_properties(chat PROPERTIES
RESOURCE ${METAL_SHADER_FILE}
)
add_dependencies(chat ggml-metal)
endif()
target_compile_definitions(chat
@ -198,18 +207,61 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
endif()
install(TARGETS chat DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN})
install(
TARGETS llmodel
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
# We should probably iterate through the backend's list of CMake targets, but these need to be installed
# to this component's dir for the finicky Qt installer to work
if (LLMODEL_KOMPUTE)
set(MODEL_IMPL_TARGETS
llamamodel-mainline-kompute
llamamodel-mainline-kompute-avxonly
gptj-kompute
gptj-kompute-avxonly
)
else()
set(MODEL_IMPL_TARGETS
llamamodel-mainline-cpu
llamamodel-mainline-cpu-avxonly
gptj-cpu
gptj-cpu-avxonly
)
endif()
if (APPLE)
list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
endif()
install(
TARGETS ${MODEL_IMPL_TARGETS}
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
if (LLMODEL_CUDA)
set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
install(
TARGETS llamamodel-mainline-cuda
llamamodel-mainline-cuda-avxonly
RUNTIME_DEPENDENCY_SET llama-cuda-deps
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
)
if (WIN32)
install(
RUNTIME_DEPENDENCY_SET llama-cuda-deps
PRE_EXCLUDE_REGEXES "^(nvcuda|api-ms-.*)\\.dll$"
POST_INCLUDE_REGEXES "(^|[/\\\\])(lib)?(cuda|cublas)" POST_EXCLUDE_REGEXES .
DIRECTORIES "${CUDAToolkit_BIN_DIR}"
DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}
)
endif()
endif()
set(CPACK_GENERATOR "IFW")
@ -230,7 +282,7 @@ elseif(${CMAKE_SYSTEM_NAME} MATCHES Windows)
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake" @ONLY)
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake)
set(CPACK_IFW_ROOT "C:/Qt/Tools/QtInstallerFramework/4.6")
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/favicon.ico")
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.ico")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-win64")
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@\\${COMPONENT_NAME_MAIN}")
elseif(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
@ -239,11 +291,11 @@ elseif(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake" @ONLY)
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake)
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/favicon.icns")
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-darwin")
set(CPACK_IFW_TARGET_DIRECTORY "@ApplicationsDir@/${COMPONENT_NAME_MAIN}")
set(CPACK_BUNDLE_NAME ${COMPONENT_NAME_MAIN})
set(CPACK_BUNDLE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/favicon.icns")
set(CPACK_BUNDLE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
endif()
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${COMPONENT_NAME_MAIN})

@ -6,9 +6,9 @@ gpt4all-chat from source.
## Prerequisites
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
## Note for Linux users

@ -54,7 +54,7 @@ void Chat::connectLLM()
connect(m_llmodel, &ChatLLM::reportFallbackReason, this, &Chat::handleFallbackReasonChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::databaseResultsChanged, this, &Chat::handleDatabaseResultsChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::modelInfoChanged, this, &Chat::handleModelInfoChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::trySwitchContextOfLoadedModelCompleted, this, &Chat::trySwitchContextOfLoadedModelCompleted, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::trySwitchContextOfLoadedModelCompleted, this, &Chat::handleTrySwitchContextOfLoadedModelCompleted, Qt::QueuedConnection);
connect(this, &Chat::promptRequested, m_llmodel, &ChatLLM::prompt, Qt::QueuedConnection);
connect(this, &Chat::modelChangeRequested, m_llmodel, &ChatLLM::modelChangeRequested, Qt::QueuedConnection);
@ -95,16 +95,6 @@ void Chat::processSystemPrompt()
emit processSystemPromptRequested();
}
bool Chat::isModelLoaded() const
{
return m_modelLoadingPercentage == 1.0f;
}
float Chat::modelLoadingPercentage() const
{
return m_modelLoadingPercentage;
}
void Chat::resetResponseState()
{
if (m_responseInProgress && m_responseState == Chat::LocalDocsRetrieval)
@ -167,9 +157,16 @@ void Chat::handleModelLoadingPercentageChanged(float loadingPercentage)
if (loadingPercentage == m_modelLoadingPercentage)
return;
bool wasLoading = isCurrentlyLoading();
bool wasLoaded = isModelLoaded();
m_modelLoadingPercentage = loadingPercentage;
emit modelLoadingPercentageChanged();
if (isCurrentlyLoading() != wasLoading)
emit isCurrentlyLoadingChanged();
if (isModelLoaded() != wasLoaded)
emit isModelLoadedChanged();
}
@ -247,10 +244,6 @@ void Chat::setModelInfo(const ModelInfo &modelInfo)
if (m_modelInfo == modelInfo && isModelLoaded())
return;
m_modelLoadingPercentage = std::numeric_limits<float>::min(); // small non-zero positive value
emit isModelLoadedChanged();
m_modelLoadingError = QString();
emit modelLoadingErrorChanged();
m_modelInfo = modelInfo;
emit modelInfoChanged();
emit modelChangeRequested(modelInfo);
@ -320,8 +313,9 @@ void Chat::forceReloadModel()
void Chat::trySwitchContextOfLoadedModel()
{
m_trySwitchContextInProgress = 1;
emit trySwitchContextInProgressChanged();
m_llmodel->requestTrySwitchContext();
}
void Chat::generatedNameChanged(const QString &name)
@ -342,8 +336,10 @@ void Chat::handleRecalculating()
void Chat::handleModelLoadingError(const QString &error)
{
if (!error.isEmpty()) {
auto stream = qWarning().noquote() << "ERROR:" << error << "id";
stream.quote() << id();
}
m_modelLoadingError = error;
emit modelLoadingErrorChanged();
}
@ -380,6 +376,11 @@ void Chat::handleModelInfoChanged(const ModelInfo &modelInfo)
emit modelInfoChanged();
}
void Chat::handleTrySwitchContextOfLoadedModelCompleted(int value) {
m_trySwitchContextInProgress = value;
emit trySwitchContextInProgressChanged();
}
bool Chat::serialize(QDataStream &stream, int version) const
{
stream << m_creationDate;

@ -17,6 +17,7 @@ class Chat : public QObject
Q_PROPERTY(QString name READ name WRITE setName NOTIFY nameChanged)
Q_PROPERTY(ChatModel *chatModel READ chatModel NOTIFY chatModelChanged)
Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
Q_PROPERTY(bool isCurrentlyLoading READ isCurrentlyLoading NOTIFY isCurrentlyLoadingChanged)
Q_PROPERTY(float modelLoadingPercentage READ modelLoadingPercentage NOTIFY modelLoadingPercentageChanged)
Q_PROPERTY(QString response READ response NOTIFY responseChanged)
Q_PROPERTY(ModelInfo modelInfo READ modelInfo WRITE setModelInfo NOTIFY modelInfoChanged)
@ -30,6 +31,8 @@ class Chat : public QObject
Q_PROPERTY(QString device READ device NOTIFY deviceChanged);
Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY fallbackReasonChanged);
Q_PROPERTY(LocalDocsCollectionsModel *collectionModel READ collectionModel NOTIFY collectionModelChanged)
// 0=no, 1=waiting, 2=working
Q_PROPERTY(int trySwitchContextInProgress READ trySwitchContextInProgress NOTIFY trySwitchContextInProgressChanged)
QML_ELEMENT
QML_UNCREATABLE("Only creatable from c++!")
@ -62,8 +65,9 @@ public:
Q_INVOKABLE void reset();
Q_INVOKABLE void processSystemPrompt();
bool isModelLoaded() const { return m_modelLoadingPercentage == 1.0f; }
bool isCurrentlyLoading() const { return m_modelLoadingPercentage > 0.0f && m_modelLoadingPercentage < 1.0f; }
float modelLoadingPercentage() const { return m_modelLoadingPercentage; }
Q_INVOKABLE void prompt(const QString &prompt);
Q_INVOKABLE void regenerateResponse();
Q_INVOKABLE void stopGenerating();
@ -105,6 +109,8 @@ public:
QString device() const { return m_device; }
QString fallbackReason() const { return m_fallbackReason; }
int trySwitchContextInProgress() const { return m_trySwitchContextInProgress; }
public Q_SLOTS:
void serverNewPromptResponsePair(const QString &prompt);
@ -113,6 +119,7 @@ Q_SIGNALS:
void nameChanged();
void chatModelChanged();
void isModelLoadedChanged();
void isCurrentlyLoadingChanged();
void modelLoadingPercentageChanged();
void modelLoadingWarning(const QString &warning);
void responseChanged();
@ -136,8 +143,7 @@ Q_SIGNALS:
void deviceChanged();
void fallbackReasonChanged();
void collectionModelChanged();
void trySwitchContextInProgressChanged();
private Q_SLOTS:
void handleResponseChanged(const QString &response);
@ -152,6 +158,7 @@ private Q_SLOTS:
void handleFallbackReasonChanged(const QString &device);
void handleDatabaseResultsChanged(const QList<ResultInfo> &results);
void handleModelInfoChanged(const ModelInfo &modelInfo);
void handleTrySwitchContextOfLoadedModelCompleted(int value);
private:
QString m_id;
@ -176,6 +183,8 @@ private:
float m_modelLoadingPercentage = 0.0f;
LocalDocsCollectionsModel *m_collectionModel;
bool m_firstResponse = true;
int m_trySwitchContextInProgress = 0;
bool m_isCurrentlyLoading = false;
};
#endif // CHAT_H

@ -15,18 +15,19 @@ ChatListModel *ChatListModel::globalInstance()
}
ChatListModel::ChatListModel()
: QAbstractListModel(nullptr) {}
void ChatListModel::loadChats()
{
addChat();
ChatsRestoreThread *thread = new ChatsRestoreThread;
connect(thread, &ChatsRestoreThread::chatRestored, this, &ChatListModel::restoreChat);
connect(thread, &ChatsRestoreThread::finished, this, &ChatListModel::chatsRestoredFinished);
connect(thread, &ChatsRestoreThread::chatRestored, this, &ChatListModel::restoreChat, Qt::QueuedConnection);
connect(thread, &ChatsRestoreThread::finished, this, &ChatListModel::chatsRestoredFinished, Qt::QueuedConnection);
connect(thread, &ChatsRestoreThread::finished, thread, &QObject::deleteLater);
thread->start();
connect(MySettings::globalInstance(), &MySettings::serverChatChanged, this, &ChatListModel::handleServerEnabledChanged);
}
void ChatListModel::removeChatFile(Chat *chat) const

@ -81,11 +81,15 @@ public:
bool shouldSaveChatGPTChats() const;
void setShouldSaveChatGPTChats(bool b);
Q_INVOKABLE void loadChats();
Q_INVOKABLE void addChat()
{
// Select the existing new chat if we already have one
if (m_newChat) {
setCurrentChat(m_newChat);
return;
}
// Create a new chat pointer and connect it to determine when it is populated
m_newChat = new Chat(this);
@ -114,20 +118,6 @@ public:
emit countChanged();
}
void setNewChat(Chat* chat)
{
// Don't add a new chat if we already have one
if (m_newChat)
return;
m_newChat = chat;
connect(m_newChat->chatModel(), &ChatModel::countChanged,
this, &ChatListModel::newChatCountChanged);
connect(m_newChat, &Chat::nameChanged,
this, &ChatListModel::nameChanged);
setCurrentChat(m_newChat);
}
Q_INVOKABLE void removeChat(Chat* chat)
{
Q_ASSERT(chat != m_serverChat);
@ -195,7 +185,11 @@ public:
int count() const { return m_chats.size(); }
// stop ChatLLM threads for clean shutdown
void destroyChats()
{
for (auto *chat: m_chats) { chat->destroy(); }
ChatLLM::destroyStore();
}
void removeChatFile(Chat *chat) const;
Q_INVOKABLE void saveChats();

@ -30,16 +30,17 @@ public:
static LLModelStore *globalInstance();
LLModelInfo acquireModel(); // will block until llmodel is ready
void releaseModel(const LLModelInfo &info); // must be called when you are done
void releaseModel(LLModelInfo &&info); // must be called when you are done
void destroy();
private:
LLModelStore()
{
// seed with empty model
m_availableModel = LLModelInfo();
}
~LLModelStore() {}
std::optional<LLModelInfo> m_availableModel;
QMutex m_mutex;
QWaitCondition m_condition;
friend class MyLLModelStore;
@ -55,19 +56,27 @@ LLModelStore *LLModelStore::globalInstance()
LLModelInfo LLModelStore::acquireModel()
{
QMutexLocker locker(&m_mutex);
while (!m_availableModel)
m_condition.wait(locker.mutex());
auto first = std::move(*m_availableModel);
m_availableModel.reset();
return first;
}
void LLModelStore::releaseModel(const LLModelInfo &info)
void LLModelStore::releaseModel(LLModelInfo &&info)
{
QMutexLocker locker(&m_mutex);
Q_ASSERT(!m_availableModel);
m_availableModel = std::move(info);
m_condition.wakeAll();
}
void LLModelStore::destroy()
{
QMutexLocker locker(&m_mutex);
m_availableModel.reset();
}
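
The store is now a single-slot hand-off between chat threads. A condensed Python analogue of the same acquire/release/destroy protocol, for illustration only:

```python
import threading

class ModelStore:
    """Single-slot store: acquire() blocks until a model is available and
    takes ownership; release() puts exactly one model back and wakes a
    waiter; destroy() drops any stored model so shutdown does not leak it."""

    def __init__(self):
        self._cond = threading.Condition()
        self._slot = object()  # seeded with an empty placeholder, as in the C++

    def acquire(self):
        with self._cond:
            while self._slot is None:
                self._cond.wait()
            model, self._slot = self._slot, None
            return model

    def release(self, model):
        with self._cond:
            assert self._slot is None  # at most one model may be stored
            self._slot = model
            self._cond.notify_all()

    def destroy(self):
        with self._cond:
            self._slot = None
```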
ChatLLM::ChatLLM(Chat *parent, bool isServer)
: QObject{nullptr}
, m_promptResponseTokens(0)
@ -76,7 +85,6 @@ ChatLLM::ChatLLM(Chat *parent, bool isServer)
, m_shouldBeLoaded(false)
, m_forceUnloadModel(false)
, m_markedForDeletion(false)
, m_shouldTrySwitchContext(false)
, m_stopGenerating(false)
, m_timer(nullptr)
, m_isServer(isServer)
@ -88,7 +96,7 @@ ChatLLM::ChatLLM(Chat *parent, bool isServer)
moveToThread(&m_llmThread);
connect(this, &ChatLLM::shouldBeLoadedChanged, this, &ChatLLM::handleShouldBeLoadedChanged,
Qt::QueuedConnection); // explicitly queued
connect(this, &ChatLLM::shouldTrySwitchContextChanged, this, &ChatLLM::handleShouldTrySwitchContextChanged,
connect(this, &ChatLLM::trySwitchContextRequested, this, &ChatLLM::trySwitchContextOfLoadedModel,
Qt::QueuedConnection); // explicitly queued
connect(parent, &Chat::idChanged, this, &ChatLLM::handleChatIdChanged);
connect(&m_llmThread, &QThread::started, this, &ChatLLM::handleThreadStarted);
@ -108,7 +116,8 @@ ChatLLM::~ChatLLM()
destroy();
}
void ChatLLM::destroy()
{
m_stopGenerating = true;
m_llmThread.quit();
m_llmThread.wait();
@ -116,11 +125,15 @@ void ChatLLM::destroy() {
// The only time we should have a model loaded here is on shutdown
// as we explicitly unload the model in all other circumstances
if (isModelLoaded()) {
m_llModelInfo.model.reset();
}
}
void ChatLLM::destroyStore()
{
LLModelStore::globalInstance()->destroy();
}
void ChatLLM::handleThreadStarted()
{
m_timer = new TokenTimer(this);
@ -130,7 +143,7 @@ void ChatLLM::handleThreadStarted()
void ChatLLM::handleForceMetalChanged(bool forceMetal)
{
#if defined(Q_OS_MAC) && defined(__arm__)
#if defined(Q_OS_MAC) && defined(__aarch64__)
m_forceMetal = forceMetal;
if (isModelLoaded() && m_shouldBeLoaded) {
m_reloadingToChangeVariant = true;
@ -161,7 +174,7 @@ bool ChatLLM::loadDefaultModel()
return loadModel(defaultModel);
}
void ChatLLM::trySwitchContextOfLoadedModel(const ModelInfo &modelInfo)
{
// We're trying to see if the store already has the model fully loaded that we wish to use
// and if so we just acquire it from the store and switch the context and return true. If the
@ -169,10 +182,11 @@ bool ChatLLM::trySwitchContextOfLoadedModel(const ModelInfo &modelInfo)
// If we're already loaded or a server or we're reloading to change the variant/device or the
// modelInfo is empty, then this should fail
if (
isModelLoaded() || m_isServer || m_reloadingToChangeVariant || modelInfo.name().isEmpty() || !m_shouldBeLoaded
) {
emit trySwitchContextOfLoadedModelCompleted(0);
return;
}
QString filePath = modelInfo.dirpath + modelInfo.filename();
@ -180,33 +194,28 @@ bool ChatLLM::trySwitchContextOfLoadedModel(const ModelInfo &modelInfo)
m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
// If the store gave us no already-loaded model, or the wrong type of model, then give it back to
// the store and fail
if (!m_llModelInfo.model || m_llModelInfo.fileInfo != fileInfo || !m_shouldBeLoaded) {
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
emit trySwitchContextOfLoadedModelCompleted(0);
return;
}
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
// We should be loaded and now we are
m_shouldBeLoaded = true;
emit trySwitchContextOfLoadedModelCompleted(2);
// Restore, signal and process
restoreState();
emit modelLoadingPercentageChanged(1.0f);
emit trySwitchContextOfLoadedModelCompleted(0);
processSystemPrompt();
}
bool ChatLLM::loadModel(const ModelInfo &modelInfo)
@ -223,6 +232,13 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
if (isModelLoaded() && this->modelInfo() == modelInfo)
return true;
// reset status
emit modelLoadingPercentageChanged(std::numeric_limits<float>::min()); // small non-zero positive value
emit modelLoadingError("");
emit reportFallbackReason("");
emit reportDevice("");
m_pristineLoadedState = false;
QString filePath = modelInfo.dirpath + modelInfo.filename();
QFileInfo fileInfo(filePath);
@ -231,28 +247,25 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
if (alreadyAcquired) {
resetContext();
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "already acquired model deleted" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "already acquired model deleted" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
m_llModelInfo.model.reset();
} else if (!m_isServer) {
// This is a blocking call that tries to retrieve the model we need from the model store.
// If it succeeds, then we just have to restore state. If the store has never had a model
// returned to it, then the modelInfo.model pointer should be null which will happen on startup
m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
// At this point it is possible that while we were blocked waiting to acquire the model from the
// store, that our state was changed to not be loaded. If this is the case, release the model
// back into the store and quit loading
if (!m_shouldBeLoaded) {
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "no longer need model" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "no longer need model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
emit modelLoadingPercentageChanged(0.0f);
return false;
}
@ -260,7 +273,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
// Check if the store just gave us exactly the model we were looking for
if (m_llModelInfo.model && m_llModelInfo.fileInfo == fileInfo && !m_reloadingToChangeVariant) {
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
restoreState();
emit modelLoadingPercentageChanged(1.0f);
@ -274,10 +287,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
} else {
// Release the memory since we have to switch to a different model.
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "deleting model" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "deleting model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
m_llModelInfo.model.reset();
}
}
@ -307,24 +319,35 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
model->setModelName(modelName);
model->setRequestURL(modelInfo.url());
model->setAPIKey(apiKey);
m_llModelInfo.model.reset(model);
} else {
QElapsedTimer modelLoadTimer;
modelLoadTimer.start();
auto requestedDevice = MySettings::globalInstance()->device();
auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
m_ctx.n_ctx = n_ctx;
auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);
std::string buildVariant = "auto";
#if defined(Q_OS_MAC) && defined(__arm__)
if (m_forceMetal)
buildVariant = "metal";
std::string backend = "auto";
#ifdef Q_OS_MAC
if (requestedDevice == "CPU") {
backend = "cpu";
} else if (m_forceMetal) {
#ifdef __aarch64__
backend = "metal";
#endif
}
#else // !defined(Q_OS_MAC)
if (requestedDevice.startsWith("CUDA: "))
backend = "cuda";
#endif
QString constructError;
m_llModelInfo.model.reset();
try {
auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
m_llModelInfo.model.reset(model);
} catch (const LLModel::MissingImplementationError &e) {
modelLoadProps.insert("error", "missing_model_impl");
constructError = e.what();
@ -350,12 +373,11 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
}
m_llModelInfo.model->setProgressCallback([this](float progress) -> bool {
progress = std::max(progress, std::numeric_limits<float>::min()); // keep progress above zero
emit modelLoadingPercentageChanged(progress);
return m_shouldBeLoaded;
});
emit reportFallbackReason(""); // no fallback yet
auto approxDeviceMemGB = [](const LLModel::GPUDevice *dev) {
float memGB = dev->heapSize / float(1024 * 1024 * 1024);
return std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place
@ -366,6 +388,8 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
{
const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
// Pick the best device
// NB: relies on the fact that Kompute devices are listed first
if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) {
defaultDevice = &availableDevices.front();
float memGB = defaultDevice->heapSize / float(1024 * 1024 * 1024);
@ -375,16 +399,18 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
}
}
QString actualDevice("CPU");
#if defined(Q_OS_MAC) && defined(__aarch64__)
if (m_llModelInfo.model->implementation().buildVariant() == "metal")
actualDevice = "Metal";
#else
if (requestedDevice != "CPU") {
const auto *device = defaultDevice;
if (requestedDevice != "Auto") {
// Use the selected device
for (const LLModel::GPUDevice &d : availableDevices) {
if (QString::fromStdString(d.selectionName()) == requestedDevice) {
device = &d;
break;
}
@ -397,15 +423,25 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
} else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
} else {
actualDevice = QString::fromStdString(device->reportedName());
modelLoadProps.insert("requested_device_mem", approxDeviceMemGB(device));
}
}
#endif
// Report which device we're actually using
emit reportDevice(actualDevice);
bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
if (!m_shouldBeLoaded) {
m_llModelInfo.model.reset();
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
m_llModelInfo = LLModelInfo();
emit modelLoadingPercentageChanged(0.0f);
return false;
}
if (actualDevice == "CPU") {
// we asked llama.cpp to use the CPU
} else if (!success) {
@ -414,6 +450,15 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
emit reportFallbackReason("<br>GPU loading failed (out of VRAM?)");
modelLoadProps.insert("cpu_fallback_reason", "gpu_load_failed");
success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
if (!m_shouldBeLoaded) {
m_llModelInfo.model.reset();
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
m_llModelInfo = LLModelInfo();
emit modelLoadingPercentageChanged(0.0f);
return false;
}
} else if (!m_llModelInfo.model->usingGPUDevice()) {
// ggml_vk_init was not called in llama.cpp
// We might have had to fallback to CPU after load if the model is not possible to accelerate
@@ -424,10 +469,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
}
if (!success) {
delete m_llModelInfo.model;
m_llModelInfo.model = nullptr;
m_llModelInfo.model.reset();
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(m_llModelInfo); // release back into the store
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
m_llModelInfo = LLModelInfo();
emit modelLoadingError(QString("Could not load model due to invalid model file for %1").arg(modelInfo.filename()));
modelLoadProps.insert("error", "loadmodel_failed");
@@ -437,10 +481,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
case 'G': m_llModelType = LLModelType::GPTJ_; break;
default:
{
delete m_llModelInfo.model;
m_llModelInfo.model = nullptr;
m_llModelInfo.model.reset();
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(m_llModelInfo); // release back into the store
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
m_llModelInfo = LLModelInfo();
emit modelLoadingError(QString("Could not determine model type for %1").arg(modelInfo.filename()));
}
@@ -450,13 +493,13 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
}
} else {
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(m_llModelInfo); // release back into the store
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
m_llModelInfo = LLModelInfo();
emit modelLoadingError(QString("Error loading %1: %2").arg(modelInfo.filename()).arg(constructError));
}
}
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "new model" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "new model" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
restoreState();
#if defined(DEBUG)
@@ -470,7 +513,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
Network::globalInstance()->trackChatEvent("model_load", modelLoadProps);
} else {
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(m_llModelInfo); // release back into the store
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo)); // release back into the store
m_llModelInfo = LLModelInfo();
emit modelLoadingError(QString("Could not find file for model %1").arg(modelInfo.filename()));
}
@@ -479,7 +522,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
setModelInfo(modelInfo);
processSystemPrompt();
}
return m_llModelInfo.model;
return bool(m_llModelInfo.model);
}
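A note on the changed `return` above: a raw `LLModel *` converts to `bool` implicitly, but `std::unique_ptr`'s `operator bool` is `explicit`, so outside of a condition the conversion must be spelled out. A minimal illustration:

#include <memory>

bool hasModel(const std::unique_ptr<int> &p)
{
    // return p;    // ill-formed: unique_ptr's operator bool is explicit
    if (p)          // fine: contextual conversion is allowed in conditions
        return true;
    return bool(p); // fine: explicit conversion, as in loadModel() above
}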
bool ChatLLM::isModelLoaded() const
@@ -699,22 +742,23 @@ bool ChatLLM::promptInternal(const QList<QString> &collectionList, const QString
emit responseChanged(QString::fromStdString(m_response));
}
emit responseStopped(elapsed);
m_pristineLoadedState = false;
return true;
}
void ChatLLM::setShouldBeLoaded(bool b)
{
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "setShouldBeLoaded" << m_llmThread.objectName() << b << m_llModelInfo.model;
qDebug() << "setShouldBeLoaded" << m_llmThread.objectName() << b << m_llModelInfo.model.get();
#endif
m_shouldBeLoaded = b; // atomic
emit shouldBeLoadedChanged();
}
void ChatLLM::setShouldTrySwitchContext(bool b)
void ChatLLM::requestTrySwitchContext()
{
m_shouldTrySwitchContext = b; // atomic
emit shouldTrySwitchContextChanged();
m_shouldBeLoaded = true; // atomic
emit trySwitchContextRequested(modelInfo());
}
void ChatLLM::handleShouldBeLoadedChanged()
@@ -725,12 +769,6 @@ void ChatLLM::handleShouldBeLoadedChanged()
unloadModel();
}
void ChatLLM::handleShouldTrySwitchContextChanged()
{
if (m_shouldTrySwitchContext)
trySwitchContextOfLoadedModel(modelInfo());
}
void ChatLLM::unloadModel()
{
if (!isModelLoaded() || m_isServer)
@@ -745,17 +783,16 @@ void ChatLLM::unloadModel()
saveState();
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "unloadModel" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "unloadModel" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
if (m_forceUnloadModel) {
delete m_llModelInfo.model;
m_llModelInfo.model = nullptr;
m_llModelInfo.model.reset();
m_forceUnloadModel = false;
}
LLModelStore::globalInstance()->releaseModel(m_llModelInfo);
m_llModelInfo = LLModelInfo();
LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
m_pristineLoadedState = false;
}
void ChatLLM::reloadModel()
@@ -767,7 +804,7 @@ void ChatLLM::reloadModel()
return;
#if defined(DEBUG_MODEL_LOADING)
qDebug() << "reloadModel" << m_llmThread.objectName() << m_llModelInfo.model;
qDebug() << "reloadModel" << m_llmThread.objectName() << m_llModelInfo.model.get();
#endif
const ModelInfo m = modelInfo();
if (m.name().isEmpty())
@@ -782,18 +819,19 @@ void ChatLLM::generateName()
if (!isModelLoaded())
return;
std::string instructPrompt("### Instruction:\n%1\n### Response:\n"); // standard Alpaca
auto promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
auto promptFunc = std::bind(&ChatLLM::handleNamePrompt, this, std::placeholders::_1);
auto responseFunc = std::bind(&ChatLLM::handleNameResponse, this, std::placeholders::_1, std::placeholders::_2);
auto recalcFunc = std::bind(&ChatLLM::handleNameRecalculate, this, std::placeholders::_1);
LLModel::PromptContext ctx = m_ctx;
m_llModelInfo.model->prompt("Describe response above in three words.", instructPrompt, promptFunc, responseFunc,
recalcFunc, ctx);
m_llModelInfo.model->prompt("Describe the above conversation in three words or less.",
promptTemplate.toStdString(), promptFunc, responseFunc, recalcFunc, ctx);
std::string trimmed = trim_whitespace(m_nameResponse);
if (trimmed != m_nameResponse) {
m_nameResponse = trimmed;
emit generatedNameChanged(QString::fromStdString(m_nameResponse));
}
m_pristineLoadedState = false;
}
void ChatLLM::handleChatIdChanged(const QString &id)
@@ -933,7 +971,10 @@ bool ChatLLM::deserialize(QDataStream &stream, int version, bool deserializeKV,
// If we do not deserialize the KV or it is discarded, then we need to restore the state from the
// text only. This will be a costly operation, but the chat has to be restored from the text archive
// alone.
m_restoreStateFromText = !deserializeKV || discardKV;
if (!deserializeKV || discardKV) {
m_restoreStateFromText = true;
m_pristineLoadedState = true;
}
if (!deserializeKV) {
#if defined(DEBUG)
@@ -997,14 +1038,14 @@
void ChatLLM::saveState()
{
if (!isModelLoaded())
if (!isModelLoaded() || m_pristineLoadedState)
return;
if (m_llModelType == LLModelType::API_) {
m_state.clear();
QDataStream stream(&m_state, QIODeviceBase::WriteOnly);
stream.setVersion(QDataStream::Qt_6_4);
ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model);
ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model.get());
stream << chatAPI->context();
return;
}
@@ -1025,7 +1066,7 @@ void ChatLLM::restoreState()
if (m_llModelType == LLModelType::API_) {
QDataStream stream(&m_state, QIODeviceBase::ReadOnly);
stream.setVersion(QDataStream::Qt_6_4);
ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model);
ChatAPI *chatAPI = static_cast<ChatAPI*>(m_llModelInfo.model.get());
QList<QString> context;
stream >> context;
chatAPI->setContext(context);
@@ -1044,13 +1085,18 @@ void ChatLLM::restoreState()
if (m_llModelInfo.model->stateSize() == m_state.size()) {
m_llModelInfo.model->restoreState(static_cast<const uint8_t*>(reinterpret_cast<void*>(m_state.data())));
m_processedSystemPrompt = true;
m_pristineLoadedState = true;
} else {
qWarning() << "restoring state from text because" << m_llModelInfo.model->stateSize() << "!=" << m_state.size();
m_restoreStateFromText = true;
}
m_state.clear();
m_state.squeeze();
// free local state copy unless unload is pending
if (m_shouldBeLoaded) {
m_state.clear();
m_state.squeeze();
m_pristineLoadedState = false;
}
}
void ChatLLM::processSystemPrompt()
@@ -1104,6 +1150,7 @@ void ChatLLM::processSystemPrompt()
#endif
m_processedSystemPrompt = m_stopGenerating == false;
m_pristineLoadedState = false;
}
void ChatLLM::processRestoreStateFromText()
@@ -1162,4 +1209,6 @@ void ChatLLM::processRestoreStateFromText()
m_isRecalc = false;
emit recalcChanged();
m_pristineLoadedState = false;
}

@@ -5,6 +5,8 @@
#include <QThread>
#include <QFileInfo>
#include <memory>
#include "database.h"
#include "modellist.h"
#include "../gpt4all-backend/llmodel.h"
@@ -16,7 +18,7 @@ enum LLModelType {
};
struct LLModelInfo {
LLModel *model = nullptr;
std::unique_ptr<LLModel> model;
QFileInfo fileInfo;
// NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
// must be able to serialize the information even if it is in the unloaded state
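This raw-pointer-to-`unique_ptr` change is why every `releaseModel(m_llModelInfo)` call earlier in the diff gains a `std::move`. A minimal sketch of the handoff, under the assumption that the store receives the `LLModelInfo` by value:

#include <memory>
#include <utility>

struct Model { };                       // stand-in type for illustration

struct Info { std::unique_ptr<Model> model; };

void releaseModel(Info info) { }        // assumed: store takes Info by value

int main()
{
    Info i{std::make_unique<Model>()};
    releaseModel(std::move(i));         // required: Info is now move-only
    // i.model is null here; ownership of the model went to the store
}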
@@ -72,6 +74,7 @@ public:
virtual ~ChatLLM();
void destroy();
static void destroyStore();
bool isModelLoaded() const;
void regenerateResponse();
void resetResponse();
@@ -81,7 +84,7 @@ public:
bool shouldBeLoaded() const { return m_shouldBeLoaded; }
void setShouldBeLoaded(bool b);
void setShouldTrySwitchContext(bool b);
void requestTrySwitchContext();
void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }
@@ -101,7 +104,7 @@ public:
public Q_SLOTS:
bool prompt(const QList<QString> &collectionList, const QString &prompt);
bool loadDefaultModel();
bool trySwitchContextOfLoadedModel(const ModelInfo &modelInfo);
void trySwitchContextOfLoadedModel(const ModelInfo &modelInfo);
bool loadModel(const ModelInfo &modelInfo);
void modelChangeRequested(const ModelInfo &modelInfo);
void unloadModel();
@@ -109,7 +112,6 @@ public Q_SLOTS:
void generateName();
void handleChatIdChanged(const QString &id);
void handleShouldBeLoadedChanged();
void handleShouldTrySwitchContextChanged();
void handleThreadStarted();
void handleForceMetalChanged(bool forceMetal);
void handleDeviceChanged();
@@ -128,8 +130,8 @@ Q_SIGNALS:
void stateChanged();
void threadStarted();
void shouldBeLoadedChanged();
void shouldTrySwitchContextChanged();
void trySwitchContextOfLoadedModelCompleted(bool);
void trySwitchContextRequested(const ModelInfo &modelInfo);
void trySwitchContextOfLoadedModelCompleted(int value);
void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
void reportSpeed(const QString &speed);
void reportDevice(const QString &device);
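The `bool` → `int` widening of `trySwitchContextOfLoadedModelCompleted` pairs with the main.qml changes later in this diff, which display distinct text for two in-progress values. As far as the QML checks reveal (the enum name and labels below are hypothetical, not in the PR):

// hypothetical labels for the int values checked in main.qml below
enum SwitchContextState {
    Done      = 0, // presumably: no switch in progress / finished
    Waiting   = 1, // UI shows "Waiting for model..."
    Switching = 2  // UI shows "Switching context..."
};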
@@ -172,7 +174,6 @@ private:
QThread m_llmThread;
std::atomic<bool> m_stopGenerating;
std::atomic<bool> m_shouldBeLoaded;
std::atomic<bool> m_shouldTrySwitchContext;
std::atomic<bool> m_isRecalc;
std::atomic<bool> m_forceUnloadModel;
std::atomic<bool> m_markedForDeletion;
@@ -181,6 +182,10 @@ private:
bool m_reloadingToChangeVariant;
bool m_processedSystemPrompt;
bool m_restoreStateFromText;
// m_pristineLoadedState is set if saveState is unnecessary, either because:
// - an unload was queued during LLModel::restoreState()
// - the chat will be restored from text and hasn't been interacted with yet
bool m_pristineLoadedState = false;
QVector<QPair<QString, QString>> m_stateFromText;
};
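Tying the new flag together: its consumer visible in this diff is `saveState()`, which now early-outs when the in-memory model state cannot have diverged from what was just restored. Condensed:

void ChatLLM::saveState()
{
    if (!isModelLoaded() || m_pristineLoadedState)
        return; // nothing to save: state is byte-identical to what was loaded
    // ... serialize the model state into m_state as before ...
}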

@@ -5,10 +5,7 @@ set(DATA_DIR ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN
set(BIN_DIR ${DATA_DIR}/bin)
set(Qt6_ROOT_DIR "@Qt6_ROOT_DIR@")
set(ENV{LD_LIBRARY_PATH} "${BIN_DIR}:${Qt6_ROOT_DIR}/../lib/")
execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2 -exclude-libs=libcuda.so.1)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
DESTINATION ${DATA_DIR})
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"

@@ -4,21 +4,16 @@ set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2)
file(GLOB MYGPTJLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libgptj*)
file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*)
file(GLOB MYBERTLLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libbert*)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*)
file(COPY ${MYGPTJLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLAMALIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYBERTLLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/favicon.icns"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Resources)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/favicon.icns"
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)

@@ -2,12 +2,9 @@ set(WINDEPLOYQT "@WINDEPLOYQT@")
set(COMPONENT_NAME_MAIN "@COMPONENT_NAME_MAIN@")
set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
execute_process(COMMAND ${WINDEPLOYQT} --qmldir ${CMAKE_CURRENT_SOURCE_DIR} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/favicon.ico"
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.ico"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)

@@ -19,7 +19,7 @@ Component.prototype.createOperations = function()
targetDirectory + "/bin/chat.exe",
"@UserProfile@/Desktop/GPT4All.lnk",
"workingDirectory=" + targetDirectory + "/bin",
"iconPath=" + targetDirectory + "/favicon.ico",
"iconPath=" + targetDirectory + "/gpt4all.ico",
"iconId=0", "description=Open GPT4All");
} catch (e) {
print("ERROR: creating desktop shortcut" + e);
@@ -28,7 +28,7 @@ Component.prototype.createOperations = function()
targetDirectory + "/bin/chat.exe",
"@StartMenuDir@/GPT4All.lnk",
"workingDirectory=" + targetDirectory + "/bin",
"iconPath=" + targetDirectory + "/favicon.ico",
"iconPath=" + targetDirectory + "/gpt4all.ico",
"iconId=0", "description=Open GPT4All");
} else if (systemInfo.productType === "osx") {
var gpt4allAppPath = targetDirectory + "/bin/gpt4all.app";

@@ -552,6 +552,7 @@ Database::~Database()
{
m_dbThread.quit();
m_dbThread.wait();
delete m_embLLM;
}
void Database::scheduleNext(int folder_id, size_t countForFolder)

@@ -5,6 +5,7 @@ EmbeddingLLMWorker::EmbeddingLLMWorker()
: QObject(nullptr)
, m_networkManager(new QNetworkAccessManager(this))
, m_model(nullptr)
, m_stopGenerating(false)
{
moveToThread(&m_workerThread);
connect(this, &EmbeddingLLMWorker::finished, &m_workerThread, &QThread::quit, Qt::DirectConnection);
@@ -14,6 +15,10 @@ EmbeddingLLMWorker::EmbeddingLLMWorker()
EmbeddingLLMWorker::~EmbeddingLLMWorker()
{
m_stopGenerating = true;
m_workerThread.quit();
m_workerThread.wait();
if (m_model) {
delete m_model;
m_model = nullptr;
@@ -148,6 +153,9 @@ void EmbeddingLLMWorker::requestSyncEmbedding(const QString &text)
// this function is always called for storage into the database
void EmbeddingLLMWorker::requestAsyncEmbedding(const QVector<EmbeddingChunk> &chunks)
{
if (m_stopGenerating)
return;
if (!hasModel() && !loadModel()) {
qWarning() << "WARNING: Could not load model for embeddings";
return;

@@ -58,6 +58,7 @@ private:
QNetworkAccessManager *m_networkManager;
std::vector<float> m_lastResponse;
LLModel *m_model = nullptr;
std::atomic<bool> m_stopGenerating;
QThread m_workerThread;
};
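The embedding-worker changes above follow the usual Qt worker-thread teardown order; spelled out with the reasoning as comments (this mirrors the destructor in the diff, nothing new):

EmbeddingLLMWorker::~EmbeddingLLMWorker()
{
    m_stopGenerating = true; // 1. queued requestAsyncEmbedding() calls become no-ops
    m_workerThread.quit();   // 2. ask the worker's event loop to exit
    m_workerThread.wait();   // 3. block until the thread has truly finished
    if (m_model) {           // 4. only now is it safe to free state the worker used
        delete m_model;
        m_model = nullptr;
    }
}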

@@ -2,9 +2,10 @@
<component type="desktop">
<id>io.gpt4all.gpt4all</id>
<metadata_license>CC0-1.0</metadata_license>
<project_license>MIT License</project_license>
<project_license>MIT</project_license>
<name>GPT4ALL</name>
<summary>Open-source assistant-style large language models that run locally on your CPU and GPU</summary>
<summary>Open-source assistant</summary>
<developer_name>Nomic-ai</developer_name>
<description>
<p>Cross-platform Qt-based GUI for GPT4All</p>
<ul>
@@ -29,20 +30,12 @@
<url type="bugtracker">https://github.com/nomic-ai/gpt4all/issues</url>
<url type="vcs-browser">https://github.com/nomic-ai/gpt4all</url>
<releases>
<release version="2.4.19" date="2023-09-16">
<description>
<p>
<ul>
<li>A bugfix for crashes on systems that have a corrupted Vulkan driver or a corrupted version of the Vulkan shared library</li>
</ul>
</p>
</description>
</release>
<release version="2.7.5" date="2024-05-03"></release>
</releases>
<launchable type="desktop-id">io.gpt4all.gpt4all.desktop</launchable>
<content_rating type="oars-1.0">
<content_rating type="oars-1.1">
<content_attribute id="language-profanity">mild</content_attribute>
<content_attribute id="language-humor">moderate</content_attribute>
<content_attribute id="language-discrimination">mild</content_attribute>
</content_rating>
</component>
</component>

@@ -65,10 +65,14 @@ MySettings::MySettings()
{
QSettings::setDefaultFormat(QSettings::IniFormat);
std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
QVector<QString> deviceList{ "Auto" };
#if defined(Q_OS_MAC) && defined(__aarch64__)
deviceList << "Metal";
#else
std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
for (LLModel::GPUDevice &d : devices)
deviceList << QString::fromStdString(d.name);
deviceList << QString::fromStdString(d.selectionName());
#endif
deviceList << "CPU";
setDeviceList(deviceList);
}
@@ -786,7 +790,23 @@ QString MySettings::device() const
{
QSettings setting;
setting.sync();
return setting.value("device", default_device).toString();
auto value = setting.value("device");
if (!value.isValid())
return default_device;
auto device = value.toString();
if (!device.isEmpty()) {
auto deviceStr = device.toStdString();
auto newNameStr = LLModel::GPUDevice::updateSelectionName(deviceStr);
if (newNameStr != deviceStr) {
auto newName = QString::fromStdString(newNameStr);
qWarning() << "updating device name:" << device << "->" << newName;
device = newName;
setting.setValue("device", device);
setting.sync();
}
}
return device;
}
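The getter above doubles as a one-time migration: a device name written by an older version is rewritten in place the first time it is read. The general shape, with `migrateName` standing in for `LLModel::GPUDevice::updateSelectionName`:

QSettings s;
QString stored = s.value("device").toString();
QString migrated = migrateName(stored);  // stand-in for updateSelectionName()
if (!stored.isEmpty() && migrated != stored) {
    s.setValue("device", migrated);      // persist so the rename happens once
    s.sync();
}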
void MySettings::setDevice(const QString &u)

@@ -39,7 +39,8 @@ Rectangle {
text: qsTr("\uFF0B New chat")
Accessible.description: qsTr("Create a new chat")
onClicked: {
ChatListModel.addChat();
ChatListModel.addChat()
conversationList.positionViewAtIndex(0, ListView.Beginning)
Network.trackEvent("new_chat", {"number_of_chats": ChatListModel.count})
}
}
@@ -60,6 +61,9 @@ Rectangle {
anchors.fill: parent
anchors.rightMargin: 10
model: ChatListModel
Component.onCompleted: ChatListModel.loadChats()
ScrollBar.vertical: ScrollBar {
parent: conversationList.parent
anchors.top: conversationList.top

@@ -122,10 +122,6 @@ Rectangle {
return ModelList.modelInfo(currentChat.modelInfo.id).name;
}
property bool isCurrentlyLoading: false
property real modelLoadingPercentage: 0.0
property bool trySwitchContextInProgress: false
PopupDialog {
id: errorCompatHardware
anchors.centerIn: parent
@@ -340,34 +336,18 @@ Rectangle {
implicitWidth: 575
width: window.width >= 750 ? implicitWidth : implicitWidth - (750 - window.width)
enabled: !currentChat.isServer
&& !window.trySwitchContextInProgress
&& !window.isCurrentlyLoading
&& !currentChat.trySwitchContextInProgress
&& !currentChat.isCurrentlyLoading
model: ModelList.installedModels
valueRole: "id"
textRole: "name"
function changeModel(index) {
window.modelLoadingPercentage = 0.0;
window.isCurrentlyLoading = true;
currentChat.stopGenerating()
currentChat.reset();
currentChat.modelInfo = ModelList.modelInfo(comboBox.valueAt(index))
}
Connections {
target: currentChat
function onModelLoadingPercentageChanged() {
window.modelLoadingPercentage = currentChat.modelLoadingPercentage;
window.isCurrentlyLoading = currentChat.modelLoadingPercentage !== 0.0
&& currentChat.modelLoadingPercentage !== 1.0;
}
function onTrySwitchContextOfLoadedModelAttempted() {
window.trySwitchContextInProgress = true;
}
function onTrySwitchContextOfLoadedModelCompleted() {
window.trySwitchContextInProgress = false;
}
}
Connections {
target: switchModelDialog
function onAccepted() {
@@ -377,14 +357,14 @@ Rectangle {
background: ProgressBar {
id: modelProgress
value: window.modelLoadingPercentage
value: currentChat.modelLoadingPercentage
background: Rectangle {
color: theme.mainComboBackground
radius: 10
}
contentItem: Item {
Rectangle {
visible: window.isCurrentlyLoading
visible: currentChat.isCurrentlyLoading
anchors.bottom: parent.bottom
width: modelProgress.visualPosition * parent.width
height: 10
@@ -406,13 +386,15 @@ Rectangle {
text: {
if (currentChat.modelLoadingError !== "")
return qsTr("Model loading error...")
if (window.trySwitchContextInProgress)
if (currentChat.trySwitchContextInProgress == 1)
return qsTr("Waiting for model...")
if (currentChat.trySwitchContextInProgress == 2)
return qsTr("Switching context...")
if (currentModelName() === "")
return qsTr("Choose a model...")
if (currentChat.modelLoadingPercentage === 0.0)
return qsTr("Reload \u00B7 ") + currentModelName()
if (window.isCurrentlyLoading)
if (currentChat.isCurrentlyLoading)
return qsTr("Loading \u00B7 ") + currentModelName()
return currentModelName()
}
@@ -456,7 +438,7 @@ Rectangle {
MyMiniButton {
id: ejectButton
visible: currentChat.isModelLoaded && !window.isCurrentlyLoading
visible: currentChat.isModelLoaded && !currentChat.isCurrentlyLoading
z: 500
anchors.right: parent.right
anchors.rightMargin: 50
@@ -474,8 +456,8 @@ Rectangle {
MyMiniButton {
id: reloadButton
visible: currentChat.modelLoadingError === ""
&& !window.trySwitchContextInProgress
&& !window.isCurrentlyLoading
&& !currentChat.trySwitchContextInProgress
&& !currentChat.isCurrentlyLoading
&& (currentChat.isModelLoaded || currentModelName() !== "")
z: 500
anchors.right: ejectButton.visible ? ejectButton.left : parent.right
@@ -1069,7 +1051,7 @@ Rectangle {
anchors.fill: parent
acceptedButtons: Qt.RightButton
onClicked: {
onClicked: (mouse) => {
if (mouse.button === Qt.RightButton) {
conversationContextMenu.x = conversationMouseArea.mouseX
conversationContextMenu.y = conversationMouseArea.mouseY
@@ -1082,11 +1064,19 @@ Rectangle {
id: conversationContextMenu
MenuItem {
text: qsTr("Copy")
enabled: myTextArea.selectedText !== ""
height: enabled ? implicitHeight : 0
onTriggered: myTextArea.copy()
}
MenuItem {
text: qsTr("Select All")
onTriggered: myTextArea.selectAll()
text: qsTr("Copy Message")
enabled: myTextArea.selectedText === ""
height: enabled ? implicitHeight : 0
onTriggered: {
myTextArea.selectAll()
myTextArea.copy()
myTextArea.deselect()
}
}
}
@@ -1336,8 +1326,9 @@ Rectangle {
textColor: theme.textColor
visible: !currentChat.isServer
&& !currentChat.isModelLoaded
&& !window.trySwitchContextInProgress
&& !window.isCurrentlyLoading
&& currentChat.modelLoadingError === ""
&& !currentChat.trySwitchContextInProgress
&& !currentChat.isCurrentlyLoading
&& currentModelName() !== ""
Image {
@@ -1408,11 +1399,11 @@ Rectangle {
Accessible.role: Accessible.EditableText
Accessible.name: placeholderText
Accessible.description: qsTr("Send messages/prompts to the model")
Keys.onReturnPressed: (event)=> {
if (event.modifiers & Qt.ControlModifier || event.modifiers & Qt.ShiftModifier)
event.accepted = false;
else {
editingFinished();
Keys.onReturnPressed: (event) => {
if (event.modifiers & Qt.ControlModifier || event.modifiers & Qt.ShiftModifier) {
event.accepted = false
} else if (!currentChat.responseInProgress) {
editingFinished()
sendMessage()
}
}
@@ -1441,7 +1432,7 @@ Rectangle {
anchors.fill: parent
acceptedButtons: Qt.RightButton
onClicked: {
onClicked: (mouse) => {
if (mouse.button === Qt.RightButton) {
textInputContextMenu.x = textInputMouseArea.mouseX
textInputContextMenu.y = textInputMouseArea.mouseY
@@ -1454,10 +1445,14 @@ Rectangle {
id: textInputContextMenu
MenuItem {
text: qsTr("Cut")
enabled: textInput.selectedText !== ""
height: enabled ? implicitHeight : 0
onTriggered: textInput.cut()
}
MenuItem {
text: qsTr("Copy")
enabled: textInput.selectedText !== ""
height: enabled ? implicitHeight : 0
onTriggered: textInput.copy()
}
MenuItem {
@@ -1482,6 +1477,7 @@ Rectangle {
width: 30
height: 30
visible: !currentChat.isServer
enabled: !currentChat.responseInProgress
source: "qrc:/gpt4all/icons/send_message.svg"
Accessible.name: qsTr("Send message")
Accessible.description: qsTr("Sends the message/prompt contained in textfield to the model")

(binary image changed: 75 KiB before, 75 KiB after)

@@ -0,0 +1 @@
IDI_ICON1 ICON "gpt4all.ico"

@@ -1,8 +0,0 @@
# GPT4All Docker
This directory will contain Dockerfiles to build out different gpt4all recipes.
For example:
1. Docker container that builds out gpt4all RESTful API.
2. Docker container that builds out gpt4all model backends and Python bindings.
3. Docker container that builds out everything.
4. etc.

@@ -1,36 +0,0 @@
# Monorepo Plan (DRAFT)
## Directory Structure
- gpt4all-api
- RESTful API
- gpt4all-backend
- C/C++ (ggml) model backends
- gpt4all-bindings
- Language bindings for model backends
- gpt4all-chat
- Chat GUI
- gpt4all-docker
- Dockerfile recipes for various gpt4all builds
- gpt4all-training
- Model training/inference/eval code
## Transition Plan:
This is roughly based on what's feasible now and the path of least resistance.
1. Clean up gpt4all-training.
- Remove deprecated/unneeded files
- Organize into separate training, inference, eval, etc. directories
2. Clean up gpt4all-chat so it roughly has the same structure as above
- Separate into gpt4all-chat and gpt4all-backends
- Separate model backends into separate subdirectories (e.g. llama, gptj)
3. Develop Python bindings (high priority and in-flight)
- Release Python binding as PyPi package
- Reimplement [Nomic GPT4All](https://github.com/nomic-ai/nomic/blob/main/nomic/gpt4all/gpt4all.py#L58-L190) to call new Python bindings
4. Develop Dockerfiles for different combinations of model backends and bindings
- Dockerfile for just model backend
- Dockerfile for model backend and Python bindings
5. Develop RESTful API / FastAPI