From 1b84a48c47a382dfa432dbf477a7234402a0f76c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 4 Apr 2024 14:52:13 -0400
Subject: [PATCH] python: add list_gpus to the GPT4All API (#2194)

Other changes:
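* fix memory leak in llmodel_available_gpu_devices (the returned array is now held in a thread-local buffer instead of a malloc'ed block handed to the caller)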
* drop model argument from llmodel_available_gpu_devices
* breaking: make GPT4All/Embed4All arguments past model_name keyword-only

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
---
 gpt4all-backend/llmodel.cpp                   |  4 +-
 gpt4all-backend/llmodel.h                     |  2 +-
 gpt4all-backend/llmodel_c.cpp                 | 50 +++++++++++++------
 gpt4all-backend/llmodel_c.h                   |  9 ++--
 gpt4all-bindings/python/gpt4all/_pyllmodel.py | 39 +++++++--------
 gpt4all-bindings/python/gpt4all/gpt4all.py    | 41 ++++++++++-----
 gpt4all-bindings/python/setup.py              |  2 +-
 gpt4all-bindings/typescript/index.cc          |  2 +-
 8 files changed, 91 insertions(+), 58 deletions(-)
diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp
index 8ef941ce..a87fbf80 100644
--- a/gpt4all-backend/llmodel.cpp
+++ b/gpt4all-backend/llmodel.cpp
@@ -213,9 +213,9 @@ LLModel *LLModel::Implementation::constructDefaultLlama() {
     return llama.get();
 }
 
-std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices() {
+std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
     auto *llama = constructDefaultLlama();
-    if (llama) { return llama->availableGPUDevices(0); }
+    if (llama) { return llama->availableGPUDevices(memoryRequired); }
     return {};
 }
 
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 17abb80f..2243c087 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -38,7 +38,7 @@ public:
         std::string_view buildVariant() const { return m_buildVariant; }
 
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
-        static std::vector<GPUDevice> availableGPUDevices();
+        static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
         static int32_t maxContextLength(const std::string &modelPath);
         static int32_t layerCount(const std::string &modelPath);
         static bool isEmbeddingModel(const std::string &modelPath);
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index 950a7320..a046ac76 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -4,6 +4,7 @@
 #include <cerrno>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <optional>
 #include <utility>
 
@@ -221,28 +222,45 @@ const char *llmodel_get_implementation_search_path()
     return LLModel::Implementation::implementationsSearchPath().c_str();
 }
 
-struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
-{
-    auto *wrapper = static_cast<LLModelWrapper *>(model);
-    std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
+// RAII wrapper around llmodel_gpu_device: the destructor frees the strdup'ed name and vendor strings
+struct llmodel_gpu_device_cpp: llmodel_gpu_device {
+    llmodel_gpu_device_cpp() = default;
 
-    // Set the num_devices
-    *num_devices = devices.size();
+    llmodel_gpu_device_cpp(const llmodel_gpu_device_cpp  &) = delete;
+    llmodel_gpu_device_cpp(      llmodel_gpu_device_cpp &&) = delete;
+
+    const llmodel_gpu_device_cpp &operator=(const llmodel_gpu_device_cpp  &) = delete;
+          llmodel_gpu_device_cpp &operator=(      llmodel_gpu_device_cpp &&) = delete;
+
+    ~llmodel_gpu_device_cpp() {
+        free(const_cast<char *>(name));
+        free(const_cast<char *>(vendor));
+    }
+};
 
-    if (*num_devices == 0) return nullptr;  // Return nullptr if no devices are found
+static_assert(sizeof(llmodel_gpu_device_cpp) == sizeof(llmodel_gpu_device));
 
-    // Allocate memory for the output array
-    struct llmodel_gpu_device* output = (struct llmodel_gpu_device*) malloc(*num_devices * sizeof(struct llmodel_gpu_device));
+struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired, int *num_devices)
+{
+    static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;
+
+    auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired);
+    *num_devices = devices.size();
 
-    for (int i = 0; i < *num_devices; i++) {
-        output[i].index = devices[i].index;
-        output[i].type = devices[i].type;
-        output[i].heapSize = devices[i].heapSize;
-        output[i].name = strdup(devices[i].name.c_str());  // Convert std::string to char* and allocate memory
-        output[i].vendor = strdup(devices[i].vendor.c_str());  // Convert std::string to char* and allocate memory
+    if (devices.empty()) { return nullptr; /* no devices */ }
+
+    c_devices = std::make_unique<llmodel_gpu_device_cpp[]>(devices.size());
+    for (unsigned i = 0; i < devices.size(); i++) {
+        const auto &dev  =   devices[i];
+              auto &cdev = c_devices[i];
+        cdev.index    = dev.index;
+        cdev.type     = dev.type;
+        cdev.heapSize = dev.heapSize;
+        cdev.name     = strdup(dev.name.c_str());
+        cdev.vendor   = strdup(dev.vendor.c_str());
     }
 
-    return output;
+    return c_devices.get();
 }
 
 bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index e26722ca..f7a54734 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -48,9 +48,9 @@ struct llmodel_prompt_context {
 };
 
 struct llmodel_gpu_device {
-    int index = 0;
-    int type = 0;           // same as VkPhysicalDeviceType
-    size_t heapSize = 0;
+    int index;
+    int type; // same as VkPhysicalDeviceType
+    size_t heapSize;
     const char * name;
     const char * vendor;
 };
@@ -241,9 +241,10 @@ const char *llmodel_get_implementation_search_path();
 
 /**
  * Get a list of available GPU devices given the memory required.
+ * @param memoryRequired The minimum amount of VRAM, in bytes
  * @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
  */
-struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices);
+struct llmodel_gpu_device* llmodel_available_gpu_devices(size_t memoryRequired, int* num_devices);
 
 /**
  * Initializes a GPU device based on a specified string criterion.
diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index 1c50d0aa..fc1ac6b0 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -138,7 +138,7 @@ llmodel.llmodel_threadCount.restype = ctypes.c_int32
 
 llmodel.llmodel_set_implementation_search_path(str(MODEL_LIB_PATH).encode())
 
-llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
+llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
 llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)
 
 llmodel.llmodel_gpu_init_gpu_device_by_string.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p]
@@ -214,13 +214,22 @@ class LLModel:
     def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")
 
-    def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]:
-        assert self.model is not None
+    @staticmethod
+    def list_gpus(mem_required: int = 0) -> list[str]:
+        """
+        List the names of the available GPU devices with at least `mem_required` bytes of VRAM.
+
+        Args:
+            mem_required: The minimum amount of VRAM, in bytes
+
+        Returns:
+            A list of strings representing the names of the available GPU devices.
+        """
         num_devices = ctypes.c_int32(0)
-        devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices))
+        devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
         if not devices_ptr:
             raise ValueError("Unable to retrieve available GPU devices")
-        return devices_ptr[:num_devices.value]
+        return [d.name.decode() for d in devices_ptr[:num_devices.value]]
 
     def init_gpu(self, device: str):
         if self.model is None:
@@ -231,23 +240,13 @@ class LLModel:
         if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()):
             return
 
-        # Retrieve all GPUs without considering memory requirements.
-        num_devices = ctypes.c_int32(0)
-        all_devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices))
-        if not all_devices_ptr:
-            raise ValueError("Unable to retrieve list of all GPU devices")
-        all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]]
-
-        # Retrieve GPUs that meet the memory requirements using list_gpu
-        available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)]
-
-        # Identify GPUs that are unavailable due to insufficient memory or features
+        all_gpus = self.list_gpus()
+        available_gpus = self.list_gpus(mem_required)
         unavailable_gpus = set(all_gpus).difference(available_gpus)
 
-        # Formulate the error message
-        error_msg = "Unable to initialize model on GPU: '{}'.".format(device)
-        error_msg += "\nAvailable GPUs: {}.".format(available_gpus)
-        error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
+        error_msg = "Unable to initialize model on GPU: {!r}".format(device)
+        error_msg += "\nAvailable GPUs: {}".format(available_gpus)
+        error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}".format(unavailable_gpus)
         raise ValueError(error_msg)
 
     def load_model(self) -> bool:
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index b7c11b07..5fef9e5b 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -19,8 +19,7 @@ from requests.exceptions import ChunkedEncodingError
 from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
-from . import _pyllmodel
-from ._pyllmodel import EmbedResult as EmbedResult
+from ._pyllmodel import EmbedResult as EmbedResult, LLModel, ResponseCallbackType, empty_response_callback
 
 if TYPE_CHECKING:
     from typing_extensions import Self, TypeAlias
@@ -44,16 +43,18 @@ class Embed4All:
 
     MIN_DIMENSIONALITY = 64
 
-    def __init__(self, model_name: str | None = None, n_threads: int | None = None, **kwargs):
+    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
         """
         Constructor
 
         Args:
             n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
+            device: The processing unit on which the embedding model will run. See the `GPT4All` constructor for more info.
+            kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor.
         """
         if model_name is None:
             model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
-        self.gpt4all = GPT4All(model_name, n_threads=n_threads, **kwargs)
+        self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs)
 
     def __enter__(self) -> Self:
         return self
@@ -157,6 +158,7 @@ class GPT4All:
     def __init__(
         self,
         model_name: str,
+        *,
         model_path: str | os.PathLike[str] | None = None,
         model_type: str | None = None,
         allow_download: bool = True,
@@ -181,7 +183,7 @@ class GPT4All:
                 - "cpu": Model will run on the central processing unit.
                 - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
                 - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
-                Alternatively, a specific GPU name can also be provided, and the model will run on the GPU that matches the name if it's available.
+                - A specific device name from the list returned by `GPT4All.list_gpus()`.
                 Default is "cpu".
 
                 Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
@@ -192,7 +194,7 @@ class GPT4All:
         self.model_type = model_type
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
-        self.model = _pyllmodel.LLModel(self.config["path"], n_ctx, ngl)
+        self.model = LLModel(self.config["path"], n_ctx, ngl)
         if device is not None and device != "cpu":
             self.model.init_gpu(device)
         self.model.load_model()
@@ -419,19 +421,19 @@ class GPT4All:
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: Literal[False] = ..., callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: Literal[False] = ..., callback: ResponseCallbackType = ...,
     ) -> str: ...
     @overload
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: Literal[True], callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: Literal[True], callback: ResponseCallbackType = ...,
     ) -> Iterable[str]: ...
     @overload
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: bool, callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: bool, callback: ResponseCallbackType = ...,
     ) -> Any: ...
 
     def generate(
@@ -448,7 +450,7 @@ class GPT4All:
         n_batch: int = 8,
         n_predict: int | None = None,
         streaming: bool = False,
-        callback: _pyllmodel.ResponseCallbackType = _pyllmodel.empty_response_callback,
+        callback: ResponseCallbackType = empty_response_callback,
     ) -> Any:
         """
         Generate outputs from any GPT4All model.
@@ -494,7 +496,7 @@ class GPT4All:
                 if reset:
                     # ingest system prompt
                     self.model.prompt_model(self._history[0]["content"], "%1",
-                                            _pyllmodel.empty_response_callback,
+                                            empty_response_callback,
                                             n_batch=n_batch, n_predict=0, special=True)
                 prompt_template = self._current_prompt_template.format("%1", "%2")
             else:
@@ -523,9 +525,9 @@ class GPT4All:
             output_collector = self._history
 
         def _callback_wrapper(
-            callback: _pyllmodel.ResponseCallbackType,
+            callback: ResponseCallbackType,
             output_collector: list[MessageType],
-        ) -> _pyllmodel.ResponseCallbackType:
+        ) -> ResponseCallbackType:
             def _callback(token_id: int, response: str) -> bool:
                 nonlocal callback, output_collector
 
@@ -589,6 +591,16 @@ class GPT4All:
             self._history = None
             self._current_prompt_template = "{0}"
 
+    @staticmethod
+    def list_gpus() -> list[str]:
+        """
+        List the names of the available GPU devices.
+
+        Returns:
+            A list of strings representing the names of the available GPU devices.
+        """
+        return LLModel.list_gpus()
+
     def _format_chat_prompt_template(
         self,
         messages: list[MessageType],
@@ -598,6 +610,9 @@ class GPT4All:
         """
         Helper method for building a prompt from list of messages using the self._current_prompt_template as a template for each message.
 
+        Warning:
+            This function was deprecated in version 2.3.0, and will be removed in a future release.
+
         Args:
             messages:  List of dictionaries. Each dictionary should have a "role" key
                 with value of "system", "assistant", or "user" and a "content" key with a
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index c8b50e86..fc44b256 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -68,7 +68,7 @@ def get_long_description():
 
 setup(
     name=package_name,
-    version="2.3.3",
+    version="2.4.0",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",
diff --git a/gpt4all-bindings/typescript/index.cc b/gpt4all-bindings/typescript/index.cc
index 0aca7755..8a4349ae 100644
--- a/gpt4all-bindings/typescript/index.cc
+++ b/gpt4all-bindings/typescript/index.cc
@@ -36,7 +36,7 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
     auto env = info.Env();
     int num_devices = 0;
     auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers);
-    llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(GetInference(), mem_size, &num_devices);
+    llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices);
     if (all_devices == nullptr)
     {
         Napi::Error::New(env, "Unable to retrieve list of all GPU devices").ThrowAsJavaScriptException();