From ba53ab5da0931b61cf1771aec27a3e390b36575c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Thu, 18 Apr 2024 14:52:02 -0400
Subject: [PATCH] python: do not print GPU name with verbose=False, expose this info via properties (#2222)

* llamamodel: only print device used in verbose mode

Signed-off-by: Jared Van Bortel

* python: expose backend and device via GPT4All properties

Signed-off-by: Jared Van Bortel

* backend: const correctness fixes

Signed-off-by: Jared Van Bortel

* python: bump version

Signed-off-by: Jared Van Bortel

* python: typing fixups

Signed-off-by: Jared Van Bortel

* python: fix segfault with closed GPT4All

Signed-off-by: Jared Van Bortel

---------

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/llamamodel.cpp                | 27 ++++++++++++---
 gpt4all-backend/llamamodel_impl.h             |  6 ++--
 gpt4all-backend/llmodel.h                     |  6 ++--
 gpt4all-backend/llmodel_c.cpp                 | 14 +++++++-
 gpt4all-backend/llmodel_c.h                   | 10 ++++++
 gpt4all-bindings/python/gpt4all/_pyllmodel.py | 34 +++++++++++++++----
 gpt4all-bindings/python/gpt4all/gpt4all.py    | 10 ++++++
 gpt4all-bindings/python/setup.py              |  2 +-
 8 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index dee528ee..15997e7f 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -364,8 +364,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
 #ifdef GGML_USE_KOMPUTE
-    if (usingGPUDevice() && ggml_vk_has_device()) {
-        std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+    if (usingGPUDevice()) {
+        if (llama_verbose()) {
+            std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+        }
         d_ptr->backend_name = "kompute";
     }
 #endif
@@ -558,7 +560,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
 #endif
 }
 
-bool LLamaModel::hasGPUDevice()
+bool LLamaModel::hasGPUDevice() const
 {
 #if defined(GGML_USE_KOMPUTE)
     return d_ptr->device != -1;
@@ -567,10 +569,12 @@ bool LLamaModel::hasGPUDevice()
 #endif
 }
 
-bool LLamaModel::usingGPUDevice()
+bool LLamaModel::usingGPUDevice() const
 {
 #if defined(GGML_USE_KOMPUTE)
-    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    assert(!hasDevice || ggml_vk_has_device());
+    return hasDevice;
 #elif defined(GGML_USE_METAL)
     return true;
 #else
@@ -578,6 +582,19 @@ bool LLamaModel::usingGPUDevice()
 #endif
 }
 
+const char *LLamaModel::backendName() const {
+    return d_ptr->backend_name;
+}
+
+const char *LLamaModel::gpuDeviceName() const {
+#if defined(GGML_USE_KOMPUTE)
+    if (usingGPUDevice()) {
+        return ggml_vk_current_device().name;
+    }
+#endif
+    return nullptr;
+}
+
 void llama_batch_add(
         struct llama_batch & batch,
         llama_token id,
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index f4c1a2e6..d36dc6d2 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -33,8 +33,10 @@ public:
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
     bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
     bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
-    bool hasGPUDevice() override;
-    bool usingGPUDevice() override;
+    bool hasGPUDevice() const override;
+    bool usingGPUDevice() const override;
+    const char *backendName() const override;
+    const char *gpuDeviceName() const override;
     size_t embeddingSize() const override;
 
     // user-specified prefix
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 4a873517..b4f22574 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -144,8 +144,10 @@ public:
         return false;
     }
 
-    virtual bool hasGPUDevice() { return false; }
-    virtual bool usingGPUDevice() { return false; }
+    virtual bool hasGPUDevice() const { return false; }
+    virtual bool usingGPUDevice() const { return false; }
+    virtual const char *backendName() const { return "cpu"; }
+    virtual const char *gpuDeviceName() const { return nullptr; }
 
     void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
 
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index aa02fee4..7fd8d5af 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -283,6 +283,18 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
 
 bool llmodel_has_gpu_device(llmodel_model model)
 {
-    auto *wrapper = static_cast<LLModelWrapper *>(model);
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->hasGPUDevice();
 }
+
+const char *llmodel_model_backend_name(llmodel_model model)
+{
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
+    return wrapper->llModel->backendName();
+}
+
+const char *llmodel_model_gpu_device_name(llmodel_model model)
+{
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
+    return wrapper->llModel->gpuDeviceName();
+}
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index 764f6ee9..35e08be1 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -295,6 +295,16 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
  */
 bool llmodel_has_gpu_device(llmodel_model model);
 
+/**
+ * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
+ */
+const char *llmodel_model_backend_name(llmodel_model model);
+
+/**
+ * @return The name of the GPU device currently in use, or NULL for backends other than Kompute.
+ */
+const char *llmodel_model_gpu_device_name(llmodel_model model);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index c4bf8ae4..ce2122eb 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -9,7 +9,7 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, NoReturn, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Literal, NoReturn, TypeVar, overload
 
 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
@@ -158,6 +158,12 @@ llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool
 llmodel.llmodel_has_gpu_device.argtypes = [ctypes.c_void_p]
 llmodel.llmodel_has_gpu_device.restype = ctypes.c_bool
 
+llmodel.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
+llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
+
+llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
+llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p
+
 ResponseCallbackType = Callable[[int, str], bool]
 RawResponseCallbackType = Callable[[int, bytes], bool]
 EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
@@ -224,6 +230,19 @@ class LLModel:
     def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")
 
+    @property
+    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+        if self.model is None:
+            self._raise_closed()
+        return llmodel.llmodel_model_backend_name(self.model).decode()
+
+    @property
+    def device(self) -> str | None:
+        if self.model is None:
+            self._raise_closed()
+        dev = llmodel.llmodel_model_gpu_device_name(self.model)
+        return None if dev is None else dev.decode()
+
     @staticmethod
     def list_gpus(mem_required: int = 0) -> list[str]:
         """
@@ -333,22 +352,23 @@ class LLModel:
 
     @overload
     def generate_embeddings(
-        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool, cancel_cb: EmbCancelCallbackType,
+        self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[float]]: ...
     @overload
     def generate_embeddings(
         self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[list[float]]]: ...
     @overload
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[Any]]: ...
 
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[Any]]:
         if not text:
             raise ValueError("text must not be None or empty")
@@ -368,11 +388,11 @@ class LLModel:
         for i, t in enumerate(text):
             c_texts[i] = t.encode()
 
-        def wrap_cancel_cb(batch_sizes: ctypes.POINTER(ctypes.c_uint), n_batch: int, backend: bytes) -> bool:
+        def wrap_cancel_cb(batch_sizes: Any, n_batch: int, backend: bytes) -> bool:
             assert cancel_cb is not None
             return cancel_cb(batch_sizes[:n_batch], backend.decode())
 
-        cancel_cb_wrapper = EmbCancelCallback(0x0 if cancel_cb is None else wrap_cancel_cb)
+        cancel_cb_wrapper = EmbCancelCallback() if cancel_cb is None else EmbCancelCallback(wrap_cancel_cb)
 
         # generate the embeddings
         embedding_ptr = llmodel.llmodel_embed(
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index 5ef81bf3..6424fc53 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -226,6 +226,16 @@ class GPT4All:
         """Delete the model instance and free associated system resources."""
         self.model.close()
 
+    @property
+    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
+        return self.model.backend
+
+    @property
+    def device(self) -> str | None:
+        """The name of the GPU device currently in use, or None for backends other than Kompute."""
+        return self.model.device
+
     @property
     def current_chat_session(self) -> list[MessageType] | None:
         return None if self._history is None else list(self._history)
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index c309250d..9e6a76ea 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -68,7 +68,7 @@ def get_long_description():
 
 setup(
     name=package_name,
-    version="2.5.2",
+    version="2.6.0",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",
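
A minimal sketch of the Python surface this patch exposes, assuming a locally available model file (the filename here is only an example, not part of the patch):

    from gpt4all import GPT4All

    # With verbose=False (the default), the backend no longer prints the GPU
    # name to stderr; the new properties are the way to inspect what is in use.
    model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device="gpu", verbose=False)
    print(model.backend)  # "cpu", "kompute", or "metal"
    print(model.device)   # GPU device name under Kompute, otherwise None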
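
The closed-model fix means property access now raises instead of dereferencing a freed handle. A sketch, assuming close() clears the underlying model as the segfault fix implies:

    model.close()
    try:
        model.backend
    except ValueError as e:
        print(e)  # Attempted operation on a closed LLModel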
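
The EmbCancelCallback change relies on a ctypes idiom: calling a CFUNCTYPE type with no arguments yields a null function pointer, which is the safe way to pass "no callback" across the C boundary (the old code passed 0x0 as the constructor argument instead). A standalone sketch of the idiom, with illustrative names that are not gpt4all's:

    import ctypes

    ExampleCb = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int)
    null_cb = ExampleCb()   # null function pointer: "no callback"
    print(bool(null_cb))    # False; C code can test the pointer before calling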