From ba53ab5da0931b61cf1771aec27a3e390b36575c Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Thu, 18 Apr 2024 14:52:02 -0400
Subject: [PATCH] python: do not print GPU name with verbose=False, expose this info via properties (#2222)

* llamamodel: only print device used in verbose mode

Signed-off-by: Jared Van Bortel

* python: expose backend and device via GPT4All properties

Signed-off-by: Jared Van Bortel

* backend: const correctness fixes

Signed-off-by: Jared Van Bortel

* python: bump version

Signed-off-by: Jared Van Bortel

* python: typing fixups

Signed-off-by: Jared Van Bortel

* python: fix segfault with closed GPT4All

Signed-off-by: Jared Van Bortel

---------

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/llamamodel.cpp                | 27 ++++++++++++---
 gpt4all-backend/llamamodel_impl.h             |  6 ++--
 gpt4all-backend/llmodel.h                     |  6 ++--
 gpt4all-backend/llmodel_c.cpp                 | 14 +++++++-
 gpt4all-backend/llmodel_c.h                   | 10 ++++++
 gpt4all-bindings/python/gpt4all/_pyllmodel.py | 34 +++++++++++++++----
 gpt4all-bindings/python/gpt4all/gpt4all.py    | 10 ++++++
 gpt4all-bindings/python/setup.py              |  2 +-
 8 files changed, 91 insertions(+), 18 deletions(-)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index dee528ee..15997e7f 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -364,8 +364,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
 #ifdef GGML_USE_KOMPUTE
-    if (usingGPUDevice() && ggml_vk_has_device()) {
-        std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+    if (usingGPUDevice()) {
+        if (llama_verbose()) {
+            std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+        }
         d_ptr->backend_name = "kompute";
     }
 #endif
@@ -558,7 +560,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
 #endif
 }
 
-bool LLamaModel::hasGPUDevice()
+bool LLamaModel::hasGPUDevice() const
 {
 #if defined(GGML_USE_KOMPUTE)
     return d_ptr->device != -1;
@@ -567,10 +569,12 @@ bool LLamaModel::hasGPUDevice()
 #endif
 }
 
-bool LLamaModel::usingGPUDevice()
+bool LLamaModel::usingGPUDevice() const
 {
 #if defined(GGML_USE_KOMPUTE)
-    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    assert(!hasDevice || ggml_vk_has_device());
+    return hasDevice;
 #elif defined(GGML_USE_METAL)
     return true;
 #else
@@ -578,6 +582,19 @@ bool LLamaModel::usingGPUDevice()
 #endif
 }
 
+const char *LLamaModel::backendName() const {
+    return d_ptr->backend_name;
+}
+
+const char *LLamaModel::gpuDeviceName() const {
+#if defined(GGML_USE_KOMPUTE)
+    if (usingGPUDevice()) {
+        return ggml_vk_current_device().name;
+    }
+#endif
+    return nullptr;
+}
+
 void llama_batch_add(
         struct llama_batch & batch,
         llama_token id,
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index f4c1a2e6..d36dc6d2 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -33,8 +33,10 @@ public:
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
     bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
     bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
-    bool hasGPUDevice() override;
-    bool usingGPUDevice() override;
+    bool hasGPUDevice() const override;
+    bool usingGPUDevice() const override;
+    const char *backendName() const override;
+    const char *gpuDeviceName() const override;
     size_t embeddingSize() const override;
 
     // user-specified prefix
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 4a873517..b4f22574 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -144,8 +144,10 @@ public:
         return false;
     }
 
-    virtual bool hasGPUDevice() { return false; }
-    virtual bool usingGPUDevice() { return false; }
+    virtual bool hasGPUDevice() const { return false; }
+    virtual bool usingGPUDevice() const { return false; }
+    virtual const char *backendName() const { return "cpu"; }
+    virtual const char *gpuDeviceName() const { return nullptr; }
 
     void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
 
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index aa02fee4..7fd8d5af 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -283,6 +283,18 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
 
 bool llmodel_has_gpu_device(llmodel_model model)
 {
-    auto *wrapper = static_cast<LLModelWrapper *>(model);
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->hasGPUDevice();
 }
+
+const char *llmodel_model_backend_name(llmodel_model model)
+{
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
+    return wrapper->llModel->backendName();
+}
+
+const char *llmodel_model_gpu_device_name(llmodel_model model)
+{
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
+    return wrapper->llModel->gpuDeviceName();
+}
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index 764f6ee9..35e08be1 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -295,6 +295,16 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
  */
 bool llmodel_has_gpu_device(llmodel_model model);
 
+/**
+ * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
+ */
+const char *llmodel_model_backend_name(llmodel_model model);
+
+/**
+ * @return The name of the GPU device currently in use, or NULL for backends other than Kompute.
+ */
+const char *llmodel_model_gpu_device_name(llmodel_model model);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index c4bf8ae4..ce2122eb 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -9,7 +9,7 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, NoReturn, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Literal, NoReturn, TypeVar, overload
 
 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
@@ -158,6 +158,12 @@ llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool
 llmodel.llmodel_has_gpu_device.argtypes = [ctypes.c_void_p]
 llmodel.llmodel_has_gpu_device.restype = ctypes.c_bool
 
+llmodel.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
+llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
+
+llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
+llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p
+
 ResponseCallbackType = Callable[[int, str], bool]
 RawResponseCallbackType = Callable[[int, bytes], bool]
 EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
@@ -224,6 +230,19 @@ class LLModel:
     def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")
 
+    @property
+    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+        if self.model is None:
+            self._raise_closed()
+        return llmodel.llmodel_model_backend_name(self.model).decode()
+
+    @property
+    def device(self) -> str | None:
+        if self.model is None:
+            self._raise_closed()
+        dev = llmodel.llmodel_model_gpu_device_name(self.model)
+        return None if dev is None else dev.decode()
+
     @staticmethod
     def list_gpus(mem_required: int = 0) -> list[str]:
         """
@@ -333,22 +352,23 @@ class LLModel:
 
     @overload
     def generate_embeddings(
-        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool, cancel_cb: EmbCancelCallbackType,
+        self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[float]]: ...
     @overload
     def generate_embeddings(
         self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[list[float]]]: ...
     @overload
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[Any]]: ...
 
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[Any]]:
         if not text:
             raise ValueError("text must not be None or empty")
@@ -368,11 +388,11 @@ class LLModel:
         for i, t in enumerate(text):
             c_texts[i] = t.encode()
 
-        def wrap_cancel_cb(batch_sizes: ctypes.POINTER(ctypes.c_uint), n_batch: int, backend: bytes) -> bool:
+        def wrap_cancel_cb(batch_sizes: Any, n_batch: int, backend: bytes) -> bool:
             assert cancel_cb is not None
             return cancel_cb(batch_sizes[:n_batch], backend.decode())
 
-        cancel_cb_wrapper = EmbCancelCallback(0x0 if cancel_cb is None else wrap_cancel_cb)
+        cancel_cb_wrapper = EmbCancelCallback() if cancel_cb is None else EmbCancelCallback(wrap_cancel_cb)
 
         # generate the embeddings
         embedding_ptr = llmodel.llmodel_embed(
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index 5ef81bf3..6424fc53 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -226,6 +226,16 @@ class GPT4All:
         """Delete the model instance and free associated system resources."""
         self.model.close()
 
+    @property
+    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
+        return self.model.backend
+
+    @property
+    def device(self) -> str | None:
+        """The name of the GPU device currently in use, or None for backends other than Kompute."""
+        return self.model.device
+
     @property
     def current_chat_session(self) -> list[MessageType] | None:
         return None if self._history is None else list(self._history)
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index c309250d..9e6a76ea 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -68,7 +68,7 @@ def get_long_description():
 
 setup(
     name=package_name,
-    version="2.5.2",
+    version="2.6.0",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",
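
A minimal sketch of the Python surface this patch exposes, assuming a locally available model file (the filename here is only an example, not part of the patch):

    from gpt4all import GPT4All

    # With verbose=False (the default), the backend no longer prints the GPU
    # name to stderr; the new properties are the way to inspect what is in use.
    model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device="gpu", verbose=False)
    print(model.backend)  # "cpu", "kompute", or "metal"
    print(model.device)   # GPU device name under Kompute, otherwise None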
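
The closed-model fix means property access now raises instead of dereferencing a freed handle. A sketch, assuming close() clears the underlying model as the segfault fix implies:

    model.close()
    try:
        model.backend
    except ValueError as e:
        print(e)  # Attempted operation on a closed LLModel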
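
The EmbCancelCallback change relies on a ctypes idiom: calling a CFUNCTYPE type with no arguments yields a null function pointer, which is the safe way to pass "no callback" across the C boundary (the old code passed 0x0 as the constructor argument instead). A standalone sketch of the idiom, with illustrative names that are not gpt4all's:

    import ctypes

    ExampleCb = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int)
    null_cb = ExampleCb()   # null function pointer: "no callback"
    print(bool(null_cb))    # False; C code can test the pointer before calling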