diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp
index 8ef941ce..a87fbf80 100644
--- a/gpt4all-backend/llmodel.cpp
+++ b/gpt4all-backend/llmodel.cpp
@@ -213,9 +213,9 @@ LLModel *LLModel::Implementation::constructDefaultLlama() {
     return llama.get();
 }
 
-std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices() {
+std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
     auto *llama = constructDefaultLlama();
-    if (llama) { return llama->availableGPUDevices(0); }
+    if (llama) { return llama->availableGPUDevices(memoryRequired); }
     return {};
 }
 
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 17abb80f..2243c087 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -38,7 +38,7 @@ public:
         std::string_view buildVariant() const { return m_buildVariant; }
 
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
-        static std::vector<GPUDevice> availableGPUDevices();
+        static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
        static int32_t maxContextLength(const std::string &modelPath);
        static int32_t layerCount(const std::string &modelPath);
        static bool isEmbeddingModel(const std::string &modelPath);
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index 950a7320..a046ac76 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <memory>
 #include
 #include
 
@@ -221,28 +222,45 @@ const char *llmodel_get_implementation_search_path()
 {
     return LLModel::Implementation::implementationsSearchPath().c_str();
 }
 
-struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
-{
-    auto *wrapper = static_cast<LLModelWrapper *>(model);
-    std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
+// RAII wrapper around a C-style struct
+struct llmodel_gpu_device_cpp: llmodel_gpu_device {
+    llmodel_gpu_device_cpp() = default;
 
-    // Set the num_devices
+    llmodel_gpu_device_cpp(const llmodel_gpu_device_cpp &) = delete;
+    llmodel_gpu_device_cpp(      llmodel_gpu_device_cpp &&) = delete;
+
+    const llmodel_gpu_device_cpp &operator=(const llmodel_gpu_device_cpp &) = delete;
+          llmodel_gpu_device_cpp &operator=(      llmodel_gpu_device_cpp &&) = delete;
+
+    ~llmodel_gpu_device_cpp() {
+        free(const_cast<char *>(name));
+        free(const_cast<char *>(vendor));
+    }
+};
+
+static_assert(sizeof(llmodel_gpu_device_cpp) == sizeof(llmodel_gpu_device));
+
+struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired, int *num_devices)
+{
+    static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;
+
+    auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired);
     *num_devices = devices.size();
-    if (*num_devices == 0) return nullptr; // Return nullptr if no devices are found
+    if (devices.empty()) { return nullptr; /* no devices */ }
 
-    // Allocate memory for the output array
-    struct llmodel_gpu_device* output = (struct llmodel_gpu_device*) malloc(*num_devices * sizeof(struct llmodel_gpu_device));
-
-    for (int i = 0; i < *num_devices; i++) {
-        output[i].index = devices[i].index;
-        output[i].type = devices[i].type;
-        output[i].heapSize = devices[i].heapSize;
-        output[i].name = strdup(devices[i].name.c_str()); // Convert std::string to char* and allocate memory
-        output[i].vendor = strdup(devices[i].vendor.c_str()); // Convert std::string to char* and allocate memory
+    c_devices = std::make_unique<llmodel_gpu_device_cpp[]>(devices.size());
+    for (unsigned i = 0; i < devices.size(); i++) {
+        const auto &dev = devices[i];
+        auto &cdev = c_devices[i];
+        cdev.index = dev.index;
+        cdev.type = dev.type;
+        cdev.heapSize = dev.heapSize;
+        cdev.name = strdup(dev.name.c_str());
+        cdev.vendor = strdup(dev.vendor.c_str());
     }
-    return output;
+    return c_devices.get();
 }
 
 bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index e26722ca..f7a54734 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -48,9 +48,9 @@ struct llmodel_prompt_context {
 };
 
 struct llmodel_gpu_device {
-    int index = 0;
-    int type = 0;           // same as VkPhysicalDeviceType
-    size_t heapSize = 0;
+    int index;
+    int type;               // same as VkPhysicalDeviceType
+    size_t heapSize;
     const char * name;
     const char * vendor;
 };
@@ -241,9 +241,10 @@ const char *llmodel_get_implementation_search_path();
 
 /**
  * Get a list of available GPU devices given the memory required.
+ * @param memoryRequired The minimum amount of VRAM, in bytes
  * @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
  */
-struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices);
+struct llmodel_gpu_device* llmodel_available_gpu_devices(size_t memoryRequired, int* num_devices);
 
 /**
  * Initializes a GPU device based on a specified string criterion.
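Note on the reworked C entry point above: llmodel_available_gpu_devices no longer takes a model handle, and the array it returns is owned by a static thread_local buffer inside the library, so callers must not free it and should copy anything they need before the next call on the same thread. Below is a minimal sketch of driving the new signature from ctypes, mirroring struct llmodel_gpu_device as declared in llmodel_c.h; the library filename is hypothetical, and the real Python binding further down resolves the path itself and already defines LLModelGPUDevice.

    import ctypes

    # Hypothetical library name; adjust to the actual build output on your platform.
    llmodel = ctypes.CDLL("libllmodel.so")

    class LLModelGPUDevice(ctypes.Structure):
        # Field-for-field mirror of struct llmodel_gpu_device from llmodel_c.h (after this patch).
        _fields_ = [
            ("index", ctypes.c_int),
            ("type", ctypes.c_int),       # same as VkPhysicalDeviceType
            ("heapSize", ctypes.c_size_t),
            ("name", ctypes.c_char_p),
            ("vendor", ctypes.c_char_p),
        ]

    llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
    llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)

    num_devices = ctypes.c_int32(0)
    devices = llmodel.llmodel_available_gpu_devices(0, ctypes.byref(num_devices))
    for dev in devices[:num_devices.value]:
        print(dev.index, dev.name.decode(), dev.vendor.decode(), dev.heapSize)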
diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index 1c50d0aa..fc1ac6b0 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -138,7 +138,7 @@ llmodel.llmodel_threadCount.restype = ctypes.c_int32
 
 llmodel.llmodel_set_implementation_search_path(str(MODEL_LIB_PATH).encode())
 
-llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
+llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
 llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)
 
 llmodel.llmodel_gpu_init_gpu_device_by_string.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p]
@@ -214,13 +214,22 @@ class LLModel:
     def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")
 
-    def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]:
-        assert self.model is not None
+    @staticmethod
+    def list_gpus(mem_required: int = 0) -> list[str]:
+        """
+        List the names of the available GPU devices with at least `mem_required` bytes of VRAM.
+
+        Args:
+            mem_required: The minimum amount of VRAM, in bytes
+
+        Returns:
+            A list of strings representing the names of the available GPU devices.
+        """
         num_devices = ctypes.c_int32(0)
-        devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices))
+        devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
         if not devices_ptr:
             raise ValueError("Unable to retrieve available GPU devices")
-        return devices_ptr[:num_devices.value]
+        return [d.name.decode() for d in devices_ptr[:num_devices.value]]
 
     def init_gpu(self, device: str):
         if self.model is None:
@@ -231,23 +240,13 @@ class LLModel:
         if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()):
             return
 
-        # Retrieve all GPUs without considering memory requirements.
-        num_devices = ctypes.c_int32(0)
-        all_devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices))
-        if not all_devices_ptr:
-            raise ValueError("Unable to retrieve list of all GPU devices")
-        all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]]
-
-        # Retrieve GPUs that meet the memory requirements using list_gpu
-        available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)]
-
-        # Identify GPUs that are unavailable due to insufficient memory or features
+        all_gpus = self.list_gpus()
+        available_gpus = self.list_gpus(mem_required)
         unavailable_gpus = set(all_gpus).difference(available_gpus)
 
-        # Formulate the error message
-        error_msg = "Unable to initialize model on GPU: '{}'.".format(device)
-        error_msg += "\nAvailable GPUs: {}.".format(available_gpus)
-        error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
+        error_msg = "Unable to initialize model on GPU: {!r}".format(device)
+        error_msg += "\nAvailable GPUs: {}".format(available_gpus)
+        error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}".format(unavailable_gpus)
         raise ValueError(error_msg)
 
     def load_model(self) -> bool:
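Because list_gpus is now a @staticmethod on LLModel, device names can be queried through the low-level binding without constructing a model first. A small sketch, using the module path from this patch (the VRAM threshold is just an example value):

    from gpt4all._pyllmodel import LLModel

    # All detected GPU devices, regardless of free VRAM.
    print(LLModel.list_gpus())

    # Only devices reporting at least ~4 GiB of VRAM (example threshold).
    print(LLModel.list_gpus(4 * 1024**3))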
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index b7c11b07..5fef9e5b 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -19,8 +19,7 @@ from requests.exceptions import ChunkedEncodingError
 from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
-from . import _pyllmodel
-from ._pyllmodel import EmbedResult as EmbedResult
+from ._pyllmodel import EmbedResult as EmbedResult, LLModel, ResponseCallbackType, empty_response_callback
 
 if TYPE_CHECKING:
     from typing_extensions import Self, TypeAlias
@@ -44,16 +43,18 @@ class Embed4All:
 
     MIN_DIMENSIONALITY = 64
 
-    def __init__(self, model_name: str | None = None, n_threads: int | None = None, **kwargs):
+    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
         """
         Constructor
 
         Args:
             n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
+            device: The processing unit on which the embedding model will run. See the `GPT4All` constructor for more info.
+            kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor.
         """
         if model_name is None:
             model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
-        self.gpt4all = GPT4All(model_name, n_threads=n_threads, **kwargs)
+        self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs)
 
     def __enter__(self) -> Self:
         return self
@@ -157,6 +158,7 @@ class GPT4All:
     def __init__(
         self,
         model_name: str,
+        *,
         model_path: str | os.PathLike[str] | None = None,
         model_type: str | None = None,
         allow_download: bool = True,
@@ -181,7 +183,7 @@ class GPT4All:
                 - "cpu": Model will run on the central processing unit.
                 - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
                 - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
-                Alternatively, a specific GPU name can also be provided, and the model will run on the GPU that matches the name if it's available.
+                - A specific device name from the list returned by `GPT4All.list_gpus()`.
                 Default is "cpu".
 
                 Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
@@ -192,7 +194,7 @@ class GPT4All:
         self.model_type = model_type
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
-        self.model = _pyllmodel.LLModel(self.config["path"], n_ctx, ngl)
+        self.model = LLModel(self.config["path"], n_ctx, ngl)
         if device is not None and device != "cpu":
             self.model.init_gpu(device)
         self.model.load_model()
@@ -419,19 +421,19 @@ class GPT4All:
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: Literal[False] = ..., callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: Literal[False] = ..., callback: ResponseCallbackType = ...,
     ) -> str: ...
     @overload
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: Literal[True], callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: Literal[True], callback: ResponseCallbackType = ...,
     ) -> Iterable[str]: ...
     @overload
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: bool, callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: bool, callback: ResponseCallbackType = ...,
     ) -> Any: ...
 
     def generate(
@@ -448,7 +450,7 @@
         n_batch: int = 8,
         n_predict: int | None = None,
         streaming: bool = False,
-        callback: _pyllmodel.ResponseCallbackType = _pyllmodel.empty_response_callback,
+        callback: ResponseCallbackType = empty_response_callback,
     ) -> Any:
         """
         Generate outputs from any GPT4All model.
@@ -494,7 +496,7 @@ class GPT4All:
             if reset:
                 # ingest system prompt
                 self.model.prompt_model(self._history[0]["content"], "%1",
-                                        _pyllmodel.empty_response_callback,
+                                        empty_response_callback,
                                         n_batch=n_batch, n_predict=0, special=True)
                 prompt_template = self._current_prompt_template.format("%1", "%2")
             else:
@@ -523,9 +525,9 @@ class GPT4All:
             output_collector = self._history
 
         def _callback_wrapper(
-            callback: _pyllmodel.ResponseCallbackType,
+            callback: ResponseCallbackType,
             output_collector: list[MessageType],
-        ) -> _pyllmodel.ResponseCallbackType:
+        ) -> ResponseCallbackType:
             def _callback(token_id: int, response: str) -> bool:
                 nonlocal callback, output_collector
 
@@ -589,6 +591,16 @@ class GPT4All:
         self._history = None
         self._current_prompt_template = "{0}"
 
+    @staticmethod
+    def list_gpus() -> list[str]:
+        """
+        List the names of the available GPU devices.
+
+        Returns:
+            A list of strings representing the names of the available GPU devices.
+        """
+        return LLModel.list_gpus()
+
     def _format_chat_prompt_template(
         self,
         messages: list[MessageType],
@@ -598,6 +610,9 @@
         """
         Helper method for building a prompt from list of messages using the self._current_prompt_template as a template for each message.
 
+        Warning:
+            This function was deprecated in version 2.3.0, and will be removed in a future release.
+
         Args:
             messages: List of dictionaries. Each dictionary should have a "role" key
                 with value of "system", "assistant", or "user" and a "content" key with a
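At the public API level the same feature is exposed as GPT4All.list_gpus(), alongside the now keyword-only device argument. A usage sketch under the assumption that the 2.4.0 bindings from this patch are installed (the chat model filename is only an example; any model from the GPT4All catalog would do):

    from gpt4all import Embed4All, GPT4All

    # Names come straight from the backend's device enumeration.
    print(GPT4All.list_gpus())

    # Arguments after model_name are keyword-only as of this patch.
    model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device="gpu")

    # Embed4All now defaults to device="cpu" and forwards remaining kwargs to GPT4All.
    embedder = Embed4All(n_threads=4)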
+ """ + return LLModel.list_gpus() + def _format_chat_prompt_template( self, messages: list[MessageType], @@ -598,6 +610,9 @@ class GPT4All: """ Helper method for building a prompt from list of messages using the self._current_prompt_template as a template for each message. + Warning: + This function was deprecated in version 2.3.0, and will be removed in a future release. + Args: messages: List of dictionaries. Each dictionary should have a "role" key with value of "system", "assistant", or "user" and a "content" key with a diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py index c8b50e86..fc44b256 100644 --- a/gpt4all-bindings/python/setup.py +++ b/gpt4all-bindings/python/setup.py @@ -68,7 +68,7 @@ def get_long_description(): setup( name=package_name, - version="2.3.3", + version="2.4.0", description="Python bindings for GPT4All", long_description=get_long_description(), long_description_content_type="text/markdown", diff --git a/gpt4all-bindings/typescript/index.cc b/gpt4all-bindings/typescript/index.cc index 0aca7755..8a4349ae 100644 --- a/gpt4all-bindings/typescript/index.cc +++ b/gpt4all-bindings/typescript/index.cc @@ -36,7 +36,7 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info) auto env = info.Env(); int num_devices = 0; auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers); - llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(GetInference(), mem_size, &num_devices); + llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices); if (all_devices == nullptr) { Napi::Error::New(env, "Unable to retrieve list of all GPU devices").ThrowAsJavaScriptException();