#include #include #ifdef __linux__ #include #endif #include "amdgpu.h" #include "gpu.h" #include "cpu.h" #include "overlay.h" #include "hud_elements.h" #include "logging.h" #include "mesa/util/macros.h" std::string metrics_path = ""; struct amdgpu_common_metrics amdgpu_common_metrics; std::mutex amdgpu_common_metrics_m; std::mutex amdgpu_m; std::condition_variable amdgpu_c; bool amdgpu_run_thread = true; std::unique_ptr throttling; bool amdgpu_verify_metrics(const std::string& path) { metrics_table_header header {}; FILE *f; f = fopen(path.c_str(), "rb"); if (!f) { SPDLOG_DEBUG("Failed to read the metrics header of '{}'", path); return false; } if (fread(&header, sizeof(header), 1, f) == 0) { SPDLOG_DEBUG("Failed to read the metrics header of '{}'", path); return false; } switch (header.format_revision) { case 1: // v1_1, v1_2, v1_3 if(header.content_revision<=0 || header.content_revision>3)// v1_0, not naturally aligned break; cpuStats.cpu_type = "GPU"; return true; case 2: // v2_1, v2_2, v2_3, v2_4 if(header.content_revision<=0 || header.content_revision>4)// v2_0, not naturally aligned break; cpuStats.cpu_type = "APU"; return true; default: break; } SPDLOG_WARN("Unsupported gpu_metrics version: {}.{}", header.format_revision, header.content_revision); return false; } #define IS_VALID_METRIC(FIELD) (FIELD != 0xffff) void amdgpu_get_instant_metrics(struct amdgpu_common_metrics *metrics) { FILE *f; void *buf[MAX(sizeof(struct gpu_metrics_v1_3), sizeof(struct gpu_metrics_v2_4))/sizeof(void*)+1]; struct metrics_table_header* header = (metrics_table_header*)buf; f = fopen(metrics_path.c_str(), "rb"); if (!f) return; // Read the whole file if (fread(buf, sizeof(buf), 1, f) != 0) { SPDLOG_DEBUG("amdgpu metrics file '{}' is larger than the buffer", metrics_path.c_str()); fclose(f); return; } fclose(f); int64_t indep_throttle_status = 0; if (header->format_revision == 1) { // Desktop GPUs struct gpu_metrics_v1_3 *amdgpu_metrics = (struct gpu_metrics_v1_3 *) buf; metrics->gpu_load_percent = amdgpu_metrics->average_gfx_activity; metrics->average_gfx_power_w = amdgpu_metrics->average_socket_power; metrics->current_gfxclk_mhz = amdgpu_metrics->current_gfxclk; metrics->current_uclk_mhz = amdgpu_metrics->current_uclk; metrics->gpu_temp_c = amdgpu_metrics->temperature_edge; indep_throttle_status = amdgpu_metrics->indep_throttle_status; metrics->fan_speed = amdgpu_metrics->current_fan_speed; } else if (header->format_revision == 2) { // APUs struct gpu_metrics_v2_3 *amdgpu_metrics = (struct gpu_metrics_v2_3 *) buf; metrics->gpu_load_percent = amdgpu_metrics->average_gfx_activity; metrics->average_gfx_power_w = amdgpu_metrics->average_gfx_power / 1000.f; if( IS_VALID_METRIC(amdgpu_metrics->average_cpu_power) ) { // prefered method metrics->average_cpu_power_w = amdgpu_metrics->average_cpu_power / 1000.f; } else if( IS_VALID_METRIC(amdgpu_metrics->average_core_power[0]) ) { // fallback 1: sum of core power metrics->average_cpu_power_w = 0; unsigned i = 0; do metrics->average_cpu_power_w = metrics->average_cpu_power_w + amdgpu_metrics->average_core_power[i] / 1000.f; while (++i < ARRAY_SIZE(amdgpu_metrics->average_core_power) && IS_VALID_METRIC(amdgpu_metrics->average_core_power[i])); } else if( IS_VALID_METRIC(amdgpu_metrics->average_socket_power) && IS_VALID_METRIC(amdgpu_metrics->average_gfx_power) ) { // fallback 2: estimate cpu power from total socket power metrics->average_cpu_power_w = amdgpu_metrics->average_socket_power / 1000.f - amdgpu_metrics->average_gfx_power / 1000.f; } else { // giving up metrics->average_cpu_power_w = 0; } if( IS_VALID_METRIC(amdgpu_metrics->current_gfxclk) ) { // prefered method metrics->current_gfxclk_mhz = amdgpu_metrics->current_gfxclk; } else if( IS_VALID_METRIC(amdgpu_metrics->average_gfxclk_frequency) ) { // fallback 1 metrics->current_gfxclk_mhz = amdgpu_metrics->average_gfxclk_frequency; } else { // giving up metrics->current_gfxclk_mhz = 0; } if( IS_VALID_METRIC(amdgpu_metrics->current_uclk) ) { // prefered method metrics->current_uclk_mhz = amdgpu_metrics->current_uclk; } else if( IS_VALID_METRIC(amdgpu_metrics->average_uclk_frequency) ) { // fallback 1 metrics->current_uclk_mhz = amdgpu_metrics->average_uclk_frequency; } else { // giving up metrics->current_uclk_mhz = 0; } if( IS_VALID_METRIC(amdgpu_metrics->temperature_soc) ) { // prefered method metrics->soc_temp_c = amdgpu_metrics->temperature_soc / 100; } else if( header->content_revision >= 3 && IS_VALID_METRIC(amdgpu_metrics->average_temperature_soc) ) { // fallback 1 metrics->soc_temp_c = amdgpu_metrics->average_temperature_soc / 100; } else { // giving up metrics->soc_temp_c = 0; } if( IS_VALID_METRIC(amdgpu_metrics->temperature_gfx) ) { // prefered method metrics->gpu_temp_c = amdgpu_metrics->temperature_gfx / 100; } else if( header->content_revision >= 3 && IS_VALID_METRIC(amdgpu_metrics->average_temperature_gfx) ) { // fallback 1 metrics->gpu_temp_c = amdgpu_metrics->average_temperature_gfx / 100; } else { // giving up metrics->gpu_temp_c = 0; } int cpu_temp = 0; if( IS_VALID_METRIC(amdgpu_metrics->temperature_core[0]) ) { // prefered method unsigned i = 0; do cpu_temp = MAX(cpu_temp, amdgpu_metrics->temperature_core[i]); while (++i < ARRAY_SIZE(amdgpu_metrics->temperature_core) && IS_VALID_METRIC(amdgpu_metrics->temperature_core[i])); metrics->apu_cpu_temp_c = cpu_temp / 100; } else if( header->content_revision >= 3 && IS_VALID_METRIC(amdgpu_metrics->average_temperature_core[0]) ) { // fallback 1 unsigned i = 0; do cpu_temp = MAX(cpu_temp, amdgpu_metrics->average_temperature_core[i]); while (++i < ARRAY_SIZE(amdgpu_metrics->average_temperature_core) && IS_VALID_METRIC(amdgpu_metrics->average_temperature_core[i])); metrics->apu_cpu_temp_c = cpu_temp / 100; } else if( cpuStats.ReadcpuTempFile(cpu_temp) ) { // fallback 2: Try temp from file 'm_cpuTempFile' of 'cpu.cpp' metrics->apu_cpu_temp_c = cpu_temp; } else { // giving up metrics->apu_cpu_temp_c = 0; } indep_throttle_status = amdgpu_metrics->indep_throttle_status; } /* Throttling: See https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h for the offsets */ metrics->is_power_throttled = ((indep_throttle_status >> 0) & 0xFF) != 0; metrics->is_current_throttled = ((indep_throttle_status >> 16) & 0xFF) != 0; metrics->is_temp_throttled = ((indep_throttle_status >> 32) & 0xFFFF) != 0; metrics->is_other_throttled = ((indep_throttle_status >> 56) & 0xFF) != 0; if (throttling) throttling->indep_throttle_status = indep_throttle_status; } void amdgpu_get_samples_and_copy(struct amdgpu_common_metrics metrics_buffer[METRICS_SAMPLE_COUNT], bool &gpu_load_needs_dividing) { // Get all the samples for (size_t cur_sample_id=0; cur_sample_id < METRICS_SAMPLE_COUNT; cur_sample_id++) { amdgpu_get_instant_metrics(&metrics_buffer[cur_sample_id]); // Detect and fix if the gpu load is reported in centipercent if (gpu_load_needs_dividing || metrics_buffer[cur_sample_id].gpu_load_percent > 100){ gpu_load_needs_dividing = true; metrics_buffer[cur_sample_id].gpu_load_percent /= 100; } usleep(METRICS_POLLING_PERIOD_MS * 1000); } // Copy the results from the different metrics to amdgpu_common_metrics amdgpu_common_metrics_m.lock(); UPDATE_METRIC_AVERAGE(gpu_load_percent); UPDATE_METRIC_AVERAGE_FLOAT(average_gfx_power_w); UPDATE_METRIC_AVERAGE_FLOAT(average_cpu_power_w); UPDATE_METRIC_AVERAGE(current_gfxclk_mhz); UPDATE_METRIC_AVERAGE(current_uclk_mhz); UPDATE_METRIC_AVERAGE(soc_temp_c); UPDATE_METRIC_AVERAGE(gpu_temp_c); UPDATE_METRIC_AVERAGE(apu_cpu_temp_c); UPDATE_METRIC_MAX(is_power_throttled); UPDATE_METRIC_MAX(is_current_throttled); UPDATE_METRIC_MAX(is_temp_throttled); UPDATE_METRIC_MAX(is_other_throttled); UPDATE_METRIC_MAX(fan_speed); amdgpu_common_metrics_m.unlock(); } void amdgpu_metrics_polling_thread() { struct amdgpu_common_metrics metrics_buffer[METRICS_SAMPLE_COUNT]; bool gpu_load_needs_dividing = false; //some GPUs report load as centipercent // Initial poll of the metrics, so that we have values to display as fast as possible amdgpu_get_instant_metrics(&amdgpu_common_metrics); if (amdgpu_common_metrics.gpu_load_percent > 100){ gpu_load_needs_dividing = true; amdgpu_common_metrics.gpu_load_percent /= 100; } // Set all the fields to 0 by default. Only done once as we're just replacing previous values after memset(metrics_buffer, 0, sizeof(metrics_buffer)); while (1) { std::unique_lock lock(amdgpu_m); amdgpu_c.wait(lock, []{return amdgpu_run_thread;}); lock.unlock(); #ifndef TEST_ONLY if (HUDElements.params->no_display && !logger->is_active()) usleep(100000); else #endif amdgpu_get_samples_and_copy(metrics_buffer, gpu_load_needs_dividing); } } void amdgpu_get_metrics(uint32_t deviceID){ static bool init = false; if (!init){ std::thread(amdgpu_metrics_polling_thread).detach(); init = true; } amdgpu_common_metrics_m.lock(); gpu_info.load = amdgpu_common_metrics.gpu_load_percent; gpu_info.powerUsage = amdgpu_common_metrics.average_gfx_power_w; gpu_info.MemClock = amdgpu_common_metrics.current_uclk_mhz; // Use hwmon instead, see gpu.cpp if ( deviceID == 0x1435 || deviceID == 0x163f ) { // If we are on VANGOGH (Steam Deck), then // always use use core clock from GPU metrics. gpu_info.CoreClock = amdgpu_common_metrics.current_gfxclk_mhz; } // gpu_info.temp = amdgpu_common_metrics.gpu_temp_c; gpu_info.apu_cpu_power = amdgpu_common_metrics.average_cpu_power_w; gpu_info.apu_cpu_temp = amdgpu_common_metrics.apu_cpu_temp_c; gpu_info.is_power_throttled = amdgpu_common_metrics.is_power_throttled; gpu_info.is_current_throttled = amdgpu_common_metrics.is_current_throttled; gpu_info.is_temp_throttled = amdgpu_common_metrics.is_temp_throttled; gpu_info.is_other_throttled = amdgpu_common_metrics.is_other_throttled; gpu_info.fan_speed = amdgpu_common_metrics.fan_speed; amdgpu_common_metrics_m.unlock(); }