Files
nvtop/src/extract_gpuinfo_amdgpu.c
Maxime Schmitt f61ee5019c Merge pull request #272 from hmaarrfk/patch-2
Remove import of kcmp
2024-02-26 17:31:43 +01:00

1013 lines
38 KiB
C

/*
* Copyright (C) 2012 Lauri Kasanen
* Copyright (C) 2018 Genesis Cloud Ltd.
* Copyright (C) 2022 YiFei Zhu <zhuyifei1999@gmail.com>
* Copyright (C) 2022 Maxime Schmitt <maxime.schmitt91@gmail.com>
* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*
* This file is part of Nvtop and adapted from radeontop.
*
* Nvtop is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Nvtop is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with nvtop. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "nvtop/common.h"
#include "nvtop/device_discovery.h"
#include "nvtop/extract_gpuinfo_common.h"
#include "nvtop/extract_processinfo_fdinfo.h"
#include "nvtop/time.h"
#include <assert.h>
#include <ctype.h>
#include <dirent.h>
#include <dlfcn.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <libdrm/amdgpu.h>
#include <libdrm/amdgpu_drm.h>
#include <math.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <unistd.h>
#include <uthash.h>
#include <xf86drm.h>
// Declared here, defined outside this file. Used in gpuinfo_amdgpu_populate_static_info() as a
// fallback when libdrm does not provide a marketing name.
const char *amdgpu_parse_marketing_name(struct amdgpu_gpu_info *info);
// Local function pointers to DRM interface.
// All of them are resolved with dlsym() in gpuinfo_amdgpu_init(). _drmGetDevices is only looked up
// when drmGetDevices2 could not be found (see wrap_drmGetDevices()).
static typeof(drmGetDevices) *_drmGetDevices;
static typeof(drmGetDevices2) *_drmGetDevices2;
static typeof(drmFreeDevices) *_drmFreeDevices;
static typeof(drmGetVersion) *_drmGetVersion;
static typeof(drmFreeVersion) *_drmFreeVersion;
static typeof(drmGetMagic) *_drmGetMagic;
static typeof(drmAuthMagic) *_drmAuthMagic;
static typeof(drmDropMaster) *_drmDropMaster;
// Local function pointers to amdgpu DRM interface.
// gpuinfo_amdgpu_init() does not check the dlsym() result for these, so each of them may be NULL.
static typeof(amdgpu_device_initialize) *_amdgpu_device_initialize;
static typeof(amdgpu_device_deinitialize) *_amdgpu_device_deinitialize;
static typeof(amdgpu_get_marketing_name) *_amdgpu_get_marketing_name;
static typeof(amdgpu_query_hw_ip_info) *_amdgpu_query_hw_ip_info;
static typeof(amdgpu_query_gpu_info) *_amdgpu_query_gpu_info;
static typeof(amdgpu_query_info) *_amdgpu_query_info;
static typeof(amdgpu_query_sensor_info) *_amdgpu_query_sensor_info;
// dlopen() handles for libdrm and libdrm_amdgpu; reset to NULL when closed
static void *libdrm_handle;
static void *libdrm_amdgpu_handle;
// Most recent status code stored from a libdrm / libdrm_amdgpu call
static int last_libdrm_return_status = 0;
// Message reported while gpuinfo_amdgpu_init() has not succeeded yet
static char didnt_call_gpuinfo_init[] = "uninitialized";
// When non-NULL, this is the message returned by gpuinfo_amdgpu_last_error_string()
static const char *local_error_string = didnt_call_gpuinfo_init;
// uthash wrappers for the per-process cache: entries are keyed on the whole
// struct unique_cache_id stored in their client_id member.
#define HASH_FIND_CLIENT(head, key_ptr, out_ptr) HASH_FIND(hh, head, key_ptr, sizeof(struct unique_cache_id), out_ptr)
#define HASH_ADD_CLIENT(head, in_ptr) HASH_ADD(hh, head, client_id, sizeof(struct unique_cache_id), in_ptr)
// Set / invalidate / test a cache field. These forward to SET_VALUE, INVALIDATE_VALUE and
// VALUE_IS_VALID (defined outside this file) with the amdgpu_cache_ prefix, which matches the
// enumerators of enum amdgpu_process_info_cache_valid.
#define SET_AMDGPU_CACHE(cachePtr, field, value) SET_VALUE(cachePtr, field, value, amdgpu_cache_)
#define RESET_AMDGPU_CACHE(cachePtr, field) INVALIDATE_VALUE(cachePtr, field, amdgpu_cache_)
#define AMDGPU_CACHE_FIELD_VALID(cachePtr, field) VALUE_IS_VALID(cachePtr, field, amdgpu_cache_)
// One enumerator per engine counter stored in struct amdgpu_process_info_cache.
// The last enumerator is the number of fields and sizes the cache's valid[] array.
enum amdgpu_process_info_cache_valid {
amdgpu_cache_gfx_engine_used_valid = 0,
amdgpu_cache_compute_engine_used_valid,
amdgpu_cache_enc_engine_used_valid,
amdgpu_cache_dec_engine_used_valid,
amdgpu_cache_process_info_cache_valid_count
};
// Hash key identifying one DRM client of one process on one device.
// The whole struct is used as the key (sizeof(struct unique_cache_id) in HASH_FIND_CLIENT /
// HASH_ADD_CLIENT); NOTE(review): the packed attribute presumably exists so that no padding
// bytes take part in the key comparison - confirm before changing the layout.
struct __attribute__((__packed__)) unique_cache_id {
// Value of the fdinfo client id key
unsigned client_id;
// Process owning the file descriptor
pid_t pid;
// Points at the owning gpu_info's base.pdev; the pointer itself is part of the key
char *pdev;
};
// Engine time counters of one DRM client recorded during an update. parse_drm_fdinfo_amd()
// compares them with the counters of the following update to compute usage values.
struct amdgpu_process_info_cache {
// Hash key (see HASH_ADD_CLIENT)
struct unique_cache_id client_id;
// Engine counters copied from the process info; parsed from fdinfo values carrying a " ns" suffix
uint64_t gfx_engine_used;
uint64_t compute_engine_used;
uint64_t enc_engine_used;
uint64_t dec_engine_used;
// Time at which the counters above were sampled
nvtop_time last_measurement_tstamp;
// One validity bit per counter, indexed by enum amdgpu_process_info_cache_valid
unsigned char valid[(amdgpu_cache_process_info_cache_valid_count + CHAR_BIT - 1) / CHAR_BIT];
// uthash bookkeeping
UT_hash_handle hh;
};
// Per-device state of the AMDGPU backend. The generic part is embedded as `base`, and the
// backend recovers this struct from a struct gpu_info pointer with container_of().
struct gpu_info_amdgpu {
struct gpu_info base;
// Driver version obtained with drmGetVersion(); released with drmFreeVersion() at shutdown
drmVersionPtr drmVersion;
// File descriptor of the DRM node opened for this device
int fd;
// libdrm_amdgpu device handle created from fd
amdgpu_device_handle amdgpu_device;
// We poll the fan frequently enough and want to avoid the open/close overhead of the sysfs file
FILE *fanSpeedFILE; // FILE* for this device current fan speed
FILE *PCIeBW; // FILE* for this device PCIe bandwidth over one second
FILE *powerCap; // FILE* for this device power cap
nvtop_device *amdgpuDevice; // The AMDGPU driver device
nvtop_device *hwmonDevice; // The AMDGPU driver hwmon device
// Cached processes info: `last` holds the entries of the previous update, `current` collects
// the entries of the update in progress (swapped in swap_process_cache_for_next_update())
struct amdgpu_process_info_cache *last_update_process_cache, *current_update_process_cache; // Cached processes info
// Used to compute the actual fan speed (the raw fan reading is scaled by this maximum)
unsigned maxFanValue;
};
// Number of initialized entries at the start of gpu_infos
unsigned amdgpu_count;
// Array allocated in gpuinfo_amdgpu_get_device_handles(), freed in gpuinfo_amdgpu_shutdown()
static struct gpu_info_amdgpu *gpu_infos;
// Backend entry points, referenced by gpu_vendor_amdgpu below
static bool gpuinfo_amdgpu_init(void);
static void gpuinfo_amdgpu_shutdown(void);
static const char *gpuinfo_amdgpu_last_error_string(void);
static bool gpuinfo_amdgpu_get_device_handles(struct list_head *devices, unsigned *count);
static void gpuinfo_amdgpu_populate_static_info(struct gpu_info *_gpu_info);
static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info);
static void gpuinfo_amdgpu_get_running_processes(struct gpu_info *_gpu_info);
// Vendor descriptor of the AMDGPU backend; registered at load time by
// init_extract_gpuinfo_amdgpu().
struct gpu_vendor gpu_vendor_amdgpu = {
.init = gpuinfo_amdgpu_init,
.shutdown = gpuinfo_amdgpu_shutdown,
.last_error_string = gpuinfo_amdgpu_last_error_string,
.get_device_handles = gpuinfo_amdgpu_get_device_handles,
.populate_static_info = gpuinfo_amdgpu_populate_static_info,
.refresh_dynamic_info = gpuinfo_amdgpu_refresh_dynamic_info,
.refresh_running_processes = gpuinfo_amdgpu_get_running_processes,
.name = "AMD",
};
static int readAttributeFromDevice(nvtop_device *dev, const char *sysAttr, const char *format, ...);

// Constructor: hands the AMDGPU vendor descriptor to register_gpu_vendor() when this object is
// loaded, before main() runs.
__attribute__((constructor)) static void init_extract_gpuinfo_amdgpu(void) {
  register_gpu_vendor(&gpu_vendor_amdgpu);
}
// Enumerate DRM devices through whichever libdrm entry point was resolved at init time:
// drmGetDevices2 (with flags 0) when available, drmGetDevices otherwise.
// Returns whatever the selected libdrm function returns.
static int wrap_drmGetDevices(drmDevicePtr devices[], int max_devices) {
  assert(_drmGetDevices2 || _drmGetDevices);
  return _drmGetDevices2 ? _drmGetDevices2(0, devices, max_devices) : _drmGetDevices(devices, max_devices);
}

static bool parse_drm_fdinfo_amd(struct gpu_info *info, FILE *fdinfo_file, struct gpu_process *process_info);
// Load libdrm (mandatory) and libdrm_amdgpu (optional) and resolve the entry points used by
// this backend.
//
// Returns true when libdrm and all of its required symbols are available. On failure,
// local_error_string describes the problem and libdrm_handle is left NULL.
// libdrm_amdgpu symbols are looked up without being checked: callers must test the function
// pointers before using them.
static bool gpuinfo_amdgpu_init(void) {
  static const char missing_symbol_error[] = "A required symbol could not be found in libdrm";
  static const char *const libdrm_names[] = {"libdrm.so", "libdrm.so.2", "libdrm.so.1"};

  libdrm_handle = NULL;
  for (size_t i = 0; !libdrm_handle && i < sizeof(libdrm_names) / sizeof(libdrm_names[0]); ++i)
    libdrm_handle = dlopen(libdrm_names[i], RTLD_LAZY);
  if (!libdrm_handle) {
    local_error_string = dlerror();
    return false;
  }

  // drmGetDevices is only needed as a fallback for drmGetDevices2
  _drmGetDevices2 = dlsym(libdrm_handle, "drmGetDevices2");
  if (!_drmGetDevices2)
    _drmGetDevices = dlsym(libdrm_handle, "drmGetDevices");
  _drmFreeDevices = dlsym(libdrm_handle, "drmFreeDevices");
  _drmGetVersion = dlsym(libdrm_handle, "drmGetVersion");
  _drmFreeVersion = dlsym(libdrm_handle, "drmFreeVersion");
  _drmGetMagic = dlsym(libdrm_handle, "drmGetMagic");
  _drmAuthMagic = dlsym(libdrm_handle, "drmAuthMagic");
  _drmDropMaster = dlsym(libdrm_handle, "drmDropMaster");

  bool all_required_symbols = (_drmGetDevices2 || _drmGetDevices) && _drmFreeDevices && _drmGetVersion &&
                              _drmFreeVersion && _drmGetMagic && _drmAuthMagic && _drmDropMaster;
  if (!all_required_symbols) {
    // Report the actual cause instead of leaving a stale message behind
    local_error_string = missing_symbol_error;
    dlclose(libdrm_handle);
    libdrm_handle = NULL;
    return false;
  }

  libdrm_amdgpu_handle = dlopen("libdrm_amdgpu.so", RTLD_LAZY);
  if (!libdrm_amdgpu_handle)
    libdrm_amdgpu_handle = dlopen("libdrm_amdgpu.so.1", RTLD_LAZY);
  if (libdrm_amdgpu_handle) {
    _amdgpu_device_initialize = dlsym(libdrm_amdgpu_handle, "amdgpu_device_initialize");
    _amdgpu_device_deinitialize = dlsym(libdrm_amdgpu_handle, "amdgpu_device_deinitialize");
    _amdgpu_get_marketing_name = dlsym(libdrm_amdgpu_handle, "amdgpu_get_marketing_name");
    _amdgpu_query_hw_ip_info = dlsym(libdrm_amdgpu_handle, "amdgpu_query_hw_ip_info");
    _amdgpu_query_info = dlsym(libdrm_amdgpu_handle, "amdgpu_query_info");
    _amdgpu_query_gpu_info = dlsym(libdrm_amdgpu_handle, "amdgpu_query_gpu_info");
    _amdgpu_query_sensor_info = dlsym(libdrm_amdgpu_handle, "amdgpu_query_sensor_info");
  }

  local_error_string = NULL;
  return true;
}
static void gpuinfo_amdgpu_shutdown(void) {
for (unsigned i = 0; i < amdgpu_count; ++i) {
struct gpu_info_amdgpu *gpu_info = &gpu_infos[i];
if (gpu_info->fanSpeedFILE)
fclose(gpu_info->fanSpeedFILE);
if (gpu_info->PCIeBW)
fclose(gpu_info->PCIeBW);
if (gpu_info->powerCap)
fclose(gpu_info->powerCap);
nvtop_device_unref(gpu_info->amdgpuDevice);
nvtop_device_unref(gpu_info->hwmonDevice);
_drmFreeVersion(gpu_info->drmVersion);
_amdgpu_device_deinitialize(gpu_info->amdgpu_device);
// Clean the process cache
struct amdgpu_process_info_cache *cache_entry, *cache_tmp;
HASH_ITER(hh, gpu_info->last_update_process_cache, cache_entry, cache_tmp) {
HASH_DEL(gpu_info->last_update_process_cache, cache_entry);
free(cache_entry);
}
}
free(gpu_infos);
gpu_infos = NULL;
amdgpu_count = 0;
if (libdrm_handle) {
dlclose(libdrm_handle);
libdrm_handle = NULL;
local_error_string = didnt_call_gpuinfo_init;
}
if (libdrm_amdgpu_handle) {
dlclose(libdrm_amdgpu_handle);
libdrm_amdgpu_handle = NULL;
}
}
// Describe the most recent error of this backend.
// A pending local_error_string wins; otherwise a negative libdrm status is translated to text,
// and a generic message is returned when neither source holds an error.
static const char *gpuinfo_amdgpu_last_error_string(void) {
  if (local_error_string)
    return local_error_string;

  if (last_libdrm_return_status >= 0)
    return "An unanticipated error occurred while accessing AMDGPU "
           "information\n";

  const char *message = "unknown error\n";
  switch (last_libdrm_return_status) {
  case DRM_ERR_NO_DEVICE:
    message = "no device\n";
    break;
  case DRM_ERR_NO_ACCESS:
    message = "no access\n";
    break;
  case DRM_ERR_NOT_ROOT:
    message = "not root\n";
    break;
  case DRM_ERR_INVALID:
    message = "invalid args\n";
    break;
  case DRM_ERR_NO_FD:
    message = "no fd\n";
    break;
  default:
    break;
  }
  return message;
}
// Try to authenticate the DRM file descriptor with its own magic token, then drop DRM master.
// Nothing is reported when the magic cannot be obtained. When dropping master fails, a warning
// is printed and the function waits for a key press on stdin.
static void authenticate_drm(int fd) {
  drm_magic_t magic;
  if (_drmGetMagic(fd, &magic) < 0)
    return;

  if (_drmAuthMagic(fd, magic) != 0) {
    // XXX: Ideally I'd implement this too, but I'd need to pull in libxcb and yet
    // more functions and structs that may break ABI compatibility.
    // See radeontop auth_xcb.c for what is involved here
    fprintf(stderr, "Failed to authenticate to DRM; XCB authentication unimplemented\n");
    return;
  }

  if (!_drmDropMaster(fd))
    return;

  perror("Failed to drop DRM master");
  fprintf(stderr,
          "\nWARNING: other DRM clients will crash on VT switch while nvtop is running!\npress ENTER to continue\n");
  fgetc(stdin);
}
// Open `name` relative to the directory descriptor dirFD and wrap it in a read-only FILE*.
// Returns NULL (and leaks no descriptor) when the directory is invalid or any step fails.
static FILE *openSysfsAttributeAt(int dirFD, const char *name) {
  if (dirFD < 0)
    return NULL;
  int attrFD = openat(dirFD, name, O_RDONLY);
  if (attrFD < 0)
    return NULL;
  FILE *stream = fdopen(attrFD, "r");
  if (!stream)
    close(attrFD);
  return stream;
}

// Locate the sysfs and hwmon devices of a GPU and open the attribute files that are polled on
// every refresh (fan speed, PCIe bandwidth, power cap). Streams that cannot be opened are left
// NULL; the fan stream is only opened when a usable (non-zero) maximum fan value was read.
static void initDeviceSysfsPaths(struct gpu_info_amdgpu *gpu_info) {
  gpu_info->fanSpeedFILE = NULL;
  gpu_info->PCIeBW = NULL;
  gpu_info->powerCap = NULL;

  // Open the device sys folder to gather information not available through the DRM driver
  char devicePath[22 + PDEV_LEN];
  snprintf(devicePath, sizeof(devicePath), "/sys/bus/pci/devices/%s", gpu_info->base.pdev);
  nvtop_device_new_from_syspath(&gpu_info->amdgpuDevice, devicePath);
  assert(gpu_info->amdgpuDevice != NULL);
  int sysfsFD = open(devicePath, O_RDONLY);

  gpu_info->hwmonDevice = nvtop_device_get_hwmon(gpu_info->amdgpuDevice);
  assert(gpu_info->hwmonDevice != NULL);

  // Open the device hwmon folder (Fan speed are available there)
  const char *hwmonPath = NULL;
  nvtop_device_get_syspath(gpu_info->hwmonDevice, &hwmonPath);
  int hwmonFD = hwmonPath ? open(hwmonPath, O_RDONLY) : -1;

  // Look for which fan to use (PWM or RPM); PWM wins when both are enabled
  unsigned pwmIsEnabled = 0;
  int NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "pwm1_enable", "%u", &pwmIsEnabled);
  bool usePWMSensor = NreadPatterns == 1 && pwmIsEnabled > 0;
  bool useRPMSensor = false;
  if (!usePWMSensor) {
    unsigned rpmIsEnabled = 0;
    NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "fan1_enable", "%u", &rpmIsEnabled);
    // A negative value is an error code, not a successful match
    useRPMSensor = NreadPatterns == 1 && rpmIsEnabled > 0;
  }

  if (usePWMSensor || useRPMSensor) {
    const char *maxFanSpeedFile = usePWMSensor ? "pwm1_max" : "fan1_max";
    const char *fanSensorFile = usePWMSensor ? "pwm1" : "fan1_input";
    unsigned maxSpeedVal = 0;
    NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, maxFanSpeedFile, "%u", &maxSpeedVal);
    // The maximum is used as a divisor when computing the fan percentage, so it must not be 0
    if (NreadPatterns == 1 && maxSpeedVal > 0) {
      gpu_info->maxFanValue = maxSpeedVal;
      // Open the fan file for dynamic info gathering
      gpu_info->fanSpeedFILE = openSysfsAttributeAt(hwmonFD, fanSensorFile);
    }
  }

  // Open the PCIe bandwidth file for dynamic info gathering
  gpu_info->PCIeBW = openSysfsAttributeAt(sysfsFD, "pcie_bw");
  // Open the power cap file for dynamic info gathering
  gpu_info->powerCap = openSysfsAttributeAt(hwmonFD, "power1_cap");

  if (hwmonFD >= 0)
    close(hwmonFD);
  if (sysfsFD >= 0)
    close(sysfsFD);
}
#define VENDOR_AMD 0x1002
static bool gpuinfo_amdgpu_get_device_handles(struct list_head *devices, unsigned *count) {
if (!libdrm_handle)
return false;
last_libdrm_return_status = wrap_drmGetDevices(NULL, 0);
if (last_libdrm_return_status <= 0)
return false;
drmDevicePtr devs[last_libdrm_return_status];
last_libdrm_return_status = wrap_drmGetDevices(devs, last_libdrm_return_status);
if (last_libdrm_return_status <= 0)
return false;
unsigned int libdrm_count = last_libdrm_return_status;
gpu_infos = calloc(libdrm_count, sizeof(*gpu_infos));
if (!gpu_infos) {
local_error_string = strerror(errno);
return false;
}
for (unsigned int i = 0; i < libdrm_count; i++) {
if (devs[i]->bustype != DRM_BUS_PCI || devs[i]->deviceinfo.pci->vendor_id != VENDOR_AMD)
continue;
int fd = -1;
// Try render node first
if (1 << DRM_NODE_RENDER & devs[i]->available_nodes) {
fd = open(devs[i]->nodes[DRM_NODE_RENDER], O_RDWR);
}
if (fd < 0) {
// Fallback to primary node (control nodes are unused according to the DRM documentation)
if (1 << DRM_NODE_PRIMARY & devs[i]->available_nodes) {
fd = open(devs[i]->nodes[DRM_NODE_PRIMARY], O_RDWR);
}
}
if (fd < 0)
continue;
drmVersionPtr ver = _drmGetVersion(fd);
if (!ver) {
close(fd);
continue;
}
bool is_radeon = false; // TODO: !strcmp(ver->name, "radeon");
bool is_amdgpu = !strcmp(ver->name, "amdgpu");
if (!is_amdgpu && !is_radeon) {
_drmFreeVersion(ver);
close(fd);
continue;
}
authenticate_drm(fd);
if (is_amdgpu) {
if (!libdrm_amdgpu_handle || !_amdgpu_device_initialize) {
_drmFreeVersion(ver);
close(fd);
continue;
}
uint32_t drm_major, drm_minor;
last_libdrm_return_status =
_amdgpu_device_initialize(fd, &drm_major, &drm_minor, &gpu_infos[amdgpu_count].amdgpu_device);
} else {
// TODO: radeon suppport here
assert(false);
}
if (!last_libdrm_return_status) {
gpu_infos[amdgpu_count].drmVersion = ver;
gpu_infos[amdgpu_count].fd = fd;
gpu_infos[amdgpu_count].base.vendor = &gpu_vendor_amdgpu;
snprintf(gpu_infos[amdgpu_count].base.pdev, PDEV_LEN - 1, "%04x:%02x:%02x.%d", devs[i]->businfo.pci->domain,
devs[i]->businfo.pci->bus, devs[i]->businfo.pci->dev, devs[i]->businfo.pci->func);
initDeviceSysfsPaths(&gpu_infos[amdgpu_count]);
list_add_tail(&gpu_infos[amdgpu_count].base.list, devices);
// Register a fdinfo callback for this GPU
processinfo_register_fdinfo_callback(parse_drm_fdinfo_amd, &gpu_infos[amdgpu_count].base);
amdgpu_count++;
} else {
_drmFreeVersion(ver);
close(fd);
continue;
}
}
_drmFreeDevices(devs, libdrm_count);
*count = amdgpu_count;
return true;
}
// Re-read a stream from its beginning using a scanf-style format.
// Returns the vfscanf() result, or 0 without touching the variadic arguments when `file` is NULL.
static int rewindAndReadPattern(FILE *file, const char *format, ...) {
  int matched = 0;
  if (file) {
    va_list ap;
    va_start(ap, format);
    // Go back to the start and flush the stream before scanning again
    rewind(file);
    fflush(file);
    matched = vfscanf(file, format, ap);
    va_end(ap);
  }
  return matched;
}
// Read the sysfs attribute `sysAttr` of `dev` and parse its value with a scanf-style format.
// Returns the negative value reported by nvtop_device_get_sysattr_value() when the attribute
// cannot be read, 0 when no value was provided, and the vsscanf() result otherwise. The variadic
// outputs are only written on a successful parse.
static int readAttributeFromDevice(nvtop_device *dev, const char *sysAttr, const char *format, ...) {
  const char *val = NULL;
  int ret = nvtop_device_get_sysattr_value(dev, sysAttr, &val);
  if (ret < 0)
    return ret;
  if (!val)
    return 0;
  // Start the va_list only once we know we will reach va_end
  va_list args;
  va_start(args, format);
  int nread = vsscanf(val, format, args);
  va_end(args);
  return nread;
}
static void gpuinfo_amdgpu_populate_static_info(struct gpu_info *_gpu_info) {
struct gpu_info_amdgpu *gpu_info = container_of(_gpu_info, struct gpu_info_amdgpu, base);
struct gpuinfo_static_info *static_info = &gpu_info->base.static_info;
bool info_query_success = false;
struct amdgpu_gpu_info info;
const char *name = NULL;
static_info->integrated_graphics = false;
static_info->encode_decode_shared = false;
RESET_ALL(static_info->valid);
if (libdrm_amdgpu_handle && _amdgpu_get_marketing_name)
name = _amdgpu_get_marketing_name(gpu_info->amdgpu_device);
if (libdrm_amdgpu_handle && _amdgpu_query_gpu_info)
info_query_success = !_amdgpu_query_gpu_info(gpu_info->amdgpu_device, &info);
/* check name again.
* the previous name is from libdrm, which may not be the latest version.
* it may not contain latest AMD GPU types/names
*
* the libdrm is from vendor, Linux and a Linux distribution.
* It may take long time for a Linux distribution to get latest GPU info.
* here a GPU IDS is maintained, which allows to support GPU info faster. */
if (!name) {
name = amdgpu_parse_marketing_name(&info);
}
static_info->device_name[MAX_DEVICE_NAME - 1] = '\0';
if (name && strlen(name)) {
strncpy(static_info->device_name, name, MAX_DEVICE_NAME - 1);
SET_VALID(gpuinfo_device_name_valid, static_info->valid);
} else if (gpu_info->drmVersion->desc && strlen(gpu_info->drmVersion->desc)) {
strncpy(static_info->device_name, gpu_info->drmVersion->desc, MAX_DEVICE_NAME - 1);
SET_VALID(gpuinfo_device_name_valid, static_info->valid);
if (info_query_success) {
size_t len = strlen(static_info->device_name);
assert(len < MAX_DEVICE_NAME);
char *dst = static_info->device_name + len;
size_t remaining_len = MAX_DEVICE_NAME - 1 - len;
switch (info.family_id) {
#ifdef AMDGPU_FAMILY_SI
case AMDGPU_FAMILY_SI:
strncpy(dst, " (Hainan / Oland / Verde / Pitcairn / Tahiti)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_CI
case AMDGPU_FAMILY_CI:
strncpy(dst, " (Bonaire / Hawaii)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_KV
case AMDGPU_FAMILY_KV:
strncpy(dst, " (Kaveri / Kabini / Mullins)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_VI
case AMDGPU_FAMILY_VI:
strncpy(dst, " (Iceland / Tonga)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_CZ
case AMDGPU_FAMILY_CZ:
strncpy(dst, " (Carrizo / Stoney)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_AI
case AMDGPU_FAMILY_AI:
strncpy(dst, " (Vega10)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_RV
case AMDGPU_FAMILY_RV:
strncpy(dst, " (Raven)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_NV
case AMDGPU_FAMILY_NV:
strncpy(dst, " (Navi10)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_VGH
case AMDGPU_FAMILY_VGH:
strncpy(dst, " (Van Gogh)", remaining_len);
break;
#endif
#ifdef AMDGPU_FAMILY_YC
case AMDGPU_FAMILY_YC:
strncpy(dst, " (Yellow Carp)", remaining_len);
break;
#endif
default:
break;
}
}
}
// Retrieve infos from sysfs.
// 1) Fan
// If multiple fans are present, use the first one. Some hardware do not wire
// the sensor for the second fan, or use the same value as the first fan.
// Critical temparature
// temp1_* files should always be the GPU die in millidegrees Celsius
unsigned criticalTemp;
int NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "temp1_crit", "%u", &criticalTemp);
if (NreadPatterns == 1) {
SET_GPUINFO_STATIC(static_info, temperature_slowdown_threshold, criticalTemp);
}
// Emergency/shutdown temparature
unsigned emergemcyTemp;
NreadPatterns = readAttributeFromDevice(gpu_info->hwmonDevice, "temp1_emergency", "%u", &emergemcyTemp);
if (NreadPatterns == 1) {
SET_GPUINFO_STATIC(static_info, temperature_shutdown_threshold, emergemcyTemp);
}
nvtop_pcie_link max_link_characteristics;
int ret = nvtop_device_maximum_pcie_link(gpu_info->amdgpuDevice, &max_link_characteristics);
if (ret >= 0) {
SET_GPUINFO_STATIC(static_info, max_pcie_link_width, max_link_characteristics.width);
unsigned pcieGen = nvtop_pcie_gen_from_link_speed(max_link_characteristics.speed);
SET_GPUINFO_STATIC(static_info, max_pcie_gen, pcieGen);
}
// Mark integrated graphics
if (info_query_success && (info.ids_flags & AMDGPU_IDS_FLAGS_FUSION)) {
static_info->integrated_graphics = true;
}
// Checking if Encode and Decode are unified:AMDGPU_INFO_HW_IP_INFO
if (_amdgpu_query_hw_ip_info) {
struct drm_amdgpu_info_hw_ip vcn_ip_info;
if (_amdgpu_query_hw_ip_info(gpu_info->amdgpu_device, AMDGPU_HW_IP_VCN_ENC, 0, &vcn_ip_info) == 0) {
static_info->encode_decode_shared = vcn_ip_info.hw_ip_version_major >= 4;
}
}
}
static void gpuinfo_amdgpu_refresh_dynamic_info(struct gpu_info *_gpu_info) {
struct gpu_info_amdgpu *gpu_info = container_of(_gpu_info, struct gpu_info_amdgpu, base);
struct gpuinfo_dynamic_info *dynamic_info = &gpu_info->base.dynamic_info;
bool info_query_success = false;
struct amdgpu_gpu_info info;
uint32_t out32;
RESET_ALL(dynamic_info->valid);
if (libdrm_amdgpu_handle && _amdgpu_query_gpu_info)
info_query_success = !_amdgpu_query_gpu_info(gpu_info->amdgpu_device, &info);
// GPU current speed
if (libdrm_amdgpu_handle && _amdgpu_query_sensor_info)
last_libdrm_return_status =
_amdgpu_query_sensor_info(gpu_info->amdgpu_device, AMDGPU_INFO_SENSOR_GFX_SCLK, sizeof(out32), &out32);
else
last_libdrm_return_status = 1;
if (!last_libdrm_return_status) {
SET_GPUINFO_DYNAMIC(dynamic_info, gpu_clock_speed, out32);
}
// GPU max speed
if (info_query_success) {
SET_GPUINFO_DYNAMIC(dynamic_info, gpu_clock_speed_max, info.max_engine_clk / 1000);
}
// Memory current speed
if (libdrm_amdgpu_handle && _amdgpu_query_sensor_info)
last_libdrm_return_status =
_amdgpu_query_sensor_info(gpu_info->amdgpu_device, AMDGPU_INFO_SENSOR_GFX_MCLK, sizeof(out32), &out32);
else
last_libdrm_return_status = 1;
if (!last_libdrm_return_status) {
SET_GPUINFO_DYNAMIC(dynamic_info, mem_clock_speed, out32);
}
// Memory max speed
if (info_query_success) {
SET_GPUINFO_DYNAMIC(dynamic_info, mem_clock_speed_max, info.max_memory_clk / 1000);
}
// Load
if (libdrm_amdgpu_handle && _amdgpu_query_sensor_info)
last_libdrm_return_status =
_amdgpu_query_sensor_info(gpu_info->amdgpu_device, AMDGPU_INFO_SENSOR_GPU_LOAD, sizeof(out32), &out32);
else
last_libdrm_return_status = 1;
if (!last_libdrm_return_status) {
SET_GPUINFO_DYNAMIC(dynamic_info, gpu_util_rate, out32);
}
// Memory usage
struct drm_amdgpu_memory_info memory_info;
if (libdrm_amdgpu_handle && _amdgpu_query_info)
last_libdrm_return_status =
_amdgpu_query_info(gpu_info->amdgpu_device, AMDGPU_INFO_MEMORY, sizeof(memory_info), &memory_info);
else
last_libdrm_return_status = 1;
if (!last_libdrm_return_status) {
// TODO: Determine if we want to include GTT (GPU accessible system memory)
SET_GPUINFO_DYNAMIC(dynamic_info, total_memory, memory_info.vram.total_heap_size);
SET_GPUINFO_DYNAMIC(dynamic_info, used_memory, memory_info.vram.heap_usage);
SET_GPUINFO_DYNAMIC(dynamic_info, free_memory, memory_info.vram.usable_heap_size - dynamic_info->used_memory);
SET_GPUINFO_DYNAMIC(dynamic_info, mem_util_rate,
(dynamic_info->total_memory - dynamic_info->free_memory) * 100 / dynamic_info->total_memory);
}
// GPU temperature
if (libdrm_amdgpu_handle && _amdgpu_query_sensor_info)
last_libdrm_return_status =
_amdgpu_query_sensor_info(gpu_info->amdgpu_device, AMDGPU_INFO_SENSOR_GPU_TEMP, sizeof(out32), &out32);
else
last_libdrm_return_status = 1;
if (!last_libdrm_return_status) {
SET_GPUINFO_DYNAMIC(dynamic_info, gpu_temp, out32 / 1000);
}
// Fan speed
unsigned currentFanSpeed;
int patternsMatched = rewindAndReadPattern(gpu_info->fanSpeedFILE, "%u", &currentFanSpeed);
if (patternsMatched == 1) {
SET_GPUINFO_DYNAMIC(dynamic_info, fan_speed, currentFanSpeed * 100 / gpu_info->maxFanValue);
}
// Device power usage
if (libdrm_amdgpu_handle && _amdgpu_query_sensor_info)
last_libdrm_return_status =
_amdgpu_query_sensor_info(gpu_info->amdgpu_device, AMDGPU_INFO_SENSOR_GPU_AVG_POWER, sizeof(out32), &out32);
else
last_libdrm_return_status = 1;
if (!last_libdrm_return_status) {
SET_GPUINFO_DYNAMIC(dynamic_info, power_draw, out32 * 1000);
}
nvtop_pcie_link curr_link_characteristics;
int ret = nvtop_device_current_pcie_link(gpu_info->amdgpuDevice, &curr_link_characteristics);
if (ret >= 0) {
SET_GPUINFO_DYNAMIC(dynamic_info, pcie_link_width, curr_link_characteristics.width);
unsigned pcieGen = nvtop_pcie_gen_from_link_speed(curr_link_characteristics.speed);
SET_GPUINFO_DYNAMIC(dynamic_info, pcie_link_gen, pcieGen);
}
// PCIe bandwidth
if (gpu_info->PCIeBW) {
// According to https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/amd/pm/amdgpu_pm.c, under the pcie_bw
// section, we should be able to read the number of packets received and sent by the GPU and get the maximum payload
// size during the last second. This is untested but should work when the file is populated by the driver.
uint64_t received, transmitted;
int maxPayloadSize;
int NreadPatterns =
rewindAndReadPattern(gpu_info->PCIeBW, "%" SCNu64 " %" SCNu64 " %i", &received, &transmitted, &maxPayloadSize);
if (NreadPatterns == 3) {
received *= maxPayloadSize;
transmitted *= maxPayloadSize;
// Set in KiB
received /= 1024;
transmitted /= 1024;
SET_GPUINFO_DYNAMIC(dynamic_info, pcie_rx, received);
SET_GPUINFO_DYNAMIC(dynamic_info, pcie_tx, transmitted);
}
}
if (gpu_info->powerCap) {
// The power cap in microwatts
unsigned powerCap;
int NreadPatterns = rewindAndReadPattern(gpu_info->powerCap, "%u", &powerCap);
if (NreadPatterns == 1) {
SET_GPUINFO_DYNAMIC(dynamic_info, power_draw_max, powerCap / 1000);
}
}
}
// fdinfo keys recognised by parse_drm_fdinfo_amd(). Each *_old string belongs to the older
// amdgpu fdinfo format and the matching drm-* string to the newer one.
// The engine strings are matched as key prefixes; the pdev and vram strings are matched exactly.
static const char drm_amdgpu_pdev_old[] = "pdev";
static const char drm_amdgpu_vram_old[] = "vram mem";
static const char drm_amdgpu_vram[] = "drm-memory-vram";
static const char drm_amdgpu_gfx_old[] = "gfx";
static const char drm_amdgpu_gfx[] = "drm-engine-gfx";
static const char drm_amdgpu_compute_old[] = "compute";
static const char drm_amdgpu_compute[] = "drm-engine-compute";
static const char drm_amdgpu_dec_old[] = "dec";
static const char drm_amdgpu_dec[] = "drm-engine-dec";
static const char drm_amdgpu_enc_old[] = "enc";
static const char drm_amdgpu_enc[] = "drm-engine-enc";
// fdinfo callback of the AMDGPU backend (registered in gpuinfo_amdgpu_get_device_handles()).
// Reads the key/value lines of one fdinfo file and fills process_info with the memory usage and
// the engine usage found there.
// Returns false as soon as a pdev key names a different PCI device than this GPU; returns true
// in every other case, including when no recognised key was present.
// When a client id was parsed, the engine time counters are compared with the entry recorded for
// the same client during the previous update to compute usage values, and the new counters are
// stored in current_update_process_cache for the next update.
static bool parse_drm_fdinfo_amd(struct gpu_info *info, FILE *fdinfo_file, struct gpu_process *process_info) {
struct gpu_info_amdgpu *gpu_info = container_of(info, struct gpu_info_amdgpu, base);
// The line buffer is static: getline() grows it as needed and it is reused by later calls
static char *line = NULL;
static size_t line_buf_size = 0;
ssize_t count = 0;
bool client_id_set = false;
// Only meaningful when client_id_set is true
unsigned cid;
nvtop_time current_time;
nvtop_get_current_time(&current_time);
while ((count = getline(&line, &line_buf_size, fdinfo_file)) != -1) {
char *key, *val;
// Get rid of the newline if present
if (line[count - 1] == '\n') {
line[--count] = '\0';
}
// Lines that cannot be split into a key and a value are ignored
if (!extract_drm_fdinfo_key_value(line, &key, &val))
continue;
// see drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c amdgpu_show_fdinfo()
if (!strcmp(key, drm_amdgpu_pdev_old) || !strcmp(key, drm_pdev)) {
// This file descriptor belongs to another device: not ours to account
if (strcmp(val, gpu_info->base.pdev)) {
return false;
}
} else if (!strcmp(key, drm_client_id)) {
// Client id is a unique identifier. From the DRM documentation "Unique value relating to the open DRM
// file descriptor used to distinguish duplicated and shared file descriptors. Conceptually the value should map
// 1:1 to the in kernel representation of struct drm_file instances."
char *endptr;
cid = strtoul(val, &endptr, 10);
// Reject values with trailing characters after the number
if (*endptr)
continue;
client_id_set = true;
} else if (!strcmp(key, drm_amdgpu_vram_old) || !strcmp(key, drm_amdgpu_vram)) {
// TODO: do we count "gtt mem" too?
unsigned long mem_int;
char *endptr;
mem_int = strtoul(val, &endptr, 10);
// The value must be a number followed by exactly " kB" or " KiB"
if (endptr == val || (strcmp(endptr, " kB") && strcmp(endptr, " KiB")))
continue;
// The parsed value is multiplied by 1024 before being stored
SET_GPUINFO_PROCESS(process_info, gpu_memory_usage, mem_int * 1024);
} else {
// Engine keys are matched by prefix; the old-format keys are additionally required to end
// with a number (checked below)
bool is_gfx_old = !strncmp(key, drm_amdgpu_gfx_old, sizeof(drm_amdgpu_gfx_old) - 1);
bool is_compute_old = !strncmp(key, drm_amdgpu_compute_old, sizeof(drm_amdgpu_compute_old) - 1);
bool is_dec_old = !strncmp(key, drm_amdgpu_dec_old, sizeof(drm_amdgpu_dec_old) - 1);
bool is_enc_old = !strncmp(key, drm_amdgpu_enc_old, sizeof(drm_amdgpu_enc_old) - 1);
bool is_gfx_new = !strncmp(key, drm_amdgpu_gfx, sizeof(drm_amdgpu_gfx) - 1);
bool is_dec_new = !strncmp(key, drm_amdgpu_dec, sizeof(drm_amdgpu_dec) - 1);
bool is_enc_new = !strncmp(key, drm_amdgpu_enc, sizeof(drm_amdgpu_enc) - 1);
bool is_compute_new = !strncmp(key, drm_amdgpu_compute, sizeof(drm_amdgpu_compute) - 1);
if (is_gfx_old || is_compute_old || is_dec_old || is_enc_old) {
// The old interface exposes a usage percentage with an unknown update interval
unsigned int usage_percent_int;
char *key_off, *endptr;
double usage_percent;
// key_off points just past the matched prefix
if (is_gfx_old)
key_off = key + sizeof(drm_amdgpu_gfx_old) - 1;
else if (is_compute_old)
key_off = key + sizeof(drm_amdgpu_compute_old) - 1;
else if (is_dec_old)
key_off = key + sizeof(drm_amdgpu_dec_old) - 1;
else if (is_enc_old)
key_off = key + sizeof(drm_amdgpu_enc_old) - 1;
else
continue;
// The prefix should be followed by a number and only a number
if (!*key_off)
continue;
strtoul(key_off, &endptr, 10);
if (*endptr)
continue;
// The value is a percentage: a number immediately followed by "%"
usage_percent_int = (unsigned int)(usage_percent = round(strtod(val, &endptr)));
if (endptr == val || strcmp(endptr, "%"))
continue;
// Percentages of several keys of the same kind are added up; gfx and compute both feed gpu_usage
if (is_gfx_old) {
process_info->type |= gpu_process_graphical;
SET_GPUINFO_PROCESS(process_info, gpu_usage, process_info->gpu_usage + usage_percent_int);
} else if (is_compute_old) {
process_info->type |= gpu_process_compute;
SET_GPUINFO_PROCESS(process_info, gpu_usage, process_info->gpu_usage + usage_percent_int);
} else if (is_dec_old) {
SET_GPUINFO_PROCESS(process_info, decode_usage, process_info->decode_usage + usage_percent_int);
} else if (is_enc_old) {
SET_GPUINFO_PROCESS(process_info, encode_usage, process_info->encode_usage + usage_percent_int);
}
} else if (is_gfx_new || is_compute_new || is_dec_new || is_enc_new) {
char *endptr;
// The new interface reports a time counter: a number followed by exactly " ns"
uint64_t time_spent = strtoull(val, &endptr, 10);
if (endptr == val || strcmp(endptr, " ns"))
continue;
if (is_gfx_new) {
process_info->type |= gpu_process_graphical;
SET_GPUINFO_PROCESS(process_info, gfx_engine_used, time_spent);
} else if (is_compute_new) {
process_info->type |= gpu_process_compute;
SET_GPUINFO_PROCESS(process_info, compute_engine_used, time_spent);
} else if (is_enc_new) {
SET_GPUINFO_PROCESS(process_info, enc_engine_used, time_spent);
} else if (is_dec_new) {
SET_GPUINFO_PROCESS(process_info, dec_engine_used, time_spent);
}
}
}
}
// The AMDGPU fdinfo interface in kernels >=5.19 is way nicer; it provides the
// cumulative GPU engines (e.g., gfx, enc, dec) usage in nanoseconds.
// Previously, we displayed the usage provided in fdinfo by the kernel/driver
// which uses an internal update interval. Now, we can compute an accurate
// busy percentage since the last measurement.
if (client_id_set) {
struct amdgpu_process_info_cache *cache_entry;
struct unique_cache_id ucid = {.client_id = cid, .pid = process_info->pid, .pdev = gpu_info->base.pdev};
HASH_FIND_CLIENT(gpu_info->last_update_process_cache, &ucid, cache_entry);
if (cache_entry) {
// Known client: derive usage from the counter increase over the elapsed time. The entry is
// removed from the previous-update table and re-inserted into the current one below.
uint64_t time_elapsed = nvtop_difftime_u64(cache_entry->last_measurement_tstamp, current_time);
HASH_DEL(gpu_info->last_update_process_cache, cache_entry);
// For every engine, a usage is only computed when both samples are valid, the counter did
// not decrease and its increase does not exceed the elapsed time
if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used) &&
AMDGPU_CACHE_FIELD_VALID(cache_entry, gfx_engine_used) &&
// In some rare occasions, the gfx engine usage reported by the driver is lowering (might be a driver bug)
process_info->gfx_engine_used >= cache_entry->gfx_engine_used &&
process_info->gfx_engine_used - cache_entry->gfx_engine_used <= time_elapsed) {
SET_GPUINFO_PROCESS(process_info, gpu_usage,
busy_usage_from_time_usage_round(process_info->gfx_engine_used,
cache_entry->gfx_engine_used, time_elapsed));
}
if (GPUINFO_PROCESS_FIELD_VALID(process_info, compute_engine_used) &&
AMDGPU_CACHE_FIELD_VALID(cache_entry, compute_engine_used) &&
process_info->compute_engine_used >= cache_entry->compute_engine_used &&
process_info->compute_engine_used - cache_entry->compute_engine_used <= time_elapsed) {
// Compute usage is added on top of the gfx usage (if any) in the same gpu_usage field
unsigned gfx_usage = GPUINFO_PROCESS_FIELD_VALID(process_info, gpu_usage) ? process_info->gpu_usage : 0;
SET_GPUINFO_PROCESS(process_info, gpu_usage,
gfx_usage + busy_usage_from_time_usage_round(process_info->compute_engine_used,
cache_entry->compute_engine_used,
time_elapsed));
}
if (GPUINFO_PROCESS_FIELD_VALID(process_info, dec_engine_used) &&
AMDGPU_CACHE_FIELD_VALID(cache_entry, dec_engine_used) &&
process_info->dec_engine_used >= cache_entry->dec_engine_used &&
process_info->dec_engine_used - cache_entry->dec_engine_used <= time_elapsed) {
SET_GPUINFO_PROCESS(process_info, decode_usage,
busy_usage_from_time_usage_round(process_info->dec_engine_used,
cache_entry->dec_engine_used, time_elapsed));
}
if (GPUINFO_PROCESS_FIELD_VALID(process_info, enc_engine_used) &&
AMDGPU_CACHE_FIELD_VALID(cache_entry, enc_engine_used) &&
process_info->enc_engine_used >= cache_entry->enc_engine_used &&
process_info->enc_engine_used - cache_entry->enc_engine_used <= time_elapsed) {
SET_GPUINFO_PROCESS(process_info, encode_usage,
busy_usage_from_time_usage_round(process_info->enc_engine_used,
cache_entry->enc_engine_used, time_elapsed));
}
} else {
// First time this client is seen: no usage can be computed yet, only record the counters
cache_entry = calloc(1, sizeof(*cache_entry));
if (!cache_entry)
goto parse_fdinfo_exit;
cache_entry->client_id.client_id = cid;
cache_entry->client_id.pid = process_info->pid;
cache_entry->client_id.pdev = gpu_info->base.pdev;
}
#ifndef NDEBUG
// We should only process one fdinfo entry per client id per update
struct amdgpu_process_info_cache *cache_entry_check;
HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
assert(!cache_entry_check && "We should not be processing a client id twice per update");
#endif
// Store this measurement data
RESET_ALL(cache_entry->valid);
if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used))
SET_AMDGPU_CACHE(cache_entry, gfx_engine_used, process_info->gfx_engine_used);
if (GPUINFO_PROCESS_FIELD_VALID(process_info, compute_engine_used))
SET_AMDGPU_CACHE(cache_entry, compute_engine_used, process_info->compute_engine_used);
if (GPUINFO_PROCESS_FIELD_VALID(process_info, dec_engine_used))
SET_AMDGPU_CACHE(cache_entry, dec_engine_used, process_info->dec_engine_used);
if (GPUINFO_PROCESS_FIELD_VALID(process_info, enc_engine_used))
SET_AMDGPU_CACHE(cache_entry, enc_engine_used, process_info->enc_engine_used);
cache_entry->last_measurement_tstamp = current_time;
HASH_ADD_CLIENT(gpu_info->current_update_process_cache, cache_entry);
}
parse_fdinfo_exit:
return true;
}
// Rotate the process caches at the end of an update: whatever is still stored in the
// previous-update table is freed, then the table filled during this update takes its place and
// the current table starts empty again.
static void swap_process_cache_for_next_update(struct gpu_info_amdgpu *gpu_info) {
  struct amdgpu_process_info_cache *stale_entry, *next_entry;
  // Iterating an empty (NULL) table is a no-op, so no separate emptiness check is needed
  HASH_ITER(hh, gpu_info->last_update_process_cache, stale_entry, next_entry) {
    HASH_DEL(gpu_info->last_update_process_cache, stale_entry);
    free(stale_entry);
  }
  gpu_info->last_update_process_cache = gpu_info->current_update_process_cache;
  gpu_info->current_update_process_cache = NULL;
}
// Process refresh entry point of the AMDGPU backend.
// The per-process data itself is filled by the registered fdinfo callback, which avoids going
// through /proc multiple times per update for multiple GPUs; all that is left to do here is to
// rotate the process caches for the next update.
static void gpuinfo_amdgpu_get_running_processes(struct gpu_info *_gpu_info) {
  swap_process_cache_for_next_update(container_of(_gpu_info, struct gpu_info_amdgpu, base));
}