From 65ba47478c5a02b7260fd0a16ed1bd003482968e Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 29 Jul 2024 14:14:46 +0000 Subject: [PATCH 01/20] Begin adding AMD support. --- gpustat/core.py | 15 +++++--- gpustat/rocml.py | 99 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 6 deletions(-) create mode 100644 gpustat/rocml.py diff --git a/gpustat/core.py b/gpustat/core.py index 3ab9783..e3b33dd 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -30,9 +30,9 @@ from blessed import Terminal from gpustat import util -from gpustat import nvml -from gpustat.nvml import pynvml as N -from gpustat.nvml import check_driver_nvml_version +from gpustat import rocml as nvml +from gpustat import rocml as N +from gpustat.rocml import check_driver_nvml_version NOT_SUPPORTED = 'Not Supported' MB = 1024 * 1024 @@ -555,6 +555,7 @@ def _wrapped(*args, **kwargs): processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] + print(nv_comp_processes) # A single process might run in both of graphics and compute mode, # However we will display the process only once seen_pids = set() @@ -608,10 +609,12 @@ def _wrapped(*args, **kwargs): handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) - except N.NVMLError_Unknown as e: + except Exception as e: gpu_stat = InvalidGPU(index, "((Unknown Error))", e) - except N.NVMLError_GpuIsLost as e: - gpu_stat = InvalidGPU(index, "((GPU is lost))", e) + #except N.NVMLError_Unknown as e: + # gpu_stat = InvalidGPU(index, "((Unknown Error))", e) + #except N.NVMLError_GpuIsLost as e: + # gpu_stat = InvalidGPU(index, "((GPU is lost))", e) if isinstance(gpu_stat, InvalidGPU): log.add_exception("GPU %d" % index, gpu_stat.exception) diff --git a/gpustat/rocml.py b/gpustat/rocml.py new file mode 100644 index 0000000..405aa2c --- /dev/null +++ b/gpustat/rocml.py @@ -0,0 +1,99 @@ +"""Imports pynvml with sanity 
checks and custom patches.""" + +# pylint: disable=protected-access + +import atexit +import functools +import os +import sys +import textwrap +import warnings + +from collections import namedtuple + + +from pyrsmi import rocml + +NVML_TEMPERATURE_GPU = 1 + +def nvmlDeviceGetCount(): + return rocml.smi_get_device_count() + + +def nvmlDeviceGetHandleByIndex(dev): + return dev + +def nvmlDeviceGetIndex(dev): + return dev + +def nvmlDeviceGetName(dev): + return rocml.smi_get_device_name(dev) + +def nvmlDeviceGetUUID(dev): + return rocml.smi_get_device_uuid(dev) + +def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU): + return rocml.smi_get_device_temp(dev, loc) + +def nvmlSystemGetDriverVersion(): + return rocml.smi_get_kernel_version() + +def check_driver_nvml_version(driver_version_str: str): + return + +def nvmlDeviceGetFanSpeed(dev): + return None#rocml.smi_get_device_fan_speed(dev) + +MemoryInfo = namedtuple('MemoryInfo', ['total', 'used']) + +def nvmlDeviceGetMemoryInfo(dev): + return MemoryInfo(total=rocml.smi_get_device_memory_total(dev), used=rocml.smi_get_device_memory_used(dev)) + +UtilizationRates = namedtuple('UtilizationRates', ['gpu']) + +def nvmlDeviceGetUtilizationRates(dev): + return UtilizationRates(gpu=rocml.smi_get_device_utilization(dev)) + +def nvmlDeviceGetEncoderUtilization(dev): + return None + +def nvmlDeviceGetDecoderUtilization(dev): + return None + +def nvmlDeviceGetPowerUsage(dev): + return None#rocml.smi_get_device_average_power(dev) + +def nvmlDeviceGetEnforcedPowerLimit(dev): + return None + +ComputeProcess = namedtuple('ComputeProcess', ['pid']) + +def nvmlDeviceGetComputeRunningProcesses(dev): + return [ComputeProcess(pid=i) for i in rocml.smi_get_device_compute_process()] + +def nvmlDeviceGetGraphicsRunningProcesses(dev): + return None + +# Upon importing this module, let pynvml be initialized and remain active +# throughout the lifespan of the python process (until gpustat exists). 
+_initialized: bool +_init_error = None +try: + rocml.smi_initialize() + _initialized = True + + def _shutdown(): + rocml.smi_shutdown() + atexit.register(_shutdown) + +except pynvml.NVMLError as exc: + _initialized = False + _init_error = exc + + +def ensure_initialized(): + if not _initialized: + raise _init_error # type: ignore + + + From 261faf79e0fd4b669c167beee1f34932a3e4d982 Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 29 Jul 2024 14:22:31 +0000 Subject: [PATCH 02/20] Add pyrsmi dependency. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index c1a47f4..513d5ee 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,7 @@ def run(self): install_requires = [ 'nvidia-ml-py>=12.535.108', # see #107, #143, #161 + 'pyrsmi', #137 'psutil>=5.6.0', # GH-1447 'blessed>=1.17.1', # GH-126 'typing_extensions', From ca650baeef26579cfaf09fed1836150c294a1bb4 Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 29 Jul 2024 21:42:36 +0000 Subject: [PATCH 03/20] Add simple hardware switch functionality.
--- gpustat/core.py | 21 +++++++++++++-------- gpustat/rocml.py | 13 +++++++++++-- gpustat/util.py | 9 +++++++++ 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index e3b33dd..87c3dc4 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -30,9 +30,15 @@ from blessed import Terminal from gpustat import util -from gpustat import rocml as nvml -from gpustat import rocml as N -from gpustat.rocml import check_driver_nvml_version + +if util.hasNvidia(): + from gpustat import nvml + from gpustat.nvml import nvml as N + from gpustat.nvml import check_driver_nvml_version +else: + from gpustat import rocml as nvml + from gpustat import rocml as N + from gpustat.rocml import check_driver_nvml_version NOT_SUPPORTED = 'Not Supported' MB = 1024 * 1024 @@ -555,7 +561,6 @@ def _wrapped(*args, **kwargs): processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] - print(nv_comp_processes) # A single process might run in both of graphics and compute mode, # However we will display the process only once seen_pids = set() @@ -611,10 +616,10 @@ def _wrapped(*args, **kwargs): gpu_stat = GPUStat(gpu_info) except Exception as e: gpu_stat = InvalidGPU(index, "((Unknown Error))", e) - #except N.NVMLError_Unknown as e: - # gpu_stat = InvalidGPU(index, "((Unknown Error))", e) - #except N.NVMLError_GpuIsLost as e: - # gpu_stat = InvalidGPU(index, "((GPU is lost))", e) + except N.NVMLError_Unknown as e: + gpu_stat = InvalidGPU(index, "((Unknown Error))", e) + except N.NVMLError_GpuIsLost as e: + gpu_stat = InvalidGPU(index, "((GPU is lost))", e) if isinstance(gpu_stat, InvalidGPU): log.add_exception("GPU %d" % index, gpu_stat.exception) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 405aa2c..32a6d0b 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -1,4 +1,4 @@ -"""Imports pynvml with sanity checks and custom patches.""" +"""Imports pyrsmi and wraps it in a pynvml compatible 
interface.""" # pylint: disable=protected-access @@ -16,10 +16,19 @@ NVML_TEMPERATURE_GPU = 1 +class NVMLError_Unknown(Exception): + def __init__(self, message="An unknown ROCMLError has occurred"): + self.message = message + super().__init__(self.message) + +class NVMLError_GpuIsLost(Exception): + def __init__(self, message="ROCM Device is lost."): + self.message = message + super().__init__(self.message) + def nvmlDeviceGetCount(): return rocml.smi_get_device_count() - def nvmlDeviceGetHandleByIndex(dev): return dev diff --git a/gpustat/util.py b/gpustat/util.py index 7335dca..d865654 100644 --- a/gpustat/util.py +++ b/gpustat/util.py @@ -2,6 +2,7 @@ import collections import os.path +import subprocess import sys import traceback from typing import Callable, Tuple, Type, TypeVar, Union @@ -101,3 +102,11 @@ def report_summary(self, concise=True): self._write("{msg} -> Total {value} occurrences.".format( msg=msg, value=value)) self._write('') + + +def hasNvidia(): + try: + subprocess.check_output('nvidia-smi') + return True + except Exception: + return False From 5b229f81497e5b47e495275df43ac5b53983b1b4 Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 29 Jul 2024 21:43:29 +0000 Subject: [PATCH 04/20] Move default exception to end --- gpustat/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index 87c3dc4..e0a2804 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -614,12 +614,12 @@ def _wrapped(*args, **kwargs): handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) - except Exception as e: - gpu_stat = InvalidGPU(index, "((Unknown Error))", e) except N.NVMLError_Unknown as e: gpu_stat = InvalidGPU(index, "((Unknown Error))", e) except N.NVMLError_GpuIsLost as e: gpu_stat = InvalidGPU(index, "((GPU is lost))", e) + except Exception as e: + gpu_stat = InvalidGPU(index, "((Unknown Error))", e) if isinstance(gpu_stat, InvalidGPU): 
log.add_exception("GPU %d" % index, gpu_stat.exception) From 3c1a7443028fbcfc388e24d3cde548fedf02ed6b Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 29 Jul 2024 21:45:21 +0000 Subject: [PATCH 05/20] Typo --- gpustat/rocml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 32a6d0b..e99fd05 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -83,7 +83,7 @@ def nvmlDeviceGetComputeRunningProcesses(dev): def nvmlDeviceGetGraphicsRunningProcesses(dev): return None -# Upon importing this module, let pynvml be initialized and remain active +# Upon importing this module, let rocml be initialized and remain active # throughout the lifespan of the python process (until gpustat exists). _initialized: bool _init_error = None @@ -95,7 +95,7 @@ def _shutdown(): rocml.smi_shutdown() atexit.register(_shutdown) -except pynvml.NVMLError as exc: +except Exception as exc: _initialized = False _init_error = exc From 8ba8134316953b0a3ff3cf9c715eef42c4404c54 Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 29 Jul 2024 21:52:17 +0000 Subject: [PATCH 06/20] Default to nvidia. 
--- gpustat/core.py | 10 +++++----- gpustat/util.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index e0a2804..2fcbf12 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -31,14 +31,14 @@ from gpustat import util -if util.hasNvidia(): - from gpustat import nvml - from gpustat.nvml import nvml as N - from gpustat.nvml import check_driver_nvml_version -else: +if util.hasAMD(): from gpustat import rocml as nvml from gpustat import rocml as N from gpustat.rocml import check_driver_nvml_version +else: + from gpustat import nvml + from gpustat.nvml import nvml as N + from gpustat.nvml import check_driver_nvml_version NOT_SUPPORTED = 'Not Supported' MB = 1024 * 1024 diff --git a/gpustat/util.py b/gpustat/util.py index d865654..e6b0067 100644 --- a/gpustat/util.py +++ b/gpustat/util.py @@ -104,9 +104,9 @@ def report_summary(self, concise=True): self._write('') -def hasNvidia(): +def hasAMD(): try: - subprocess.check_output('nvidia-smi') + subprocess.check_output('rocm-smi') return True except Exception: return False From 9f07c495d52fb0e7fd81b88644e41b0f66816248 Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 29 Jul 2024 22:08:17 +0000 Subject: [PATCH 07/20] Typo... --- gpustat/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpustat/core.py b/gpustat/core.py index 2fcbf12..e251ed9 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -37,7 +37,7 @@ from gpustat.rocml import check_driver_nvml_version else: from gpustat import nvml - from gpustat.nvml import nvml as N + from gpustat.nvml import pynvml as N from gpustat.nvml import check_driver_nvml_version NOT_SUPPORTED = 'Not Supported' From 85d0dbf12ecf21a710a96ff3d6c787952b657dc0 Mon Sep 17 00:00:00 2001 From: brnelson Date: Tue, 30 Jul 2024 15:01:33 +0000 Subject: [PATCH 08/20] Hide output from rocml. 
--- gpustat/rocml.py | 49 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index e99fd05..3f81a41 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -26,8 +26,21 @@ def __init__(self, message="ROCM Device is lost."): self.message = message super().__init__(self.message) + +_stdout_dup = os.dup(1) +_stderr_dup = os.dup(2) +_silent_pipe = os.open(os.devnull, os.O_WRONLY) + +def silent_run(to_call, *args, **kwargs): + os.dup2(_silent_pipe, 1) + os.dup2(_silent_pipe, 2) + retval = to_call(*args, **kwargs) + os.dup2(_stdout_dup, 1) + os.dup2(_stderr_dup, 2) + return retval + def nvmlDeviceGetCount(): - return rocml.smi_get_device_count() + return silent_run(rocml.smi_get_device_count) def nvmlDeviceGetHandleByIndex(dev): return dev @@ -36,32 +49,45 @@ def nvmlDeviceGetIndex(dev): return dev def nvmlDeviceGetName(dev): - return rocml.smi_get_device_name(dev) + return silent_run(rocml.smi_get_device_name, dev) def nvmlDeviceGetUUID(dev): - return rocml.smi_get_device_uuid(dev) + return silent_run(rocml.smi_get_device_uuid, dev) def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU): - return rocml.smi_get_device_temp(dev, loc) + return silent_run(rocml.smi_get_device_temp, dev, loc) def nvmlSystemGetDriverVersion(): - return rocml.smi_get_kernel_version() + return silent_run(rocml.smi_get_kernel_version) def check_driver_nvml_version(driver_version_str: str): - return + """Show warnings when an incompatible driver is used.""" + + def safeint(v) -> int: + try: + return int(v) + except (ValueError, TypeError): + return 0 + + driver_version = tuple(safeint(v) for v in + driver_version_str.strip().split(".")) + + if driver_version < (6, 7, 8): + warnings.warn(f"This version of ROCM Driver {driver_version_str} is untested, ") def nvmlDeviceGetFanSpeed(dev): - return None#rocml.smi_get_device_fan_speed(dev) + return silent_run(rocml.smi_get_device_fan_speed, dev) 
MemoryInfo = namedtuple('MemoryInfo', ['total', 'used']) def nvmlDeviceGetMemoryInfo(dev): - return MemoryInfo(total=rocml.smi_get_device_memory_total(dev), used=rocml.smi_get_device_memory_used(dev)) + return MemoryInfo(total=silent_run(rocml.smi_get_device_memory_total, dev), + used=silent_run(rocml.smi_get_device_memory_used, dev)) UtilizationRates = namedtuple('UtilizationRates', ['gpu']) def nvmlDeviceGetUtilizationRates(dev): - return UtilizationRates(gpu=rocml.smi_get_device_utilization(dev)) + return UtilizationRates(gpu=silent_run(rocml.smi_get_device_utilization, dev)) def nvmlDeviceGetEncoderUtilization(dev): return None @@ -70,7 +96,7 @@ def nvmlDeviceGetDecoderUtilization(dev): return None def nvmlDeviceGetPowerUsage(dev): - return None#rocml.smi_get_device_average_power(dev) + return silent_run(rocml.smi_get_device_average_power, dev) def nvmlDeviceGetEnforcedPowerLimit(dev): return None @@ -78,7 +104,8 @@ def nvmlDeviceGetEnforcedPowerLimit(dev): ComputeProcess = namedtuple('ComputeProcess', ['pid']) def nvmlDeviceGetComputeRunningProcesses(dev): - return [ComputeProcess(pid=i) for i in rocml.smi_get_device_compute_process()] + processes = silent_run(rocml.smi_get_device_compute_process) + return [ComputeProcess(pid=i) for i in processes] def nvmlDeviceGetGraphicsRunningProcesses(dev): return None From 2c9aadff524e4d251f7bbf50e29567dde9d985fa Mon Sep 17 00:00:00 2001 From: brnelson Date: Tue, 30 Jul 2024 20:33:14 +0000 Subject: [PATCH 09/20] add frequency. 
--- gpustat/core.py | 15 +++++++++++++++ gpustat/rocml.py | 3 +++ 2 files changed, 18 insertions(+) diff --git a/gpustat/core.py b/gpustat/core.py index e251ed9..eeedb2e 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -206,6 +206,13 @@ def processes(self) -> Optional[List[ProcessInfo]]: """Get the list of running processes on the GPU.""" return self.entry['processes'] + @property + def clk_freq(self) -> Optional[int]: + """ + """ + v = self.entry['clk_freq'] + return int(v) if v is not None else None + def print_to(self, fp, *, with_colors=True, # deprecated arg show_cmd=False, @@ -335,6 +342,10 @@ def __getattr__(self, name): # type: ignore if show_power is True or 'limit' in show_power: _write(" / ") _write(rjustify(safe_self.power_limit, 3), ' W', color='CPowL') + + _write(", ") + _write(rjustify(safe_self.clk_freq, 3), color='CPowU') + _write(" MHz") # Memory _write(" | ") @@ -551,6 +562,10 @@ def _wrapped(*args, **kwargs): power_limit = safenvml(N.nvmlDeviceGetEnforcedPowerLimit)(handle) gpu_info['enforced.power.limit'] = power_limit // 1000 if power_limit is not None else None + # Frequency + freq = safenvml(N.nvmlDeviceGetClkFreq)(handle) + gpu_info['clk_freq'] = freq if freq is not None else None + # Processes nv_comp_processes = safenvml(N.nvmlDeviceGetComputeRunningProcesses)(handle) nv_graphics_processes = safenvml(N.nvmlDeviceGetGraphicsRunningProcesses)(handle) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 3f81a41..fadc8f8 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -110,6 +110,9 @@ def nvmlDeviceGetComputeRunningProcesses(dev): def nvmlDeviceGetGraphicsRunningProcesses(dev): return None +def nvmlDeviceGetClkFreq(dev): + return rocml.smi_get_device_freq(dev) + # Upon importing this module, let rocml be initialized and remain active # throughout the lifespan of the python process (until gpustat exists). 
_initialized: bool From cc2d0f03d3979d165cff3babb38f244715e889ce Mon Sep 17 00:00:00 2001 From: brnelson Date: Wed, 31 Jul 2024 19:51:55 +0000 Subject: [PATCH 10/20] Switching to amdsmi --- gpustat/core.py | 21 +++++++++++++++----- gpustat/rocml.py | 51 +++++++++++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 25 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index eeedb2e..f7cdd35 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -213,6 +213,13 @@ def clk_freq(self) -> Optional[int]: v = self.entry['clk_freq'] return int(v) if v is not None else None + @property + def clk_freq_max(self) -> Optional[int]: + """ + """ + v = self.entry['clk_freq_max'] + return int(v) if v is not None else None + def print_to(self, fp, *, with_colors=True, # deprecated arg show_cmd=False, @@ -345,6 +352,8 @@ def __getattr__(self, name): # type: ignore _write(", ") _write(rjustify(safe_self.clk_freq, 3), color='CPowU') + _write(" / ") + _write(rjustify(safe_self.clk_freq_max, 3), color='CPowU') _write(" MHz") # Memory @@ -472,7 +481,7 @@ def _decode(b: Union[str, bytes]) -> str: assert isinstance(b, str) return b - def get_gpu_info(handle: NVMLHandle) -> NvidiaGPUInfo: + def get_gpu_info(handle: NVMLHandle, index: int = None) -> NvidiaGPUInfo: """Get one GPU information specified by nvml handle""" def safepcall(fn: Callable[[], Any], error_value: Any): @@ -529,7 +538,7 @@ def _wrapped(*args, **kwargs): return _wrapped gpu_info = NvidiaGPUInfo() - gpu_info['index'] = N.nvmlDeviceGetIndex(handle) + gpu_info['index'] = N.nvmlDeviceGetIndex(handle) if index is None else index gpu_info['name'] = _decode(N.nvmlDeviceGetName(handle)) gpu_info['uuid'] = _decode(N.nvmlDeviceGetUUID(handle)) @@ -557,14 +566,16 @@ def _wrapped(*args, **kwargs): # Power power = safenvml(N.nvmlDeviceGetPowerUsage)(handle) - gpu_info['power.draw'] = power // 1000 if power is not None else None + gpu_info['power.draw'] = power if power is not None else None power_limit = 
safenvml(N.nvmlDeviceGetEnforcedPowerLimit)(handle) - gpu_info['enforced.power.limit'] = power_limit // 1000 if power_limit is not None else None + gpu_info['enforced.power.limit'] = power_limit if power_limit is not None else None # Frequency freq = safenvml(N.nvmlDeviceGetClkFreq)(handle) gpu_info['clk_freq'] = freq if freq is not None else None + freq_max = safenvml(N.nvmlDeviceGetClkFreqMax)(handle) + gpu_info['clk_freq_max'] = freq_max if freq_max is not None else None # Processes nv_comp_processes = safenvml(N.nvmlDeviceGetComputeRunningProcesses)(handle) @@ -627,7 +638,7 @@ def _wrapped(*args, **kwargs): for index in gpus_to_query: try: handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index) - gpu_info = get_gpu_info(handle) + gpu_info = get_gpu_info(handle, index) gpu_stat = GPUStat(gpu_info) except N.NVMLError_Unknown as e: gpu_stat = InvalidGPU(index, "((Unknown Error))", e) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index fadc8f8..06c7033 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -12,7 +12,7 @@ from collections import namedtuple -from pyrsmi import rocml +from amdsmi import * NVML_TEMPERATURE_GPU = 1 @@ -40,25 +40,25 @@ def silent_run(to_call, *args, **kwargs): return retval def nvmlDeviceGetCount(): - return silent_run(rocml.smi_get_device_count) + return len(amdsmi_get_processor_handles()) def nvmlDeviceGetHandleByIndex(dev): - return dev + return amdsmi_get_processor_handles()[dev] def nvmlDeviceGetIndex(dev): - return dev + return -1 def nvmlDeviceGetName(dev): - return silent_run(rocml.smi_get_device_name, dev) + return amdsmi_get_gpu_board_info(dev)["product_name"] def nvmlDeviceGetUUID(dev): - return silent_run(rocml.smi_get_device_uuid, dev) + return amdsmi_get_gpu_device_uuid(dev) def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU): - return silent_run(rocml.smi_get_device_temp, dev, loc) + return amdsmi_get_temp_metric(dev, AmdSmiTemperatureType.HOTSPOT, AmdSmiTemperatureMetric.CURRENT) def 
nvmlSystemGetDriverVersion(): - return silent_run(rocml.smi_get_kernel_version) + return "" def check_driver_nvml_version(driver_version_str: str): """Show warnings when an incompatible driver is used.""" @@ -76,18 +76,21 @@ def safeint(v) -> int: warnings.warn(f"This version of ROCM Driver {driver_version_str} is untested, ") def nvmlDeviceGetFanSpeed(dev): - return silent_run(rocml.smi_get_device_fan_speed, dev) + try: + return amdsmi_get_gpu_fan_speed(dev, 0) + except Exception: + return None MemoryInfo = namedtuple('MemoryInfo', ['total', 'used']) def nvmlDeviceGetMemoryInfo(dev): - return MemoryInfo(total=silent_run(rocml.smi_get_device_memory_total, dev), - used=silent_run(rocml.smi_get_device_memory_used, dev)) + return MemoryInfo(total=amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM), + used=amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM)) UtilizationRates = namedtuple('UtilizationRates', ['gpu']) def nvmlDeviceGetUtilizationRates(dev): - return UtilizationRates(gpu=silent_run(rocml.smi_get_device_utilization, dev)) + return UtilizationRates(gpu=amdsmi_get_gpu_activity(dev)["gfx_activity"]) def nvmlDeviceGetEncoderUtilization(dev): return None @@ -96,33 +99,41 @@ def nvmlDeviceGetDecoderUtilization(dev): return None def nvmlDeviceGetPowerUsage(dev): - return silent_run(rocml.smi_get_device_average_power, dev) + return amdsmi_get_power_info(dev)["current_socket_power"] def nvmlDeviceGetEnforcedPowerLimit(dev): - return None + return amdsmi_get_power_info(dev)["power_limit"] -ComputeProcess = namedtuple('ComputeProcess', ['pid']) +ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory']) def nvmlDeviceGetComputeRunningProcesses(dev): - processes = silent_run(rocml.smi_get_device_compute_process) - return [ComputeProcess(pid=i) for i in processes] + results = amdsmi_get_gpu_process_list(dev) + return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results] def nvmlDeviceGetGraphicsRunningProcesses(dev): return None def 
nvmlDeviceGetClkFreq(dev): - return rocml.smi_get_device_freq(dev) + result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS) + if "clk" in result: + return result["clk"] + else: + return result["cur_clk"] + +def nvmlDeviceGetClkFreqMax(dev): + result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS) + return result["max_clk"] # Upon importing this module, let rocml be initialized and remain active # throughout the lifespan of the python process (until gpustat exists). _initialized: bool _init_error = None try: - rocml.smi_initialize() + amdsmi_init() _initialized = True def _shutdown(): - rocml.smi_shutdown() + amdsmi_shut_down() atexit.register(_shutdown) except Exception as exc: From c2ea30e49539561e7c13101b70e15ef287cbf405 Mon Sep 17 00:00:00 2001 From: brnelson Date: Fri, 2 Aug 2024 19:47:27 +0000 Subject: [PATCH 11/20] Fix index lookup. --- gpustat/core.py | 6 +++--- gpustat/rocml.py | 21 ++++++++++++++++----- setup.py | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index f7cdd35..e05e33e 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -481,7 +481,7 @@ def _decode(b: Union[str, bytes]) -> str: assert isinstance(b, str) return b - def get_gpu_info(handle: NVMLHandle, index: int = None) -> NvidiaGPUInfo: + def get_gpu_info(handle: NVMLHandle) -> NvidiaGPUInfo: """Get one GPU information specified by nvml handle""" def safepcall(fn: Callable[[], Any], error_value: Any): @@ -538,7 +538,7 @@ def _wrapped(*args, **kwargs): return _wrapped gpu_info = NvidiaGPUInfo() - gpu_info['index'] = N.nvmlDeviceGetIndex(handle) if index is None else index + gpu_info['index'] = N.nvmlDeviceGetIndex(handle) gpu_info['name'] = _decode(N.nvmlDeviceGetName(handle)) gpu_info['uuid'] = _decode(N.nvmlDeviceGetUUID(handle)) @@ -638,7 +638,7 @@ def _wrapped(*args, **kwargs): for index in gpus_to_query: try: handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index) - gpu_info = get_gpu_info(handle, index) + gpu_info = 
get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) except N.NVMLError_Unknown as e: gpu_stat = InvalidGPU(index, "((Unknown Error))", e) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 06c7033..3cb4c0a 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -1,4 +1,4 @@ -"""Imports pyrsmi and wraps it in a pynvml compatible interface.""" +"""Imports amdsmi and wraps it in a pynvml compatible interface.""" # pylint: disable=protected-access @@ -16,8 +16,13 @@ NVML_TEMPERATURE_GPU = 1 +class NVMLError(Exception): + def __init__(self, message="ROCM Error"): + self.message = message + super().__init__(self.message) + class NVMLError_Unknown(Exception): - def __init__(self, message="An unknown ROCMLError has occurred"): + def __init__(self, message="An unknown ROCM Error has occurred"): self.message = message super().__init__(self.message) @@ -46,6 +51,9 @@ def nvmlDeviceGetHandleByIndex(dev): return amdsmi_get_processor_handles()[dev] def nvmlDeviceGetIndex(dev): + for i, handle in enumerate(amdsmi_get_processor_handles()): + if amdsmi_get_gpu_device_bdf(dev) == amdsmi_get_gpu_device_bdf(handle): + return i return -1 def nvmlDeviceGetName(dev): @@ -107,8 +115,11 @@ def nvmlDeviceGetEnforcedPowerLimit(dev): ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory']) def nvmlDeviceGetComputeRunningProcesses(dev): - results = amdsmi_get_gpu_process_list(dev) - return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results] + try: + results = amdsmi_get_gpu_process_list(dev) + return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results] + except Exception: + return [] def nvmlDeviceGetGraphicsRunningProcesses(dev): return None @@ -124,7 +135,7 @@ def nvmlDeviceGetClkFreqMax(dev): result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS) return result["max_clk"] -# Upon importing this module, let rocml be initialized and remain active +# Upon importing this module, let amdsmi be initialized and remain active # throughout the lifespan of 
the python process (until gpustat exists). _initialized: bool _init_error = None diff --git a/setup.py b/setup.py index 513d5ee..553cda3 100644 --- a/setup.py +++ b/setup.py @@ -91,7 +91,7 @@ def run(self): install_requires = [ 'nvidia-ml-py>=12.535.108', # see #107, #143, #161 - 'pyrsmi', #137 + 'amdsmi', #137 'psutil>=5.6.0', # GH-1447 'blessed>=1.17.1', # GH-126 'typing_extensions', From 3e0c2b13aaf9e5c549050f3983e007244dc90496 Mon Sep 17 00:00:00 2001 From: brnelson Date: Fri, 2 Aug 2024 19:59:54 +0000 Subject: [PATCH 12/20] Remove frequency stuff for now. --- gpustat/core.py | 30 ++---------------------------- gpustat/rocml.py | 12 ++++++------ 2 files changed, 8 insertions(+), 34 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index e05e33e..e251ed9 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -206,20 +206,6 @@ def processes(self) -> Optional[List[ProcessInfo]]: """Get the list of running processes on the GPU.""" return self.entry['processes'] - @property - def clk_freq(self) -> Optional[int]: - """ - """ - v = self.entry['clk_freq'] - return int(v) if v is not None else None - - @property - def clk_freq_max(self) -> Optional[int]: - """ - """ - v = self.entry['clk_freq_max'] - return int(v) if v is not None else None - def print_to(self, fp, *, with_colors=True, # deprecated arg show_cmd=False, @@ -349,12 +335,6 @@ def __getattr__(self, name): # type: ignore if show_power is True or 'limit' in show_power: _write(" / ") _write(rjustify(safe_self.power_limit, 3), ' W', color='CPowL') - - _write(", ") - _write(rjustify(safe_self.clk_freq, 3), color='CPowU') - _write(" / ") - _write(rjustify(safe_self.clk_freq_max, 3), color='CPowU') - _write(" MHz") # Memory _write(" | ") @@ -566,16 +546,10 @@ def _wrapped(*args, **kwargs): # Power power = safenvml(N.nvmlDeviceGetPowerUsage)(handle) - gpu_info['power.draw'] = power if power is not None else None + gpu_info['power.draw'] = power // 1000 if power is not None else None power_limit = 
safenvml(N.nvmlDeviceGetEnforcedPowerLimit)(handle) - gpu_info['enforced.power.limit'] = power_limit if power_limit is not None else None - - # Frequency - freq = safenvml(N.nvmlDeviceGetClkFreq)(handle) - gpu_info['clk_freq'] = freq if freq is not None else None - freq_max = safenvml(N.nvmlDeviceGetClkFreqMax)(handle) - gpu_info['clk_freq_max'] = freq_max if freq_max is not None else None + gpu_info['enforced.power.limit'] = power_limit // 1000 if power_limit is not None else None # Processes nv_comp_processes = safenvml(N.nvmlDeviceGetComputeRunningProcesses)(handle) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 3cb4c0a..c67e857 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -107,10 +107,10 @@ def nvmlDeviceGetDecoderUtilization(dev): return None def nvmlDeviceGetPowerUsage(dev): - return amdsmi_get_power_info(dev)["current_socket_power"] + return amdsmi_get_power_info(dev)["current_socket_power"] * 1000 def nvmlDeviceGetEnforcedPowerLimit(dev): - return amdsmi_get_power_info(dev)["power_limit"] + return amdsmi_get_power_info(dev)["power_limit"] * 1000 ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory']) @@ -124,15 +124,15 @@ def nvmlDeviceGetComputeRunningProcesses(dev): def nvmlDeviceGetGraphicsRunningProcesses(dev): return None -def nvmlDeviceGetClkFreq(dev): - result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS) +def nvmlDeviceGetClockInfo(dev, clk_type=AmdSmiClkType.SYS): + result = amdsmi_get_clock_info(dev, clk_type) if "clk" in result: return result["clk"] else: return result["cur_clk"] -def nvmlDeviceGetClkFreqMax(dev): - result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS) +def nvmlDeviceGetMaxClockInfo(dev, clk_type=AmdSmiClkType.SYS): + result = amdsmi_get_clock_info(dev, clk_type) return result["max_clk"] # Upon importing this module, let amdsmi be initialized and remain active From 173d14420a7c3a8f4d2bf6a802b560a2bed88be3 Mon Sep 17 00:00:00 2001 From: brnelson Date: Fri, 2 Aug 2024 20:07:53 +0000 Subject: 
[PATCH 13/20] Check for amdsmi. --- gpustat/rocml.py | 30 +++++++++++++++++++++++++++++- setup.py | 1 - 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index c67e857..9e51590 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -12,7 +12,35 @@ from collections import namedtuple -from amdsmi import * + + + +try: + # Check for amdsmi. + from amdsmi import * +except (ImportError, SyntaxError, RuntimeError) as e: + _amdsmi = sys.modules.get('amdsmi', None) + + raise ImportError(textwrap.dedent( + """\ + amdsmi is missing or an outdated version is installed. + + The root cause: """ + str(e) + + """ + + Your pynvml installation: """ + repr(_amdsmi) + + """ + + ----------------------------------------------------------- + (Suggested Fix) Please install amdsmi. + It should be installed with amdgpu. But if not, please see: + https://github.com/ROCm/amdsmi#manualmultiple-rocm-instance-python-library-install + + apt install amd-smi-lib + cd /opt/rocm/share/amd_smi + python3 -m pip install --upgrade pip + python3 -m pip install --user . 
+ """)) from e NVML_TEMPERATURE_GPU = 1 diff --git a/setup.py b/setup.py index 553cda3..c1a47f4 100644 --- a/setup.py +++ b/setup.py @@ -91,7 +91,6 @@ def run(self): install_requires = [ 'nvidia-ml-py>=12.535.108', # see #107, #143, #161 - 'amdsmi', #137 'psutil>=5.6.0', # GH-1447 'blessed>=1.17.1', # GH-126 'typing_extensions', From 800bd0dd3ed456dc36d6df0ff1d2656b98230dbf Mon Sep 17 00:00:00 2001 From: brnelson Date: Fri, 2 Aug 2024 20:16:44 +0000 Subject: [PATCH 14/20] Get driver version --- gpustat/rocml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 9e51590..c40e05d 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -94,7 +94,7 @@ def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU): return amdsmi_get_temp_metric(dev, AmdSmiTemperatureType.HOTSPOT, AmdSmiTemperatureMetric.CURRENT) def nvmlSystemGetDriverVersion(): - return "" + return amdsmi_get_gpu_driver_info(amdsmi_get_processor_handles()[0])["driver_version"] def check_driver_nvml_version(driver_version_str: str): """Show warnings when an incompatible driver is used.""" From bf1a00a4fc426e6c2b01b97519a2ce83861781da Mon Sep 17 00:00:00 2001 From: brnelson Date: Fri, 2 Aug 2024 20:23:34 +0000 Subject: [PATCH 15/20] Format new file. --- gpustat/rocml.py | 68 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index c40e05d..d44ce86 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -11,25 +11,24 @@ from collections import namedtuple - - - - try: # Check for amdsmi. from amdsmi import * except (ImportError, SyntaxError, RuntimeError) as e: - _amdsmi = sys.modules.get('amdsmi', None) + _amdsmi = sys.modules.get("amdsmi", None) - raise ImportError(textwrap.dedent( - """\ + raise ImportError( + textwrap.dedent( + """\ amdsmi is missing or an outdated version is installed. 
- The root cause: """ + str(e) + - """ + The root cause: """ + + str(e) + + """ - Your pynvml installation: """ + repr(_amdsmi) + - """ + Your pynvml installation: """ + + repr(_amdsmi) + + """ ----------------------------------------------------------- (Suggested Fix) Please install amdsmi. @@ -40,20 +39,25 @@ cd /opt/rocm/share/amd_smi python3 -m pip install --upgrade pip python3 -m pip install --user . - """)) from e + """ + ) + ) from e NVML_TEMPERATURE_GPU = 1 + class NVMLError(Exception): def __init__(self, message="ROCM Error"): self.message = message super().__init__(self.message) + class NVMLError_Unknown(Exception): def __init__(self, message="An unknown ROCM Error has occurred"): self.message = message super().__init__(self.message) + class NVMLError_GpuIsLost(Exception): def __init__(self, message="ROCM Device is lost."): self.message = message @@ -64,6 +68,7 @@ def __init__(self, message="ROCM Device is lost."): _stderr_dup = os.dup(2) _silent_pipe = os.open(os.devnull, os.O_WRONLY) + def silent_run(to_call, *args, **kwargs): os.dup2(_silent_pipe, 1) os.dup2(_silent_pipe, 2) @@ -72,30 +77,38 @@ def silent_run(to_call, *args, **kwargs): os.dup2(_stderr_dup, 2) return retval + def nvmlDeviceGetCount(): return len(amdsmi_get_processor_handles()) + def nvmlDeviceGetHandleByIndex(dev): return amdsmi_get_processor_handles()[dev] + def nvmlDeviceGetIndex(dev): for i, handle in enumerate(amdsmi_get_processor_handles()): if amdsmi_get_gpu_device_bdf(dev) == amdsmi_get_gpu_device_bdf(handle): return i return -1 + def nvmlDeviceGetName(dev): return amdsmi_get_gpu_board_info(dev)["product_name"] + def nvmlDeviceGetUUID(dev): return amdsmi_get_gpu_device_uuid(dev) + def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU): return amdsmi_get_temp_metric(dev, AmdSmiTemperatureType.HOTSPOT, AmdSmiTemperatureMetric.CURRENT) + def nvmlSystemGetDriverVersion(): return amdsmi_get_gpu_driver_info(amdsmi_get_processor_handles()[0])["driver_version"] + def 
check_driver_nvml_version(driver_version_str: str): """Show warnings when an incompatible driver is used.""" @@ -105,42 +118,51 @@ def safeint(v) -> int: except (ValueError, TypeError): return 0 - driver_version = tuple(safeint(v) for v in - driver_version_str.strip().split(".")) + driver_version = tuple(safeint(v) for v in driver_version_str.strip().split(".")) if driver_version < (6, 7, 8): warnings.warn(f"This version of ROCM Driver {driver_version_str} is untested, ") + def nvmlDeviceGetFanSpeed(dev): try: return amdsmi_get_gpu_fan_speed(dev, 0) except Exception: return None -MemoryInfo = namedtuple('MemoryInfo', ['total', 'used']) + +MemoryInfo = namedtuple("MemoryInfo", ["total", "used"]) + def nvmlDeviceGetMemoryInfo(dev): - return MemoryInfo(total=amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM), - used=amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM)) + return MemoryInfo(total=amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM), used=amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM)) + + +UtilizationRates = namedtuple("UtilizationRates", ["gpu"]) -UtilizationRates = namedtuple('UtilizationRates', ['gpu']) def nvmlDeviceGetUtilizationRates(dev): return UtilizationRates(gpu=amdsmi_get_gpu_activity(dev)["gfx_activity"]) + def nvmlDeviceGetEncoderUtilization(dev): return None + def nvmlDeviceGetDecoderUtilization(dev): return None + def nvmlDeviceGetPowerUsage(dev): return amdsmi_get_power_info(dev)["current_socket_power"] * 1000 + def nvmlDeviceGetEnforcedPowerLimit(dev): return amdsmi_get_power_info(dev)["power_limit"] * 1000 -ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory']) + +ComputeProcess = namedtuple("ComputeProcess", ["pid", "usedGpuMemory"]) + def nvmlDeviceGetComputeRunningProcesses(dev): try: @@ -149,9 +171,11 @@ def nvmlDeviceGetComputeRunningProcesses(dev): except Exception: return [] + def nvmlDeviceGetGraphicsRunningProcesses(dev): return None + def nvmlDeviceGetClockInfo(dev, 
clk_type=AmdSmiClkType.SYS): result = amdsmi_get_clock_info(dev, clk_type) if "clk" in result: @@ -159,10 +183,12 @@ def nvmlDeviceGetClockInfo(dev, clk_type=AmdSmiClkType.SYS): else: return result["cur_clk"] + def nvmlDeviceGetMaxClockInfo(dev, clk_type=AmdSmiClkType.SYS): result = amdsmi_get_clock_info(dev, clk_type) return result["max_clk"] + # Upon importing this module, let amdsmi be initialized and remain active # throughout the lifespan of the python process (until gpustat exists). _initialized: bool @@ -173,6 +199,7 @@ def nvmlDeviceGetMaxClockInfo(dev, clk_type=AmdSmiClkType.SYS): def _shutdown(): amdsmi_shut_down() + atexit.register(_shutdown) except Exception as exc: @@ -183,6 +210,3 @@ def _shutdown(): def ensure_initialized(): if not _initialized: raise _init_error # type: ignore - - - From 6b731eb6d46254aa9450f0408a3075b4345f98ed Mon Sep 17 00:00:00 2001 From: brnelson Date: Mon, 5 Aug 2024 20:07:23 +0000 Subject: [PATCH 16/20] Typo. --- gpustat/rocml.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index d44ce86..8880686 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -26,7 +26,7 @@ + str(e) + """ - Your pynvml installation: """ + Your amdsmi installation: """ + repr(_amdsmi) + """ @@ -165,11 +165,8 @@ def nvmlDeviceGetEnforcedPowerLimit(dev): def nvmlDeviceGetComputeRunningProcesses(dev): - try: - results = amdsmi_get_gpu_process_list(dev) - return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results] - except Exception: - return [] + results = amdsmi_get_gpu_process_list(dev) + return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results] def nvmlDeviceGetGraphicsRunningProcesses(dev): From 1a0922255d255fca4455b193c349b790e88814b7 Mon Sep 17 00:00:00 2001 From: brnelson Date: Thu, 8 Aug 2024 14:12:28 +0000 Subject: [PATCH 17/20] Switch to rocmi. 
--- gpustat/rocml.py | 127 ++++++++++++++++++++++------------------------- setup.py | 1 + 2 files changed, 61 insertions(+), 67 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 8880686..509dd3c 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -1,4 +1,4 @@ -"""Imports amdsmi and wraps it in a pynvml compatible interface.""" +"""Imports rocmi and wraps it in a pynvml compatible interface.""" # pylint: disable=protected-access @@ -12,33 +12,26 @@ from collections import namedtuple try: - # Check for amdsmi. - from amdsmi import * + # Check for rocmi. + import rocmi except (ImportError, SyntaxError, RuntimeError) as e: - _amdsmi = sys.modules.get("amdsmi", None) + _rocmi = sys.modules.get("rocmi", None) raise ImportError( textwrap.dedent( """\ - amdsmi is missing or an outdated version is installed. + rocmi is missing or an outdated version is installed. The root cause: """ + str(e) + """ - Your amdsmi installation: """ - + repr(_amdsmi) + Your rocmi installation: """ + + repr(_rocmi) + """ ----------------------------------------------------------- - (Suggested Fix) Please install amdsmi. - It should be installed with amdgpu. But if not, please see: - https://github.com/ROCm/amdsmi#manualmultiple-rocm-instance-python-library-install - - apt install amd-smi-lib - cd /opt/rocm/share/amd_smi - python3 -m pip install --upgrade pip - python3 -m pip install --user . + (Suggested Fix) Please install rocmi using pip. 
""" ) ) from e @@ -64,49 +57,37 @@ def __init__(self, message="ROCM Device is lost."): super().__init__(self.message) -_stdout_dup = os.dup(1) -_stderr_dup = os.dup(2) -_silent_pipe = os.open(os.devnull, os.O_WRONLY) - - -def silent_run(to_call, *args, **kwargs): - os.dup2(_silent_pipe, 1) - os.dup2(_silent_pipe, 2) - retval = to_call(*args, **kwargs) - os.dup2(_stdout_dup, 1) - os.dup2(_stderr_dup, 2) - return retval - - def nvmlDeviceGetCount(): - return len(amdsmi_get_processor_handles()) + return len(rocmi.get_devices()) def nvmlDeviceGetHandleByIndex(dev): - return amdsmi_get_processor_handles()[dev] + return rocmi.get_devices()[dev] -def nvmlDeviceGetIndex(dev): - for i, handle in enumerate(amdsmi_get_processor_handles()): - if amdsmi_get_gpu_device_bdf(dev) == amdsmi_get_gpu_device_bdf(handle): +def nvmlDeviceGetIndex(handle): + for i, d in enumerate(rocmi.get_devices()): + if d.bus_id == handle.bus_id: return i + return -1 -def nvmlDeviceGetName(dev): - return amdsmi_get_gpu_board_info(dev)["product_name"] +def nvmlDeviceGetName(handle): + return handle.name -def nvmlDeviceGetUUID(dev): - return amdsmi_get_gpu_device_uuid(dev) +def nvmlDeviceGetUUID(handle): + return handle.unique_id -def nvmlDeviceGetTemperature(dev, loc=NVML_TEMPERATURE_GPU): - return amdsmi_get_temp_metric(dev, AmdSmiTemperatureType.HOTSPOT, AmdSmiTemperatureMetric.CURRENT) +def nvmlDeviceGetTemperature(handle, loc=NVML_TEMPERATURE_GPU): + metrics = handle.get_metrics() + return metrics.temperature_hotspot def nvmlSystemGetDriverVersion(): - return amdsmi_get_gpu_driver_info(amdsmi_get_processor_handles()[0])["driver_version"] + return "" def check_driver_nvml_version(driver_version_str: str): @@ -120,29 +101,38 @@ def safeint(v) -> int: driver_version = tuple(safeint(v) for v in driver_version_str.strip().split(".")) + if len(driver_version) == 0 or driver_version <= (0,): + return if driver_version < (6, 7, 8): warnings.warn(f"This version of ROCM Driver {driver_version_str} is 
untested, ") -def nvmlDeviceGetFanSpeed(dev): +def nvmlDeviceGetFanSpeed(handle): try: - return amdsmi_get_gpu_fan_speed(dev, 0) - except Exception: + speed = handle.get_metrics().current_fan_speed + except AttributeError: return None + return speed + MemoryInfo = namedtuple("MemoryInfo", ["total", "used"]) -def nvmlDeviceGetMemoryInfo(dev): - return MemoryInfo(total=amdsmi_get_gpu_memory_total(dev, AmdSmiMemoryType.VRAM), used=amdsmi_get_gpu_memory_usage(dev, AmdSmiMemoryType.VRAM)) +def nvmlDeviceGetMemoryInfo(handle): + + return MemoryInfo( + total=handle.vram_total, + used=handle.vram_used, + ) UtilizationRates = namedtuple("UtilizationRates", ["gpu"]) -def nvmlDeviceGetUtilizationRates(dev): - return UtilizationRates(gpu=amdsmi_get_gpu_activity(dev)["gfx_activity"]) +def nvmlDeviceGetUtilizationRates(handle): + metrics = handle.get_metrics() + return UtilizationRates(gpu=metrics.average_gfx_activity) def nvmlDeviceGetEncoderUtilization(dev): @@ -153,49 +143,52 @@ def nvmlDeviceGetDecoderUtilization(dev): return None -def nvmlDeviceGetPowerUsage(dev): - return amdsmi_get_power_info(dev)["current_socket_power"] * 1000 +def nvmlDeviceGetPowerUsage(handle): + return handle.current_power / 1000000 -def nvmlDeviceGetEnforcedPowerLimit(dev): - return amdsmi_get_power_info(dev)["power_limit"] * 1000 +def nvmlDeviceGetEnforcedPowerLimit(handle): + return handle.power_limit / 1000000 ComputeProcess = namedtuple("ComputeProcess", ["pid", "usedGpuMemory"]) -def nvmlDeviceGetComputeRunningProcesses(dev): - results = amdsmi_get_gpu_process_list(dev) - return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results] +def nvmlDeviceGetComputeRunningProcesses(handle): + results = handle.get_processes() + return [ComputeProcess(pid=x.pid, usedGpuMemory=x.vram_usage) for x in results] def nvmlDeviceGetGraphicsRunningProcesses(dev): return None -def nvmlDeviceGetClockInfo(dev, clk_type=AmdSmiClkType.SYS): - result = amdsmi_get_clock_info(dev, clk_type) - if "clk" in 
result: - return result["clk"] - else: - return result["cur_clk"] +def nvmlDeviceGetClockInfo(handle): + metrics = handle.get_metrics() + + try: + clk = metrics.current_gfxclks[0] + except AttributeError: + clk = metrics.current_gfxclk + + return clk -def nvmlDeviceGetMaxClockInfo(dev, clk_type=AmdSmiClkType.SYS): - result = amdsmi_get_clock_info(dev, clk_type) - return result["max_clk"] +def nvmlDeviceGetMaxClockInfo(handle): + return handle.get_clock_info()[-1] -# Upon importing this module, let amdsmi be initialized and remain active +# Upon importing this module, let rocmi be initialized and remain active # throughout the lifespan of the python process (until gpustat exists). _initialized: bool _init_error = None try: - amdsmi_init() + # rocmi_init() No init required. _initialized = True def _shutdown(): - amdsmi_shut_down() + # rocmi_shut_down() No shutdown required. + pass atexit.register(_shutdown) diff --git a/setup.py b/setup.py index c1a47f4..f3e6a7d 100644 --- a/setup.py +++ b/setup.py @@ -91,6 +91,7 @@ def run(self): install_requires = [ 'nvidia-ml-py>=12.535.108', # see #107, #143, #161 + 'rocmi>=0.2', # see #137 'psutil>=5.6.0', # GH-1447 'blessed>=1.17.1', # GH-126 'typing_extensions', From dfce6995edcc44eec8e1efe0b7ac2ad111852c83 Mon Sep 17 00:00:00 2001 From: brnelson Date: Thu, 8 Aug 2024 14:56:33 +0000 Subject: [PATCH 18/20] Cleanup unneeded code. 
--- gpustat/core.py | 2 +- gpustat/rocml.py | 28 ++-------------------------- gpustat/util.py | 2 +- 3 files changed, 4 insertions(+), 28 deletions(-) diff --git a/gpustat/core.py b/gpustat/core.py index e251ed9..2e43fbb 100644 --- a/gpustat/core.py +++ b/gpustat/core.py @@ -31,7 +31,7 @@ from gpustat import util -if util.hasAMD(): +if util.has_AMD(): from gpustat import rocml as nvml from gpustat import rocml as N from gpustat.rocml import check_driver_nvml_version diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 509dd3c..bfe07a9 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -1,10 +1,5 @@ """Imports rocmi and wraps it in a pynvml compatible interface.""" -# pylint: disable=protected-access - -import atexit -import functools -import os import sys import textwrap import warnings @@ -178,25 +173,6 @@ def nvmlDeviceGetMaxClockInfo(handle): return handle.get_clock_info()[-1] -# Upon importing this module, let rocmi be initialized and remain active -# throughout the lifespan of the python process (until gpustat exists). -_initialized: bool -_init_error = None -try: - # rocmi_init() No init required. - _initialized = True - - def _shutdown(): - # rocmi_shut_down() No shutdown required. - pass - - atexit.register(_shutdown) - -except Exception as exc: - _initialized = False - _init_error = exc - - +# rocmi does not require initialization def ensure_initialized(): - if not _initialized: - raise _init_error # type: ignore + pass diff --git a/gpustat/util.py b/gpustat/util.py index e6b0067..22f6fdd 100644 --- a/gpustat/util.py +++ b/gpustat/util.py @@ -104,7 +104,7 @@ def report_summary(self, concise=True): self._write('') -def hasAMD(): +def has_AMD(): try: subprocess.check_output('rocm-smi') return True From f1abc19fffae43a3eb020ce984ff919f46b3b244 Mon Sep 17 00:00:00 2001 From: brnelson Date: Thu, 8 Aug 2024 17:03:27 +0000 Subject: [PATCH 19/20] Add driver version. 
--- gpustat/rocml.py | 5 ++++- setup.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index bfe07a9..9679283 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -82,7 +82,10 @@ def nvmlDeviceGetTemperature(handle, loc=NVML_TEMPERATURE_GPU): def nvmlSystemGetDriverVersion(): - return "" + retval = rocmi.get_driver_version() + if retval is None: + return "" + return retval def check_driver_nvml_version(driver_version_str: str): diff --git a/setup.py b/setup.py index f3e6a7d..f2f908e 100644 --- a/setup.py +++ b/setup.py @@ -91,7 +91,7 @@ def run(self): install_requires = [ 'nvidia-ml-py>=12.535.108', # see #107, #143, #161 - 'rocmi>=0.2', # see #137 + 'rocmi>=0.3', # see #137 'psutil>=5.6.0', # GH-1447 'blessed>=1.17.1', # GH-126 'typing_extensions', From 9a2e2af8b8418ded2ea4cfde54a37062a43338a6 Mon Sep 17 00:00:00 2001 From: brnelson Date: Thu, 8 Aug 2024 18:40:05 +0000 Subject: [PATCH 20/20] Fix power divisor. --- gpustat/rocml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpustat/rocml.py b/gpustat/rocml.py index 9679283..1a6ddf3 100644 --- a/gpustat/rocml.py +++ b/gpustat/rocml.py @@ -142,11 +142,11 @@ def nvmlDeviceGetDecoderUtilization(dev): def nvmlDeviceGetPowerUsage(handle): - return handle.current_power / 1000000 + return handle.current_power / 1000 def nvmlDeviceGetEnforcedPowerLimit(handle): - return handle.power_limit / 1000000 + return handle.power_limit / 1000 ComputeProcess = namedtuple("ComputeProcess", ["pid", "usedGpuMemory"])