Merge work done by @meatposes

This commit is contained in:
nicolargo
2025-12-20 16:00:14 +01:00
parent f587b281c0
commit 8eea421faf
3 changed files with 303 additions and 2 deletions

View File

@@ -182,6 +182,10 @@ mem_critical=90
temperature_careful=60
temperature_warning=70
temperature_critical=80
# Ignore specific Intel GPU devices by ID (comma-separated)
# Use 'xpu-smi discovery' to see device IDs
# Example: ignore device 2 (typically the iGPU)
#ignore_devices=2
[mem]
disable=False

View File

@@ -3,6 +3,7 @@
#
# Copyright (C) 2020 Kirby Banman <kirby.banman@gmail.com>
# Copyright (C) 2024 Nicolas Hennion <nicolashennion@gmail.com>
# Intel GPU support added (poorly) 2025 by <computerdork@verion.net>
#
# SPDX-License-Identifier: LGPL-3.0-only
#
@@ -12,11 +13,13 @@
Currently supported:
- NVIDIA GPU (need pynvml lib)
- AMD GPU (no lib needed)
- Intel GPU (need xpu-smi or xpumcli, requires root/sudo rights for utilization)
"""
from glances.globals import to_fahrenheit
from glances.logger import logger
from glances.plugins.gpu.cards.amd import AmdGPU
from glances.plugins.gpu.cards.intel import IntelGPU
from glances.plugins.gpu.cards.nvidia import NvidiaGPU
from glances.plugins.plugin.model import GlancesPluginModel
@@ -90,13 +93,24 @@ class GpuPlugin(GlancesPluginModel):
logger.debug(f'AMD GPU initialization error: {e}')
self.amd = None
# Init the Intel GPU API
try:
self.intel = IntelGPU(config=config)
except Exception as e:
logger.debug(f'Intel GPU initialization error: {e}')
self.intel = None
# We want to display the stat in the curse interface
self.display_curse = True
def exit(self):
    """Overwrite the exit method to close the GPU API."""
    # A backend attribute is set to None when its initialisation failed, so
    # every backend is checked before being closed (and closed only once).
    for card in (self.nvidia, self.amd, self.intel):
        if card:
            card.exit()

    # Call the father exit method
    super().exit()
@@ -117,6 +131,8 @@ class GpuPlugin(GlancesPluginModel):
stats.extend(self.nvidia.get_device_stats())
if self.amd:
stats.extend(self.amd.get_device_stats())
if self.intel:
stats.extend(self.intel.get_device_stats())
# !!!
# Uncomment to test on computer without Nvidia GPU

View File

@@ -0,0 +1,281 @@
#
# This file is part of Glances.
#
# Intel GPU support added (poorly) 2025 by <computerdork@verion.net>
#
# SPDX-License-Identifier: LGPL-3.0-only
#
"""Intel GPU card for Glances."""
import glob
import json
import os
import re
import subprocess
import time
from collections import defaultdict
from glances.logger import logger
class IntelGPU:
    """Intel GPU card (Arc, Xe) using xpu-smi/xpumcli + fdinfo.

    Memory utilization and temperature are read from the JSON output of the
    ``xpu-smi`` (or older ``xpumcli``) command line tool.  Engine utilization
    is derived from the ``drm-cycles-*`` / ``drm-total-cycles-*`` counters
    found in ``/proc/<pid>/fdinfo/<fd>`` by comparing two successive samples.
    """

    # fdinfo keys parsed from /proc/<pid>/fdinfo/<fd>; compiled once because
    # they are applied to every file descriptor of every process.
    _PDEV_RE = re.compile(r'drm-pdev:\s*([0-9a-f:\.]+)')
    _CLIENT_RE = re.compile(r'drm-client-id:\s*(\d+)')
    _CYCLES_RE = re.compile(r'drm-cycles-(\w+):\s+(\d+)')
    _TOTAL_CYCLES_RE = re.compile(r'drm-total-cycles-(\w+):\s+(\d+)')

    # Two samples closer than this (in seconds) are not compared.
    _MIN_SAMPLE_INTERVAL = 0.1

    # Name reported when the tool gives nothing usable.
    _DEFAULT_NAME = 'Intel GPU'

    def __init__(self, config=None):
        """Init Intel GPU detection.

        :param config: Glances configuration object (or None).  Only the
            ``ignore_devices`` key of the ``gpu`` section is read.

        After construction ``self.ready`` is True only when a command line
        tool was found and it reported at least one device.
        """
        self.ready = False
        self.device_count = 0
        # Device IDs as reported by discovery, in discovery order
        self.device_ids = []
        # device_id -> display name, filled once so that the discovery
        # command is not executed again on every refresh
        self.device_names = {}
        # PCI address (lower case) -> device_id
        self.pci_to_id = {}
        # PCI address -> {'cycles': counters, 'time': timestamp} of last sample
        self.fdinfo_last = {}
        self.config = config

        self.ignore_devices = self._load_ignore_devices(config)

        # Detect which command is available: xpu-smi (newer) or xpumcli (older)
        self.xpumcli_cmd = self._find_tool()
        if not self.xpumcli_cmd:
            logger.debug("Neither xpu-smi nor xpumcli found, Intel GPU support disabled")
            return

        devices = self._discover_devices()
        self.device_count = len(devices)
        for index, device in enumerate(devices):
            device_id = device.get('device_id')
            pci_addr = str(device.get('pci_bdf_address') or '').lower()
            if device_id is not None and pci_addr:
                self.pci_to_id[pci_addr] = device_id
            if device_id is None:
                # No ID reported: fall back to the position in the list
                device_id = index
            self.device_ids.append(device_id)
            self.device_names[device_id] = self._clean_device_name(device)

        if self.device_count > 0:
            self.ready = True
            logger.debug(f"Intel GPU support initialized: {self.device_count} device(s)")

    def _load_ignore_devices(self, config):
        """Return the set of device IDs listed in the gpu/ignore_devices option."""
        if not config:
            return set()
        try:
            raw = config.get_value('gpu', 'ignore_devices', default='')
        except Exception as e:
            # The configuration object is project-specific; any failure to
            # read the option simply means that no device is ignored.
            logger.debug(f"Error parsing ignore_devices: {e}")
            return set()
        ids, rejected = self._parse_device_ids(raw)
        if rejected:
            logger.debug(f"Invalid ignore_devices entries skipped: {rejected}")
        if ids:
            logger.debug(f"Intel GPU ignoring devices: {ids}")
        return ids

    @staticmethod
    def _parse_device_ids(raw):
        """Split a comma-separated list of integers.

        :return: tuple ``(ids, rejected)`` where ``ids`` is the set of parsed
            integers and ``rejected`` the list of tokens that are not integers.
            Empty tokens are silently dropped.
        """
        ids = set()
        rejected = []
        if not raw:
            return ids, rejected
        for token in str(raw).split(','):
            token = token.strip()
            if not token:
                continue
            try:
                ids.add(int(token))
            except ValueError:
                rejected.append(token)
        return ids, rejected

    @staticmethod
    def _find_tool():
        """Return the first working command among xpu-smi and xpumcli, or None."""
        for cmd in ('xpu-smi', 'xpumcli'):
            try:
                result = subprocess.run([cmd, '--version'], capture_output=True, timeout=2)
            except (subprocess.TimeoutExpired, OSError):
                # OSError covers a missing binary as well as a non-executable one
                continue
            if result.returncode == 0:
                logger.debug(f"Found Intel GPU tool: {cmd}")
                return cmd
        return None

    def _run_json(self, *args):
        """Run the detected tool with ``args`` and return its JSON output.

        :return: the decoded JSON object as a dict, or None when the command
            fails, times out, or does not print a JSON object.
        """
        try:
            result = subprocess.run(
                [self.xpumcli_cmd, *args],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (subprocess.SubprocessError, OSError) as e:
            logger.debug(f"Intel GPU command {args} failed: {e}")
            return None
        if result.returncode != 0:
            return None
        try:
            data = json.loads(result.stdout)
        except ValueError as e:
            logger.debug(f"Intel GPU command {args} returned invalid JSON: {e}")
            return None
        return data if isinstance(data, dict) else None

    def _discover_devices(self):
        """Return the list of device description dicts given by 'discovery -j'."""
        data = self._run_json('discovery', '-j')
        if data is None:
            return []
        devices = data.get('device_list')
        if not isinstance(devices, list):
            return []
        return [device for device in devices if isinstance(device, dict)]

    def get_device_stats(self):
        """Get Intel GPU stats.

        :return: list of dicts (one per non-ignored device that answered)
            with the keys ``key``, ``gpu_id``, ``name``, ``mem``, ``proc``,
            ``temperature`` and ``fan_speed``.  Empty list when not ready.
        """
        if not self.ready:
            return []

        # Get GPU utilization from fdinfo
        intel_util = self._get_fdinfo_utilization()

        stats = []
        for xpu_device_id in self.device_ids:
            # Skip ignored devices
            if xpu_device_id in self.ignore_devices:
                logger.debug(f"Skipping ignored Intel GPU device {xpu_device_id}")
                continue

            data = self._run_json('stats', '-j', '-d', str(xpu_device_id))
            if data is None:
                continue
            device_level = data.get('device_level')

            stats.append(
                {
                    'key': 'gpu_id',
                    'gpu_id': f'intel{xpu_device_id}',
                    'name': self._get_device_name(xpu_device_id),
                    # Missing, non numeric or non positive values become None
                    'mem': self._positive_or_none(
                        self._extract_metric(device_level, 'XPUM_STATS_MEMORY_UTILIZATION')
                    ),
                    'proc': intel_util.get(xpu_device_id, 0.0),
                    # NOTE(review): this is the memory temperature metric, not a
                    # GPU core one - confirm this is the intended sensor.
                    'temperature': self._positive_or_none(
                        self._extract_metric(device_level, 'XPUM_STATS_MEMORY_TEMPERATURE')
                    ),
                    'fan_speed': None,  # Not available
                }
            )

        return stats

    @staticmethod
    def _positive_or_none(value):
        """Return ``value`` if it is a number greater than zero, else None."""
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        return value if value > 0 else None

    @classmethod
    def _clean_device_name(cls, device):
        """Build a short display name from one discovery device entry.

        The 'Intel(R) ' and 'Graphics ' fragments are removed.  When nothing
        meaningful remains, the PCI device ID (without its '0x' prefix) is
        used, and 'Intel GPU' as a last resort.
        """
        name = device.get('device_name', cls._DEFAULT_NAME)
        if not isinstance(name, str):
            name = ''
        name = name.replace('Intel(R) ', '').replace('Graphics ', '')
        if not name or name == 'Graphics':
            # Fallback to PCI device ID
            pci_id = str(device.get('pci_device_id') or '')
            name = pci_id[2:] if pci_id.startswith('0x') else cls._DEFAULT_NAME
        return name

    def _get_device_name(self, device_id):
        """Get Intel GPU device name.

        The name cached at initialisation is returned when available;
        otherwise discovery is queried once more and a match is cached.
        """
        name = self.device_names.get(device_id)
        if name is not None:
            return name
        for device in self._discover_devices():
            if device.get('device_id') == device_id:
                name = self._clean_device_name(device)
                self.device_names[device_id] = name
                return name
        return self._DEFAULT_NAME

    def _extract_metric(self, device_level, metric_type):
        """Extract metric from xpumcli device_level array.

        :return: the ``value`` of the first entry whose ``metrics_type`` is
            ``metric_type``; 0 when there is no such entry.
        """
        if not isinstance(device_level, list):
            return 0
        for metric in device_level:
            if isinstance(metric, dict) and metric.get('metrics_type') == metric_type:
                return metric.get('value', 0)
        return 0

    @classmethod
    def _parse_fdinfo(cls, content):
        """Parse the text of one fdinfo file.

        :return: None when the text has no ``drm-pdev`` key or no
            ``drm-cycles-`` counter; otherwise a tuple
            ``(pci_addr, client_id, cycles, totals)`` where ``client_id`` is
            the ``drm-client-id`` string (or None) and ``cycles`` / ``totals``
            map an engine name to its integer counter.
        """
        pci_match = cls._PDEV_RE.search(content)
        if not pci_match or 'drm-cycles-' not in content:
            return None
        client_match = cls._CLIENT_RE.search(content)
        client_id = client_match.group(1) if client_match else None
        cycles = {m.group(1): int(m.group(2)) for m in cls._CYCLES_RE.finditer(content)}
        totals = {m.group(1): int(m.group(2)) for m in cls._TOTAL_CYCLES_RE.finditer(content)}
        return pci_match.group(1).lower(), client_id, cycles, totals

    def _collect_fdinfo_counters(self):
        """Scan /proc/*/fdinfo/* and aggregate the cycle counters per device.

        :return: dict ``{pci_addr: {'<engine>_cycles': sum, '<engine>_total': max}}``
            restricted to the PCI addresses present in ``self.pci_to_id``.
        """
        counters = defaultdict(lambda: defaultdict(int))
        # The same DRM client can be reachable through several file
        # descriptors (dup, fd passing); count each client only once.
        seen_clients = set()

        for proc_dir in glob.glob('/proc/[0-9]*'):
            fdinfo_dir = os.path.join(proc_dir, 'fdinfo')
            try:
                fd_names = os.listdir(fdinfo_dir)
            except OSError:
                # Process gone or not readable by the current user
                continue

            for fd_name in fd_names:
                try:
                    with open(os.path.join(fdinfo_dir, fd_name)) as f:
                        content = f.read()
                except (OSError, ValueError):
                    # ValueError covers undecodable content
                    continue

                parsed = self._parse_fdinfo(content)
                if parsed is None:
                    continue
                pci_addr, client_id, cycles, totals = parsed

                # Only process Intel GPUs we know about
                if pci_addr not in self.pci_to_id:
                    continue

                if client_id is not None:
                    client_key = (pci_addr, client_id)
                    if client_key in seen_clients:
                        continue
                    seen_clients.add(client_key)

                device_counters = counters[pci_addr]
                for engine, value in cycles.items():
                    device_counters[engine + '_cycles'] += value
                for engine, value in totals.items():
                    key = engine + '_total'
                    device_counters[key] = max(device_counters[key], value)

        return counters

    @staticmethod
    def _max_engine_utilization(current, previous):
        """Return the busiest engine utilization (percent) between two samples.

        Both arguments map ``'<engine>_cycles'`` and ``'<engine>_total'`` to
        counters.  Engines whose total counter did not increase are ignored
        and the result is clamped to the 0.0-100.0 range.
        """
        suffix = '_total'
        busiest = 0.0
        for key, curr_total in current.items():
            if not key.endswith(suffix):
                continue
            cycles_key = key[: -len(suffix)] + '_cycles'
            delta_total = curr_total - previous.get(key, 0)
            if delta_total <= 0:
                continue
            delta_cycles = current.get(cycles_key, 0) - previous.get(cycles_key, 0)
            busiest = max(busiest, (delta_cycles / delta_total) * 100.0)
        return min(100.0, max(0.0, busiest))

    def _get_fdinfo_utilization(self):
        """Get Intel GPU utilization from /proc/*/fdinfo/*.

        :return: dict of ``{device_id: utilization_percent}``.  A device
            reports 0.0 on its first sample, when the previous sample is
            too recent, or when no process uses it.

        Only the processes readable by the current user are taken into
        account (root is needed to see all of them).
        """
        # Start with 0% for every known device
        utilization = {device_id: 0.0 for device_id in self.device_ids}
        if not self.pci_to_id:
            # No PCI address to match: scanning /proc would be pointless
            return utilization

        # Monotonic clock: only the interval between two samples matters
        current_time = time.monotonic()

        for pci_addr, cycles in self._collect_fdinfo_counters().items():
            device_id = self.pci_to_id.get(pci_addr)
            if device_id is None:
                continue

            last = self.fdinfo_last.get(pci_addr)
            if last is None:
                # First measurement - store baseline
                self.fdinfo_last[pci_addr] = {'cycles': dict(cycles), 'time': current_time}
                utilization[device_id] = 0.0
                continue

            if current_time - last['time'] < self._MIN_SAMPLE_INTERVAL:
                # Keep the previous baseline and wait for a longer interval
                utilization[device_id] = 0.0
                continue

            utilization[device_id] = self._max_engine_utilization(cycles, last['cycles'])

            # Update last measurement
            self.fdinfo_last[pci_addr] = {'cycles': dict(cycles), 'time': current_time}

        return utilization

    def exit(self):
        """Cleanup: nothing to release, no handle is kept open between calls."""