GPU test

2026-01-20 10:54:30 +08:00
parent 8463f5a571
commit 8e9de9c858
59 changed files with 18934 additions and 0 deletions


@@ -0,0 +1,244 @@
"""
指标采集模块
"""
import time
import threading
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field
from .utils import setup_logging, calculate_statistics
logger = setup_logging()


@dataclass
class GPUMetrics:
    """A single GPU telemetry sample."""
    gpu_utilization: float = 0.0
    memory_used_mb: float = 0.0
    memory_total_mb: float = 0.0
    memory_utilization: float = 0.0
    temperature: float = 0.0
    power_draw_w: float = 0.0
    sm_clock_mhz: float = 0.0
    memory_clock_mhz: float = 0.0


@dataclass
class PerformanceMetrics:
    """Aggregated inference performance counters."""
    total_frames: int = 0
    total_batches: int = 0
    dropped_frames: int = 0
    latencies_ms: List[float] = field(default_factory=list)
    throughput_fps: float = 0.0

    def get_latency_stats(self) -> Dict[str, float]:
        return calculate_statistics(self.latencies_ms)
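
# Illustrative example: PerformanceMetrics accumulates per-batch latencies,
# and get_latency_stats() delegates to utils.calculate_statistics, which is
# assumed here to return summary values such as mean and percentiles.
#
#   perf = PerformanceMetrics()
#   perf.latencies_ms.extend([8.2, 9.1, 7.5])
#   print(perf.get_latency_stats())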


class GPUMonitor:
    """Samples GPU telemetry on a background thread via NVML."""

    def __init__(self, device_id: int = 0, sample_interval_ms: int = 100):
        self.device_id = device_id
        self.sample_interval_ms = sample_interval_ms
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._samples: List[GPUMetrics] = []
        self._lock = threading.Lock()
        self._init_nvml()

    def _init_nvml(self):
        """Import pynvml lazily so the module still loads without it."""
        self._nvml_available = False
        try:
            import pynvml
            pynvml.nvmlInit()
            self._nvml = pynvml
            self._handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
            name = pynvml.nvmlDeviceGetName(self._handle)
            if isinstance(name, bytes):
                name = name.decode("utf-8")
            memory_info = pynvml.nvmlDeviceGetMemoryInfo(self._handle)
            total_memory_mb = memory_info.total / (1024 * 1024)
            logger.info(f"GPU monitor initialized: {name}, memory: {total_memory_mb:.0f} MB")
            self._nvml_available = True
            self._total_memory_mb = total_memory_mb
        except ImportError:
            logger.warning("pynvml is not installed; GPU monitoring is unavailable")
        except Exception as e:
            logger.warning(f"NVML initialization failed: {e}")

    def _sample(self) -> GPUMetrics:
        if not self._nvml_available:
            return GPUMetrics()
        try:
            util = self._nvml.nvmlDeviceGetUtilizationRates(self._handle)
            gpu_util = util.gpu
            mem_info = self._nvml.nvmlDeviceGetMemoryInfo(self._handle)
            mem_used_mb = mem_info.used / (1024 * 1024)
            mem_total_mb = mem_info.total / (1024 * 1024)
            mem_util = (mem_info.used / mem_info.total) * 100
            # Temperature, power, and clocks are optional: some devices or
            # driver versions do not expose them, so fall back to 0.0.
            try:
                temp = self._nvml.nvmlDeviceGetTemperature(self._handle, self._nvml.NVML_TEMPERATURE_GPU)
            except Exception:
                temp = 0.0
            try:
                power = self._nvml.nvmlDeviceGetPowerUsage(self._handle) / 1000  # milliwatts -> watts
            except Exception:
                power = 0.0
            # SM/memory clocks populate the remaining GPUMetrics fields.
            try:
                sm_clock = self._nvml.nvmlDeviceGetClockInfo(self._handle, self._nvml.NVML_CLOCK_SM)
                mem_clock = self._nvml.nvmlDeviceGetClockInfo(self._handle, self._nvml.NVML_CLOCK_MEM)
            except Exception:
                sm_clock = 0.0
                mem_clock = 0.0
            return GPUMetrics(
                gpu_utilization=gpu_util,
                memory_used_mb=mem_used_mb,
                memory_total_mb=mem_total_mb,
                memory_utilization=mem_util,
                temperature=temp,
                power_draw_w=power,
                sm_clock_mhz=sm_clock,
                memory_clock_mhz=mem_clock,
            )
        except Exception as e:
            logger.warning(f"GPU metrics sampling failed: {e}")
            return GPUMetrics()

    def _monitor_loop(self):
        interval_sec = self.sample_interval_ms / 1000
        while self._running:
            metrics = self._sample()
            with self._lock:
                self._samples.append(metrics)
            time.sleep(interval_sec)

    def start(self):
        if self._running:
            return
        self._running = True
        self._samples.clear()
        # Daemon thread so a forgotten stop() cannot block interpreter exit.
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()
        logger.info("GPU monitoring started")

    def stop(self):
        self._running = False
        if self._thread:
            self._thread.join(timeout=2.0)
        logger.info("GPU monitoring stopped")

    def get_samples(self) -> List[GPUMetrics]:
        with self._lock:
            return list(self._samples)

    def get_statistics(self) -> Dict[str, Any]:
        samples = self.get_samples()
        if not samples:
            return {}
        gpu_utils = [s.gpu_utilization for s in samples]
        mem_utils = [s.memory_utilization for s in samples]
        mem_used = [s.memory_used_mb for s in samples]
        return {
            "gpu_utilization": calculate_statistics(gpu_utils),
            "memory_utilization": calculate_statistics(mem_utils),
            "memory_used_mb": calculate_statistics(mem_used),
            "sample_count": len(samples),
        }

    def clear(self):
        with self._lock:
            self._samples.clear()
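
# Usage sketch (hypothetical): wrap a workload with start()/stop() and read
# back aggregate statistics. Assumes device 0 exists and pynvml is installed;
# run_workload is a placeholder for the code under test.
#
#   monitor = GPUMonitor(device_id=0, sample_interval_ms=100)
#   monitor.start()
#   run_workload()
#   monitor.stop()
#   print(monitor.get_statistics())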


class MetricsCollector:
    """Collects inference performance counters alongside GPU telemetry."""

    def __init__(self, device_id: int = 0, sample_interval_ms: int = 100):
        self.device_id = device_id
        self.sample_interval_ms = sample_interval_ms
        self.gpu_monitor = GPUMonitor(device_id, sample_interval_ms)
        self._perf_metrics = PerformanceMetrics()
        self._lock = threading.Lock()
        self._start_time: Optional[float] = None
        self._end_time: Optional[float] = None

    def start(self):
        self._start_time = time.time()
        self._perf_metrics = PerformanceMetrics()
        self.gpu_monitor.start()

    def stop(self):
        self._end_time = time.time()
        self.gpu_monitor.stop()

    def record_inference(self, latency_ms: float, batch_size: int):
        with self._lock:
            self._perf_metrics.latencies_ms.append(latency_ms)
            self._perf_metrics.total_batches += 1
            self._perf_metrics.total_frames += batch_size

    def record_frame_drop(self, count: int = 1):
        with self._lock:
            self._perf_metrics.dropped_frames += count

    def get_gpu_metrics(self) -> Dict[str, Any]:
        return self.gpu_monitor.get_statistics()

    def get_throughput_metrics(self) -> Dict[str, Any]:
        # Hold the lock for the whole computation so the counters and
        # latencies come from a consistent snapshot.
        with self._lock:
            perf = self._perf_metrics
            if self._start_time and self._end_time:
                duration = self._end_time - self._start_time
            elif self._start_time:
                duration = time.time() - self._start_time
            else:
                duration = 0.0
            throughput_fps = perf.total_frames / duration if duration > 0 else 0.0
            total_expected = perf.total_frames + perf.dropped_frames
            drop_rate = perf.dropped_frames / total_expected * 100 if total_expected > 0 else 0.0
            latency_stats = perf.get_latency_stats()
            return {
                "total_frames": perf.total_frames,
                "total_batches": perf.total_batches,
                "dropped_frames": perf.dropped_frames,
                "duration_sec": duration,
                "throughput_fps": throughput_fps,
                "frame_drop_rate": drop_rate,
                "latency": latency_stats,
            }

    def get_all_metrics(self) -> Dict[str, Any]:
        return {
            "gpu": self.get_gpu_metrics(),
            "throughput": self.get_throughput_metrics(),
        }

    def reset(self):
        with self._lock:
            self._perf_metrics = PerformanceMetrics()
            self.gpu_monitor.clear()
            self._start_time = None
            self._end_time = None
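

if __name__ == "__main__":
    # Minimal self-test sketch, not part of the module's public API: it feeds
    # MetricsCollector simulated inference latencies, so the reported numbers
    # are illustrative only. Because of the relative import above, run it as
    # a module (python -m <package>.metrics) rather than as a script.
    import random

    collector = MetricsCollector(device_id=0, sample_interval_ms=100)
    collector.start()
    for _ in range(20):
        simulated_latency_ms = random.uniform(5.0, 15.0)  # hypothetical latencies
        time.sleep(simulated_latency_ms / 1000)
        collector.record_inference(simulated_latency_ms, batch_size=8)
    collector.stop()
    print(collector.get_all_metrics())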