当前位置: 首页 > news >正文

一个简单的GPU压力测试脚本-python版

import torch
import time
import threading
import argparse
import math
import random


class SimpleGPUStress:
    """Simple GPU stress tester that keeps utilization near a target level.

    Spawns one worker thread per GPU that performs matrix multiplications,
    modulating the duty cycle with a sine wave so the load fluctuates, plus
    a monitor thread that periodically prints memory statistics.
    """

    def __init__(self, gpu_ids=None, target_usage=80, fluctuation=20,
                 memory_limit=85, matrix_size=2048):
        """Initialize the stress tester.

        Args:
            gpu_ids: list of GPU ids such as [0, 1], or None to use all GPUs.
            target_usage: target utilization percentage (default 80).
            fluctuation: fluctuation range in percent (default ±20).
            memory_limit: GPU memory usage cap in percent (default 85).
            matrix_size: side length of the work matrices (default 2048).

        Raises:
            RuntimeError: if CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA不可用")

        # Default to every visible GPU.
        if gpu_ids is None:
            self.gpu_ids = list(range(torch.cuda.device_count()))
        else:
            self.gpu_ids = gpu_ids

        # Store percentages internally as fractions in [0, 1].
        self.target_usage = target_usage / 100.0
        self.fluctuation = fluctuation / 100.0
        self.memory_limit = memory_limit / 100.0
        self.matrix_size = matrix_size
        self.running = True

        # Record total memory and the capped limit (both in GiB) per GPU.
        self.gpu_memory_info = {}
        for gpu_id in self.gpu_ids:
            total_memory = torch.cuda.get_device_properties(gpu_id).total_memory
            max_memory = total_memory * self.memory_limit
            self.gpu_memory_info[gpu_id] = {
                'total_gb': total_memory / 1024**3,
                'max_gb': max_memory / 1024**3,
            }

        print(f"使用GPU: {self.gpu_ids}")
        print(f"目标使用率: {target_usage}% ±{fluctuation}%")
        print(f"显存限制: {memory_limit}%")
        for gpu_id in self.gpu_ids:
            info = self.gpu_memory_info[gpu_id]
            print(f"GPU {gpu_id}: 总显存 {info['total_gb']:.1f}GB, 限制 {info['max_gb']:.1f}GB")

    def get_current_target(self, start_time):
        """Return the current target intensity (a sine wave around target_usage).

        The wave has a 30-second period; the result is clamped to [0.1, 1.0].
        """
        elapsed = time.time() - start_time
        wave = math.sin(elapsed / 30 * 2 * math.pi)  # 30-second period
        target = self.target_usage + wave * self.fluctuation
        return max(0.1, min(1.0, target))

    def manage_memory(self, gpu_id, memory_pool):
        """Grow or shrink `memory_pool` to steer allocated GPU memory.

        Aims for 80% of the configured memory cap. Allocation failures are
        deliberately swallowed: this is best-effort under memory pressure.
        """
        torch.cuda.set_device(gpu_id)
        current_memory = torch.cuda.memory_allocated(gpu_id) / 1024**3
        target_memory = self.gpu_memory_info[gpu_id]['max_gb'] * 0.8  # aim for 80% of cap

        if current_memory < target_memory and len(memory_pool) < 10:
            # Allocate a random-sized matrix to push memory usage upward.
            try:
                size = random.randint(1024, 3072)
                matrix = torch.randn(size, size, device=f'cuda:{gpu_id}')
                memory_pool.append(matrix)
            except RuntimeError:
                pass  # out of memory: skip this round
        elif current_memory > target_memory and len(memory_pool) > 1:
            # Drop one cached matrix and return its memory to the allocator.
            # (len check above already guarantees the pool is non-empty.)
            memory_pool.pop()
            torch.cuda.empty_cache()

    def worker(self, gpu_id):
        """Per-GPU worker loop: matmul bursts whose duty cycle tracks the target."""
        torch.cuda.set_device(gpu_id)
        device = f'cuda:{gpu_id}'

        # Fixed operand matrices, reused for every multiplication.
        a = torch.randn(self.matrix_size, self.matrix_size, device=device)
        b = torch.randn(self.matrix_size, self.matrix_size, device=device)

        memory_pool = []  # extra allocations managed by manage_memory()
        start_time = time.time()
        iteration = 0

        print(f"GPU {gpu_id} 工作线程启动")

        while self.running:
            try:
                target_intensity = self.get_current_target(start_time)

                # Rebalance memory every 100 iterations.
                if iteration % 100 == 0:
                    self.manage_memory(gpu_id, memory_pool)

                # Probabilistic duty cycle: compute with p = target_intensity.
                if random.random() < target_intensity:
                    c = torch.mm(a, b)
                    # Heavier elementwise work at higher intensity.
                    if target_intensity > 0.7:
                        c = torch.relu(c)
                        c = torch.sigmoid(c)
                        c = torch.tanh(c)
                    elif target_intensity > 0.4:
                        c = torch.relu(c)
                        c = torch.sigmoid(c)
                    else:
                        c = torch.relu(c)
                    torch.cuda.synchronize()

                # Sleep longer when the target intensity is low.
                sleep_time = (1 - target_intensity) * 0.01
                if sleep_time > 0:
                    time.sleep(sleep_time)
                iteration += 1

            except RuntimeError as e:
                if "out of memory" in str(e):
                    # OOM: drop the memory pool and keep going.
                    memory_pool.clear()
                    torch.cuda.empty_cache()
                    print(f"GPU {gpu_id} 显存不足,已清理")
                else:
                    print(f"GPU {gpu_id} 错误: {e}")
                    break

        print(f"GPU {gpu_id} 工作线程退出")

    def monitor(self):
        """Print per-GPU memory statistics every 3 seconds until stopped.

        Note: the original caught KeyboardInterrupt here, but Python delivers
        KeyboardInterrupt only to the main thread, so that handler was dead
        code and has been removed; start() handles Ctrl+C instead.
        """
        start_time = time.time()
        while self.running:
            elapsed = time.time() - start_time
            target = self.get_current_target(start_time)
            print(f"\n时间: {elapsed:.0f}s | 目标强度: {target:.2f}")
            for gpu_id in self.gpu_ids:
                memory_used = torch.cuda.memory_allocated(gpu_id) / 1024**3
                memory_total = self.gpu_memory_info[gpu_id]['total_gb']
                memory_percent = (memory_used / memory_total) * 100
                print(f"GPU {gpu_id}: 显存 {memory_used:.1f}GB/{memory_total:.1f}GB ({memory_percent:.1f}%)")
            time.sleep(3)  # report every 3 seconds

    def start(self):
        """Start worker and monitor threads; block until Ctrl+C, then clean up."""
        print("\n开始GPU压力测试,按Ctrl+C停止...\n")

        threads = []
        try:
            # One worker thread per GPU.
            for gpu_id in self.gpu_ids:
                t = threading.Thread(target=self.worker, args=(gpu_id,))
                t.daemon = True  # don't block interpreter exit
                threads.append(t)
                t.start()

            # Monitoring thread.
            monitor_thread = threading.Thread(target=self.monitor)
            monitor_thread.daemon = True
            monitor_thread.start()

            # Main thread idles; Ctrl+C raises KeyboardInterrupt here.
            while self.running:
                time.sleep(0.1)
        except KeyboardInterrupt:
            print("\n正在停止...")
        finally:
            # Cleanup runs on Ctrl+C *and* on unexpected errors (the original
            # only cleaned up inside the KeyboardInterrupt handler).
            self.running = False
            for t in threads:
                t.join(timeout=1)
            for gpu_id in self.gpu_ids:
                torch.cuda.set_device(gpu_id)
                torch.cuda.empty_cache()
            print("已停止")


def parse_gpu_ids(gpu_str):
    """Parse a GPU id argument: 'all' -> None (use every GPU), '0,1' -> [0, 1]."""
    if gpu_str.lower() == 'all':
        return None
    return [int(x) for x in gpu_str.split(',')]


def main():
    """CLI entry point: parse arguments and run the stress test."""
    parser = argparse.ArgumentParser(description='简洁版GPU压力测试')
    parser.add_argument('--gpu', '-g', type=parse_gpu_ids, default='0',
                        help='GPU ID,如: 0 或 0,1 或 all')
    # BUGFIX: '%' must be escaped as '%%' in argparse help strings — argparse
    # runs them through %-formatting, and a bare '%' made --help raise
    # "ValueError: unsupported format character".
    parser.add_argument('--target', '-t', type=int, default=80,
                        help='目标使用率%% (默认80)')
    parser.add_argument('--fluctuation', '-f', type=int, default=20,
                        help='波动范围%% (默认±20)')
    parser.add_argument('--memory-limit', '-m', type=int, default=85,
                        help='显存使用上限%% (默认85)')
    # BUGFIX: default was 2048 * 2, contradicting both the help text and the
    # class default of 2048.
    parser.add_argument('--matrix-size', '-s', type=int, default=2048,
                        help='矩阵大小 (默认2048)')

    args = parser.parse_args()

    try:
        stress_test = SimpleGPUStress(
            gpu_ids=args.gpu,
            target_usage=args.target,
            fluctuation=args.fluctuation,
            memory_limit=args.memory_limit,
            matrix_size=args.matrix_size,
        )
        stress_test.start()
    except Exception as e:
        # Top-level boundary: report and exit instead of dumping a traceback.
        print(f"错误: {e}")


if __name__ == "__main__":
    main()

文章转载自:

http://NjUBANxv.bdwqy.cn
http://elE9Ov1C.bdwqy.cn
http://Hf6YUS4N.bdwqy.cn
http://IHASoMXb.bdwqy.cn
http://cS9hEjVM.bdwqy.cn
http://c4HXthd5.bdwqy.cn
http://Q16Yr38j.bdwqy.cn
http://5joWZ225.bdwqy.cn
http://4obaf7Li.bdwqy.cn
http://VZHRJkdX.bdwqy.cn
http://cs38Osv7.bdwqy.cn
http://w8OdKEtT.bdwqy.cn
http://2HbNvSNm.bdwqy.cn
http://BZ0eJMUY.bdwqy.cn
http://A19KfwXe.bdwqy.cn
http://7Zb8rlYp.bdwqy.cn
http://lDDTn5gP.bdwqy.cn
http://bPM1zkJZ.bdwqy.cn
http://QbXHnz4C.bdwqy.cn
http://y5uP0dhB.bdwqy.cn
http://XAR56Gpl.bdwqy.cn
http://B5onct1x.bdwqy.cn
http://fH71jWfs.bdwqy.cn
http://GOSnYDXn.bdwqy.cn
http://pHHg1pCp.bdwqy.cn
http://7I2vR0bc.bdwqy.cn
http://hLAXw1Nb.bdwqy.cn
http://MBmux0by.bdwqy.cn
http://GzwnK4NG.bdwqy.cn
http://ev3TSEvq.bdwqy.cn
http://www.dtcms.com/a/381341.html

相关文章:

  • Linux x86 stability和coredump
  • Claude-Flow AI协同开发:从“CTO”到“人机共生体”的AI协同开发
  • CPR_code
  • 【连接器专题】FPC连接器基础及连接器选型指南
  • 精准、可控、高一致性:谷歌Nano Banana正在终结AI“抽卡”时代
  • 操作系统实时性的影响因素总结
  • 国际避税方法有哪些
  • 开发避坑指南(47):IDEA 2025.1.3 运行main函数报错:CreateProcess error=206, 文件名或扩展名太长的解决方案
  • 《苍穹外卖》项目日记_Day9
  • 文件检查与拷贝-简化版
  • 电容式原理检测微小位移的技术方案以及芯片方案
  • 嵌入式系统内存分段核心内容详解
  • AI生成内容检测的综合方法论与技术路径
  • 材料基因组计划(MGI)入门:高通量计算与数据管理最佳实践
  • 系统地总结一下Python中关于“遍历”的知识点
  • Android面试指南(九)
  • Halcon编程指南:符号与元组操作详解
  • 嵌入式第五十二天(GIC,协处理器,异常向量表)
  • 嵌入式学习day48-硬件-imx6ul-key、中断
  • 查找算法和递推算法
  • Webman 微服务集成 RustFS 分布式对象存储
  • 基于51单片机的太阳能锂电池充电路灯
  • 【人工智能通识专栏】第十三讲:图像处理
  • 滚动分页查询-通俗解释
  • 电缆工程量计算-批量测量更轻松
  • UDS NRC速查
  • L2-【英音】地道语音语调--语调
  • 13.渗透-.Linux基础命令(五)-用户管理(修改用户密码)
  • 解决串口数据乱序问题
  • 智能化集成系统(IBMS):构建智慧建筑 “中枢大脑” 的全方案