GPU机器-显卡占用
GPU机器-显卡占用
背景
不论是在读研究生、实习生还是正式的算法同学,经常会遇到想跑训练或推理任务却找不到卡的情况;即使有卡,还需要重新装环境,比较麻烦。所以许多同学在使用公司内部云平台的GPU开发机时,会在跑完任务之后再跑一些无用的任务来占用GPU,防止机器被kill。
下面提供一个通用的GPU占用脚本,方便大家占卡;但如果长时间不用,最好还是释放资源,留给别的同学做实验哈。
代码
import torch
import torch.nn as nn
import argparse
import time
import threading
import os
from datetime import datetime


class GPUOccupier:
    """Keep one GPU busy by holding memory and running matrix multiplications.

    Memory is held by a single large tensor; compute load is produced by a
    background thread that alternates between a matmul and a sleep so that
    roughly ``compute_fraction`` of wall-clock time is spent computing.
    """

    def __init__(self, gpu_id, memory_fraction=0.8, compute_fraction=0.7):
        """Create an occupier for one device (nothing is allocated yet).

        Args:
            gpu_id: CUDA device index.
            memory_fraction: Share of total device memory to hold (0-1).
            compute_fraction: Share of time to spend computing (0-1).
                A value <= 0 disables the compute load entirely.
        """
        self.gpu_id = gpu_id
        self.device = torch.device(f'cuda:{gpu_id}')
        self.memory_fraction = memory_fraction
        self.compute_fraction = compute_fraction
        self.running = False
        self.memory_holder = None
        self.compute_thread = None

    def get_gpu_memory_info(self):
        """Return ``(total, allocated, free)`` memory of the device in bytes.

        ``allocated`` is what this process holds through PyTorch. ``free`` is
        the device-wide free memory when ``torch.cuda.mem_get_info`` is
        available, so memory used by other processes is accounted for; on
        older PyTorch it falls back to ``total - allocated``.
        """
        torch.cuda.set_device(self.device)
        total_memory = torch.cuda.get_device_properties(self.device).total_memory
        allocated_memory = torch.cuda.memory_allocated(self.device)
        if hasattr(torch.cuda, 'mem_get_info'):
            free_memory, _ = torch.cuda.mem_get_info(self.device)
        else:
            free_memory = total_memory - allocated_memory
        return total_memory, allocated_memory, free_memory

    def _reserve(self, num_bytes):
        """Allocate a float32 tensor of about ``num_bytes`` bytes on the device."""
        # Each float32 element takes 4 bytes.
        num_elements = max(int(num_bytes) // 4, 0)
        return torch.zeros(num_elements, dtype=torch.float32, device=self.device)

    def allocate_memory(self):
        """Hold ``memory_fraction`` of the device memory in ``memory_holder``.

        If the full request cannot be satisfied (for example because another
        process already uses part of the card), retry with 95% of the memory
        that is actually free. A failed retry is reported, not raised.
        """
        torch.cuda.set_device(self.device)
        total_memory, current_allocated, _ = self.get_gpu_memory_info()

        target_memory = int(total_memory * self.memory_fraction)
        need_allocate = target_memory - current_allocated
        if need_allocate <= 0:
            return

        try:
            self.memory_holder = self._reserve(need_allocate)
        except RuntimeError as e:
            print(f"GPU {self.gpu_id}: 显存分配失败: {e}")
        else:
            print(f"GPU {self.gpu_id}: 成功分配 {need_allocate / 1024**3:.2f} GB 显存")
            return

        # Retry with what is really free right now, keeping a 5% safety margin.
        torch.cuda.empty_cache()
        _, _, free_memory = self.get_gpu_memory_info()
        available = min(need_allocate, int(free_memory * 0.95))
        try:
            self.memory_holder = self._reserve(available)
        except RuntimeError as e:
            print(f"GPU {self.gpu_id}: 显存分配失败: {e}")
            return
        print(f"GPU {self.gpu_id}: 实际分配 {available / 1024**3:.2f} GB 显存")

    def compute_task(self):
        """Run matmuls until ``running`` becomes False (thread target)."""
        if self.compute_fraction <= 0:
            # Nothing to compute; also avoids dividing by zero below.
            return

        torch.cuda.set_device(self.device)

        # Operands for the busy-work multiplication.
        size = 4096
        a = torch.randn(size, size, device=self.device, dtype=torch.float32)
        b = torch.randn(size, size, device=self.device, dtype=torch.float32)

        while self.running:
            start_time = time.time()

            torch.matmul(a, b)
            # Wait for the kernel so the measured time is the real GPU time.
            torch.cuda.synchronize(self.device)

            compute_time = time.time() - start_time

            # Duty cycle: with compute_fraction=0.7, work 70% of the time and
            # sleep the remaining 30%.
            if self.compute_fraction < 1.0:
                sleep_time = compute_time * (1 - self.compute_fraction) / self.compute_fraction
                time.sleep(sleep_time)

    def start(self):
        """Allocate memory and start the compute thread (no-op if running)."""
        if self.running:
            return
        self.running = True

        try:
            self.allocate_memory()
        except Exception:
            # Do not leave the object claiming to run when setup failed.
            self.running = False
            raise

        self.compute_thread = threading.Thread(target=self.compute_task)
        self.compute_thread.start()

        print(f"GPU {self.gpu_id}: 占用已启动 (显存: {self.memory_fraction*100}%, 计算: {self.compute_fraction*100}%)")

    def stop(self):
        """Stop the compute thread and release held memory; safe to repeat."""
        self.running = False

        thread = self.compute_thread
        if thread is not None:
            thread.join()
            self.compute_thread = None

        if self.memory_holder is not None:
            # Rebind instead of `del` so the attribute survives repeated calls.
            self.memory_holder = None
            torch.cuda.empty_cache()

        print(f"GPU {self.gpu_id}: 占用已停止")

    def get_status(self):
        """Return a dict describing this process's memory use on the device."""
        total_memory, allocated_memory, free_memory = self.get_gpu_memory_info()
        memory_usage = allocated_memory / total_memory * 100

        return {
            'gpu_id': self.gpu_id,
            'total_memory_gb': total_memory / 1024**3,
            'allocated_memory_gb': allocated_memory / 1024**3,
            'free_memory_gb': free_memory / 1024**3,
            'memory_usage_percent': memory_usage,
        }


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='GPU占用程序')
    parser.add_argument('--gpus', type=str, default='all',
                        help='要占用的GPU编号,如 "0,1,2" 或 "all" 占用所有GPU')
    parser.add_argument('--memory', type=float, default=0.8,
                        help='显存占用比例 (0-1), 默认0.8')
    parser.add_argument('--compute', type=float, default=0.7,
                        help='计算占用比例 (0-1), 默认0.7')
    parser.add_argument('--duration', type=int, default=0,
                        help='运行时长(秒), 0表示持续运行直到手动停止')

    args = parser.parse_args(argv)

    # Validate before touching CUDA so bad input fails fast and cleanly.
    if not 0.0 <= args.memory <= 1.0:
        parser.error('--memory 必须在 0 到 1 之间')
    if not 0.0 <= args.compute <= 1.0:
        parser.error('--compute 必须在 0 到 1 之间')

    num_gpus = torch.cuda.device_count()

    # Decide which GPUs to use.
    if args.gpus == 'all':
        gpu_ids = list(range(num_gpus))
    else:
        try:
            # Ignore empty tokens (trailing comma) and drop duplicate ids.
            gpu_ids = list(dict.fromkeys(
                int(token) for token in args.gpus.split(',') if token.strip()
            ))
        except ValueError:
            parser.error('--gpus 必须是 "all" 或逗号分隔的整数, 如 "0,1,2"')

    if not gpu_ids:
        print("错误: 没有可用的GPU")
        return

    print(f"检测到 {num_gpus} 个GPU")
    print(f"将占用GPU: {gpu_ids}")
    print(f"显存占用目标: {args.memory * 100}%")
    print(f"计算占用目标: {args.compute * 100}%")
    print("-" * 50)

    # Build one occupier per valid device index.
    occupiers = []
    for gpu_id in gpu_ids:
        if not 0 <= gpu_id < num_gpus:
            print(f"警告: GPU {gpu_id} 不存在,跳过")
            continue
        occupiers.append(GPUOccupier(gpu_id, args.memory, args.compute))

    if not occupiers:
        # Every requested id was invalid; do not idle forever doing nothing.
        print("错误: 没有可用的GPU")
        return

    status_interval = 10  # seconds between status reports

    try:
        for occupier in occupiers:
            occupier.start()

        print("\n占用已启动,按 Ctrl+C 停止程序")
        print("-" * 50)

        start_time = time.monotonic()
        while True:
            if args.duration > 0:
                remaining = args.duration - (time.monotonic() - start_time)
                if remaining <= 0:
                    print(f"\n已运行 {args.duration} 秒,正在停止...")
                    break
                # Never sleep past the requested duration.
                time.sleep(min(status_interval, remaining))
            else:
                time.sleep(status_interval)

            print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] GPU状态:")
            for occupier in occupiers:
                status = occupier.get_status()
                print(f" GPU {status['gpu_id']}: "
                      f"显存使用 {status['allocated_memory_gb']:.2f}/{status['total_memory_gb']:.2f} GB "
                      f"({status['memory_usage_percent']:.1f}%)")
    except KeyboardInterrupt:
        print("\n\n接收到停止信号,正在清理...")
    finally:
        # Always stop the worker threads, otherwise an unexpected error would
        # leave non-daemon threads running and the process would never exit.
        for occupier in occupiers:
            occupier.stop()

    print("\n程序已退出")


if __name__ == "__main__":
    main()
执行命令
vi gpus_occupy.py
python gpus_occupy.py --gpus all --memory 0.7 --compute 0.7
