当前位置：首页 > news >正文

GPU机器-显卡占用

news 2025/11/2 17:41:21

GPU机器-显卡占用

背景

不论是在读研究生、实习生还是正式入职的算法同学，都经常遇到想跑训练或推理任务却找不到卡的情况；即使拿到了卡，还需要重新安装环境，比较麻烦。因此，许多同学在使用公司内部云平台的GPU开发机时，会在任务跑完之后再运行一些无实际用途的任务来占用GPU，防止机器被kill。

下面提供一个通用的GPU占用脚本，方便大家占卡；但如果长时间不用，最好还是释放资源，留给其他同学做实验。

代码

import torch
import torch.nn as nn
import argparse
import time
import threading
import os
from datetime import datetime


class GPUOccupier:
    """Keep one GPU looking busy by holding memory and running matmuls.

    A single large tensor pins the requested share of device memory, and a
    background thread alternates matrix multiplications with sleeps so that
    roughly ``compute_fraction`` of wall-clock time is spent computing.
    """

    # Size in bytes of one float32 element of the placeholder tensor.
    _BYTES_PER_ELEMENT = 4

    def __init__(self, gpu_id, memory_fraction=0.8, compute_fraction=0.7):
        """Create an occupier; nothing is allocated until ``start()``.

        Args:
            gpu_id: CUDA device index.
            memory_fraction: Share of total device memory to hold (0-1).
            compute_fraction: Share of time to spend computing (0-1).
                A value <= 0 disables the compute load entirely.
        """
        self.gpu_id = gpu_id
        self.device = torch.device(f'cuda:{gpu_id}')
        self.memory_fraction = memory_fraction
        self.compute_fraction = compute_fraction
        self.running = False
        self.memory_holder = None
        self.compute_thread = None

    def get_gpu_memory_info(self):
        """Return ``(total, allocated_by_this_process, free)`` in bytes.

        ``free`` is the device-wide free memory when the installed torch
        exposes ``torch.cuda.mem_get_info``, so memory used by other
        processes is accounted for. Otherwise it falls back to
        ``total - allocated``, which only knows about this process.
        """
        torch.cuda.set_device(self.device)
        total_memory = torch.cuda.get_device_properties(self.device).total_memory
        allocated_memory = torch.cuda.memory_allocated(self.device)
        mem_get_info = getattr(torch.cuda, 'mem_get_info', None)
        if mem_get_info is not None:
            free_memory, _ = mem_get_info(self.device)
        else:
            free_memory = total_memory - allocated_memory
        return total_memory, allocated_memory, free_memory

    def _reserve(self, num_bytes):
        """Allocate a float32 tensor of about ``num_bytes`` on the device."""
        num_elements = num_bytes // self._BYTES_PER_ELEMENT
        return torch.zeros(num_elements, dtype=torch.float32, device=self.device)

    def allocate_memory(self):
        """Pin memory up to ``memory_fraction`` of the device total.

        If the full request cannot be satisfied, retry once with 95% of the
        memory that is actually free. A second failure is reported and leaves
        ``memory_holder`` as ``None`` instead of raising.
        """
        total_memory, current_allocated, _ = self.get_gpu_memory_info()
        need_allocate = int(total_memory * self.memory_fraction) - current_allocated
        if need_allocate <= 0:
            return

        try:
            self.memory_holder = self._reserve(need_allocate)
            print(f"GPU {self.gpu_id}: 成功分配 {need_allocate / 1024**3:.2f} GB 显存")
            return
        except RuntimeError as e:
            print(f"GPU {self.gpu_id}: 显存分配失败: {e}")

        # Re-measure after the failure: the first attempt may have left cached
        # blocks behind, and other processes may have changed the free amount.
        torch.cuda.empty_cache()
        _, _, free_memory = self.get_gpu_memory_info()
        available = int(free_memory * 0.95)  # keep a 5% safety margin
        if available < self._BYTES_PER_ELEMENT:
            return
        try:
            self.memory_holder = self._reserve(available)
            print(f"GPU {self.gpu_id}: 实际分配 {available / 1024**3:.2f} GB 显存")
        except RuntimeError as e:
            self.memory_holder = None
            print(f"GPU {self.gpu_id}: 显存分配失败: {e}")

    def _idle_seconds(self, busy_seconds):
        """Return the sleep time that follows ``busy_seconds`` of compute.

        With compute_fraction=0.7 the loop works 70% of the time and rests
        30%, i.e. sleeps ``busy * (1 - f) / f``. A fraction >= 1 means no
        rest; a fraction <= 0 is handled by the caller and yields 0 here to
        avoid dividing by zero.
        """
        fraction = self.compute_fraction
        if fraction >= 1.0 or fraction <= 0.0:
            return 0.0
        return busy_seconds * (1 - fraction) / fraction

    def compute_task(self):
        """Run matmuls in a duty cycle until ``self.running`` turns False."""
        if self.compute_fraction <= 0:
            # No compute load was requested; holding memory is enough.
            return

        torch.cuda.set_device(self.device)
        # Fixed operands that are multiplied over and over.
        size = 4096
        a = torch.randn(size, size, device=self.device, dtype=torch.float32)
        b = torch.randn(size, size, device=self.device, dtype=torch.float32)

        while self.running:
            start_time = time.time()
            torch.matmul(a, b)
            # Wait for the kernel so the measured time is real GPU time.
            torch.cuda.synchronize(self.device)
            idle = self._idle_seconds(time.time() - start_time)
            if idle > 0:
                time.sleep(idle)

    def start(self):
        """Allocate the memory share and launch the compute thread."""
        self.running = True
        self.allocate_memory()
        self.compute_thread = threading.Thread(target=self.compute_task)
        self.compute_thread.start()
        print(f"GPU {self.gpu_id}: 占用已启动 (显存: {self.memory_fraction*100}%, 计算: {self.compute_fraction*100}%)")

    def stop(self):
        """Stop the compute thread and release the held memory.

        Safe to call repeatedly, and on an occupier that never started.
        """
        self.running = False
        thread = getattr(self, 'compute_thread', None)
        if thread is not None:
            thread.join()
            self.compute_thread = None
        # Rebind instead of ``del`` so the attribute survives a second stop().
        self.memory_holder = None
        torch.cuda.empty_cache()
        print(f"GPU {self.gpu_id}: 占用已停止")

    def get_status(self):
        """Return a dict describing this process's memory use on the GPU."""
        total_memory, allocated_memory, free_memory = self.get_gpu_memory_info()
        memory_usage = allocated_memory / total_memory * 100
        return {
            'gpu_id': self.gpu_id,
            'total_memory_gb': total_memory / 1024**3,
            'allocated_memory_gb': allocated_memory / 1024**3,
            'free_memory_gb': free_memory / 1024**3,
            'memory_usage_percent': memory_usage,
        }


def main(argv=None):
    """Command-line entry point: occupy the selected GPUs until stopped.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.
    """
    parser = argparse.ArgumentParser(description='GPU占用程序')
    parser.add_argument('--gpus', type=str, default='all', help='要占用的GPU编号，如 "0,1,2" 或 "all" 占用所有GPU')
    parser.add_argument('--memory', type=float, default=0.8, help='显存占用比例 (0-1), 默认0.8')
    parser.add_argument('--compute', type=float, default=0.7, help='计算占用比例 (0-1), 默认0.7')
    parser.add_argument('--duration', type=int, default=0, help='运行时长(秒), 0表示持续运行直到手动停止')
    args = parser.parse_args(argv)

    # Work out which GPUs were requested.
    num_gpus = torch.cuda.device_count()
    if args.gpus == 'all':
        gpu_ids = list(range(num_gpus))
    else:
        tokens = [token.strip() for token in args.gpus.split(',')]
        try:
            # Skip empty tokens so a trailing comma is harmless.
            gpu_ids = [int(token) for token in tokens if token]
        except ValueError:
            parser.error(f"无效的GPU编号: {args.gpus!r}")
        # Drop duplicates but keep the order given by the user.
        gpu_ids = list(dict.fromkeys(gpu_ids))

    if not gpu_ids:
        print("错误: 没有可用的GPU")
        return

    print(f"检测到 {num_gpus} 个GPU")
    print(f"将占用GPU: {gpu_ids}")
    print(f"显存占用目标: {args.memory * 100}%")
    print(f"计算占用目标: {args.compute * 100}%")
    print("-" * 50)

    # Build one occupier per valid GPU id.
    occupiers = []
    for gpu_id in gpu_ids:
        if not 0 <= gpu_id < num_gpus:
            print(f"警告: GPU {gpu_id} 不存在，跳过")
            continue
        occupiers.append(GPUOccupier(gpu_id, args.memory, args.compute))

    if not occupiers:
        # Every requested id was invalid; do not fall into the status loop.
        print("错误: 没有可用的GPU")
        return

    status_interval = 10  # seconds between status reports
    try:
        for occupier in occupiers:
            occupier.start()

        print("\n占用已启动，按 Ctrl+C 停止程序")
        print("-" * 50)

        start_time = time.monotonic()
        while True:
            if args.duration > 0:
                remaining = args.duration - (time.monotonic() - start_time)
                if remaining <= 0:
                    print(f"\n已运行 {args.duration} 秒，正在停止...")
                    break
                # Do not sleep past the requested duration.
                time.sleep(min(status_interval, remaining))
            else:
                time.sleep(status_interval)

            print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] GPU状态:")
            for occupier in occupiers:
                status = occupier.get_status()
                print(f"  GPU {status['gpu_id']}: "
                      f"显存使用 {status['allocated_memory_gb']:.2f}/{status['total_memory_gb']:.2f} GB "
                      f"({status['memory_usage_percent']:.1f}%)")
    except KeyboardInterrupt:
        print("\n\n接收到停止信号，正在清理...")
    finally:
        # Always join the worker threads, otherwise they keep the process alive.
        for occupier in occupiers:
            occupier.stop()

    print("\n程序已退出")


if __name__ == "__main__":
    main()

执行命令

vi gpus_occupy.py
python gpus_occupy.py --gpus all --memory 0.7 --compute 0.7

查看全文

http://www.dtcms.com/a/553765.html

网站关键词抓取id wordpress

学校网站建设措施西地那非片能延时多久每次吃多少

Adobe Lightroom Classic 2026 v15.0 更新详解：AI加持下的全新摄影工作流

蚂蚁S19j XP 117T矿机技术分析：适合BTC与BCH挖矿的高效选择

单元测试、集成测试和系统测试的联系和区别是什么？

做旅行社的都是在哪网站拿票办一个网站要多少钱

青岛网站推广方案查看网站是否做百度推广

IDEA多实例项目启动模拟负载均衡

maven进阶了解

Android 嵌入h5顶部状态栏空白

网页制作可以用手机吗江门网站排名优化

营销型网站核心要素有哪些网站建设费税率是多少钱

IDEA的安装与设置

Fuzzing 工具来一波

10.31

网站后台更新前台不显示金蝶财务软件

Spring Boot项目的核心依赖

ollama本地化部署deepseek/大模型及其api流式调用

向华为学习——53页华为制造行业数字化转型工业互联网智能制造解决方案【附全文阅读】

基于电鱼 ARM 工控机的煤矿主控系统高可靠运行方案——让井下控制系统告别“死机与重启”

顶尖网站建设国内大宗商品交易平台有哪些

用langchain搭建简单agent

在 Windows 11 中安装 VirtualBox 7.2.4

【开题答辩全过程】以基于Java的社交健身系统的设计与实现为例，包含答辩的问题和答案

Ubuntu20.04升级autoconf

网站名注册最佳商城ui网站设计

R包kuenm和ENMeval--你用对了吗？

接口自动化测试项目框架详解

临安网站建设海口网红

10.string(上)

GPU机器-显卡占用

背景

代码

相关文章：