GPU Cluster PoC Test
I. Automated Test Script (Core Test Items Combined)
Script name: gpu_cluster_delivery_test.sh
```bash
#!/bin/bash
set -euo pipefail

# ==================== Configuration (adjust to your environment) ====================
TEST_DIR="/tmp/gpu_test"                              # Temporary test directory
LOG_FILE="${TEST_DIR}/gpu_delivery_test.log"          # Test log file
STRESS_DURATION=3600                                  # Per-node stress test duration in seconds (default 1 hour; use 86400 for 24 hours)
GPU_BURN_PATH="${TEST_DIR}/gpu_burn"                  # Path to the gpu-burn full-load test binary
PYTHON_SCRIPT="${TEST_DIR}/memory_bandwidth_test.py"  # Python script for the GPU memory bandwidth test
NETWORK_TARGET_IP="192.168.1.101"                     # Target node IP for cross-node network tests (change for your cluster)
STORAGE_TEST_SIZE="100G"                              # Storage test data size (default 100G; adjust to available disk space)

# ==================== Environment initialization ====================
init_env() {
    # Create the test directory and log file before anything pipes into tee
    mkdir -p ${TEST_DIR}
    touch ${LOG_FILE}
    chmod 777 ${TEST_DIR}
    echo "===== Initializing test environment =====" | tee -a ${LOG_FILE}
    # Install dependencies
    echo "Installing dependency packages..." | tee -a ${LOG_FILE}
    apt update -y && apt install -y stress-ng fio iperf3 python3 python3-pip gcc git lshw >> ${LOG_FILE} 2>&1
    pip3 install torch torchvision numpy pandas --upgrade >> ${LOG_FILE} 2>&1
    # Download and build gpu-burn (GPU full-load test)
    if [ ! -f ${GPU_BURN_PATH} ]; then
        echo "Downloading gpu-burn..." | tee -a ${LOG_FILE}
        git clone https://github.com/wilicc/gpu-burn.git ${TEST_DIR}/gpu-burn-src >> ${LOG_FILE} 2>&1
        (cd ${TEST_DIR}/gpu-burn-src && make) >> ${LOG_FILE} 2>&1
        cp ${TEST_DIR}/gpu-burn-src/gpu_burn ${GPU_BURN_PATH}
        # gpu_burn loads compare.ptx from its working directory, so keep a copy next to the binary
        cp ${TEST_DIR}/gpu-burn-src/compare.ptx ${TEST_DIR}/
    fi
    # Generate the GPU memory bandwidth test script
    cat > ${PYTHON_SCRIPT} << 'EOF'
import torch
import time

def test_memory_bandwidth(device_id=0):
    if not torch.cuda.is_available():
        print("GPU not found!")
        return None, None
    device = torch.device(f"cuda:{device_id}")
    # Tensor size in elements: 4Gi float32 elements = 16 GB per tensor (adjust to GPU memory)
    tensor_size = 1024 * 1024 * 1024 * 4
    # Three tensors of this size must fit in GPU memory (4 bytes per element)
    if torch.cuda.get_device_properties(device).total_memory < tensor_size * 4 * 3:
        tensor_size = 1024 * 1024 * 512 * 4  # 8 GB per tensor
    # Allocate two random tensors
    a = torch.randn(tensor_size, device=device, dtype=torch.float32)
    b = torch.randn(tensor_size, device=device, dtype=torch.float32)
    # Warm-up
    for _ in range(10):
        c = a * b
    torch.cuda.synchronize()
    # Device-to-device write bandwidth
    start_time = time.time()
    for _ in range(100):
        b.copy_(a)
    torch.cuda.synchronize()
    write_time = time.time() - start_time
    write_bandwidth = (tensor_size * 4 * 100) / (write_time * 1024 * 1024 * 1024)  # GB/s
    # Compute + read/write bandwidth (MatMul)
    start_time = time.time()
    for _ in range(50):
        c = torch.matmul(a.view(1024, -1), b.view(-1, 1024))
    torch.cuda.synchronize()
    matmul_time = time.time() - start_time
    matmul_bandwidth = (tensor_size * 4 * 2 * 50) / (matmul_time * 1024 * 1024 * 1024)  # GB/s
    return write_bandwidth, matmul_bandwidth

if __name__ == "__main__":
    num_gpus = torch.cuda.device_count()
    print(f"Found {num_gpus} GPU(s)")
    for gpu_id in range(num_gpus):
        write_bw, matmul_bw = test_memory_bandwidth(gpu_id)
        print(f"GPU {gpu_id}: Write Bandwidth = {write_bw:.2f} GB/s, MatMul Bandwidth = {matmul_bw:.2f} GB/s")
EOF
    echo "Environment initialization complete." | tee -a ${LOG_FILE}
}

# ==================== 1. Basic hardware verification ====================
hardware_check() {
    echo -e "\n===== 1. Basic hardware verification =====" | tee -a ${LOG_FILE}
    # System information
    echo "=== System information ===" | tee -a ${LOG_FILE}
    uname -a >> ${LOG_FILE}
    lsb_release -a >> ${LOG_FILE} 2>&1 || true
    # CPU information
    echo "=== CPU information ===" | tee -a ${LOG_FILE}
    lscpu | grep -E "Model name|CPU\(s\):|Thread\(s\) per core" >> ${LOG_FILE}
    # Memory information
    echo "=== Memory information ===" | tee -a ${LOG_FILE}
    free -h >> ${LOG_FILE}
    dmidecode -t memory | grep -E "Size:|Speed:" >> ${LOG_FILE} 2>&1 || true
    # GPU information (nvidia-smi)
    echo "=== GPU information ===" | tee -a ${LOG_FILE}
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi >> ${LOG_FILE}
        nvidia-smi --query-gpu=name,memory.total,temperature.gpu,power.draw --format=csv,noheader,nounits >> ${LOG_FILE}
    else
        echo "ERROR: nvidia-smi not found! GPU may not be recognized." | tee -a ${LOG_FILE}
        exit 1
    fi
    # Storage information
    echo "=== Storage information ===" | tee -a ${LOG_FILE}
    lsblk >> ${LOG_FILE}
    df -h >> ${LOG_FILE}
    # NIC information
    echo "=== NIC information ===" | tee -a ${LOG_FILE}
    ip addr >> ${LOG_FILE}
    lshw -class network | grep -E "description:|product:|speed:" >> ${LOG_FILE} 2>&1 || true
    echo "Basic hardware verification complete." | tee -a ${LOG_FILE}
}

# ==================== 2. GPU core performance tests ====================
gpu_performance_test() {
    echo -e "\n===== 2. GPU core performance tests =====" | tee -a ${LOG_FILE}
    # 2.1 Single-GPU compute test (simple MatMul compiled with nvcc)
    echo "=== Single-GPU compute test (FP32 MatMul) ===" | tee -a ${LOG_FILE}
    cat > ${TEST_DIR}/matmul_test.cu << 'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Naive FP32 matrix multiplication kernel: c = a * b for n x n matrices
__global__ void matmul(float *a, float *b, float *c, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int k = 0; k < n; k++) {
        sum += a[i * n + k] * b[k * n + j];
    }
    c[i * n + j] = sum;
}

int main() {
    int n = 2048;
    size_t size = n * n * sizeof(float);
    float *h_a, *h_b, *h_c;
    float *d_a, *d_b, *d_c;
    cudaMallocHost(&h_a, size);
    cudaMallocHost(&h_b, size);
    cudaMallocHost(&h_c, size);
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);
    for (int i = 0; i < n * n; i++) {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    dim3 block(32, 32);
    dim3 grid(n / block.x, n / block.y);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matmul<<<grid, block>>>(d_a, d_b, d_c, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    // 2*n^3 FLOPs; ms is in milliseconds, so divide by (ms * 1e9) to get TFLOPS
    float tflops = 2.0f * n * n * n / (ms * 1e9f);
    printf("FP32 MatMul (2048x2048): Time = %.2f ms, TFLOPS = %.2f\n", ms, tflops);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cudaFreeHost(h_a); cudaFreeHost(h_b); cudaFreeHost(h_c);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}
EOF
    if command -v nvcc &> /dev/null; then
        nvcc ${TEST_DIR}/matmul_test.cu -o ${TEST_DIR}/matmul_test >> ${LOG_FILE} 2>&1
        ${TEST_DIR}/matmul_test >> ${LOG_FILE}
    else
        echo "WARNING: nvcc not found, skipping the MatMul compute test" | tee -a ${LOG_FILE}
    fi
    # 2.2 GPU memory bandwidth test (Python script)
    echo "=== GPU memory bandwidth test ===" | tee -a ${LOG_FILE}
    python3 ${PYTHON_SCRIPT} >> ${LOG_FILE}
    # 2.3 Multi-GPU NCCL communication test (requires nccl-tests and NCCL)
    echo "=== Multi-GPU NCCL communication test ===" | tee -a ${LOG_FILE}
    if [ -x ${TEST_DIR}/nccl-tests/build/all_reduce_perf ]; then
        # Use all local GPUs in a single process
        ${TEST_DIR}/nccl-tests/build/all_reduce_perf -b 8M -e 16G -f 2 -g $(nvidia-smi -L | wc -l) >> ${LOG_FILE}
    else
        echo "WARNING: nccl-tests not found, skipping the multi-GPU communication test (build nccl-tests under ${TEST_DIR})" | tee -a ${LOG_FILE}
    fi
    echo "GPU core performance tests complete." | tee -a ${LOG_FILE}
}

# ==================== 3. Network interconnect tests ====================
network_test() {
    echo -e "\n===== 3. Network interconnect tests =====" | tee -a ${LOG_FILE}
    # 3.1 Local NIC bandwidth (iperf3 loopback self-test)
    echo "=== Local NIC bandwidth test (iperf3) ===" | tee -a ${LOG_FILE}
    iperf3 -s -D --logfile ${TEST_DIR}/iperf_server.log
    sleep 2
    iperf3 -c localhost -i 1 -t 30 -P 8 >> ${LOG_FILE}
    pkill iperf3
    # 3.2 Cross-node network test (requires NETWORK_TARGET_IP and an iperf3 server
    #     already running on the target node: iperf3 -s)
    if [ -n "${NETWORK_TARGET_IP}" ] && ping -c 1 ${NETWORK_TARGET_IP} &> /dev/null; then
        echo "=== Cross-node network test (target IP: ${NETWORK_TARGET_IP}) ===" | tee -a ${LOG_FILE}
        iperf3 -c ${NETWORK_TARGET_IP} -i 1 -t 30 -P 8 >> ${LOG_FILE}
    else
        echo "WARNING: target node unreachable, skipping the cross-node network test" | tee -a ${LOG_FILE}
    fi
    echo "Network interconnect tests complete." | tee -a ${LOG_FILE}
}

# ==================== 4. Stability stress tests ====================
stress_test() {
    echo -e "\n===== 4. Stability stress tests (duration: ${STRESS_DURATION}s) =====" | tee -a ${LOG_FILE}
    # 4.1 GPU full-load test (gpu-burn); run from TEST_DIR so gpu_burn finds compare.ptx
    echo "=== GPU full-load test ===" | tee -a ${LOG_FILE}
    cd ${TEST_DIR}
    ${GPU_BURN_PATH} ${STRESS_DURATION} >> ${LOG_FILE} 2>&1 &
    GPU_BURN_PID=$!
    # 4.2 CPU + memory stress test (stress-ng)
    echo "=== CPU + memory stress test ===" | tee -a ${LOG_FILE}
    stress-ng --cpu $(nproc) --vm 4 --vm-bytes 80% --timeout ${STRESS_DURATION} >> ${LOG_FILE} 2>&1 &
    STRESS_NG_PID=$!
    # 4.3 Storage stress test (fio, random read/write)
    echo "=== Storage stress test (random read/write) ===" | tee -a ${LOG_FILE}
    fio --name=storage_stress --directory=${TEST_DIR} --rw=randrw --bs=64k --size=${STORAGE_TEST_SIZE} --numjobs=8 --runtime=${STRESS_DURATION} --time_based --iodepth=32 --direct=1 --group_reporting >> ${LOG_FILE} 2>&1 &
    FIO_PID=$!
    # Wait for all stress jobs to finish
    wait ${GPU_BURN_PID} ${STRESS_NG_PID} ${FIO_PID}
    echo "Stability stress tests complete." | tee -a ${LOG_FILE}
}

# ==================== 5. Software compatibility tests ====================
software_compatibility_test() {
    echo -e "\n===== 5. Software compatibility tests =====" | tee -a ${LOG_FILE}
    # 5.1 PyTorch GPU availability test
    echo "=== PyTorch GPU availability test ===" | tee -a ${LOG_FILE}
    python3 -c "import torch; print(f'PyTorch version: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}'); print(f'GPU count: {torch.cuda.device_count()}')" >> ${LOG_FILE}
    # 5.2 Simple model training test (MNIST)
    echo "=== Simple model training test (MNIST) ===" | tee -a ${LOG_FILE}
    python3 - << 'EOF' >> ${LOG_FILE} 2>&1
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Simple CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = self.conv2(x)
        x = nn.functional.relu(x)
        x = nn.functional.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = nn.functional.log_softmax(x, dim=1)
        return output

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.MNIST('../data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
model = SimpleCNN().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=1.0)
criterion = nn.NLLLoss()

# Train for one epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if batch_idx % 100 == 0:
        print(f'Train Epoch: 1 [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
print("PyTorch MNIST training test succeeded!")
EOF
    echo "Software compatibility tests complete." | tee -a ${LOG_FILE}
}

# ==================== Main ====================
main() {
    mkdir -p ${TEST_DIR}  # ensure the log directory exists before the first tee
    echo "===== GPU server cluster delivery test started ($(date)) =====" | tee -a ${LOG_FILE}
    init_env
    hardware_check
    gpu_performance_test
    network_test
    stress_test
    software_compatibility_test
    echo -e "\n===== Test finished ($(date)) =====" | tee -a ${LOG_FILE}
    echo "Test log saved to: ${LOG_FILE}" | tee -a ${LOG_FILE}
}

# Start the tests
main
```
II. Script Usage Instructions
1. Prerequisites
- Operating system: Ubuntu 20.04/22.04 (the script targets Debian-based systems; on CentOS, replace apt with yum — a sketch of the equivalent commands follows this list).
- NVIDIA GPU driver (version ≥ 525) and CUDA Toolkit (12.0+ recommended) already installed.
- Internet access on the servers (required to download dependency packages and tools).
- Permissions: run as the root user to avoid permission failures.
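A minimal sketch of the equivalent dependency step on a CentOS/RHEL-style node, assuming the packages are available in the configured repositories; the exact package names are assumptions and may differ (stress-ng, for example, typically requires EPEL):

```bash
# Hypothetical yum equivalent of the script's apt-based dependency installation;
# verify package names against your actual repositories before use.
yum install -y epel-release
yum install -y stress-ng fio iperf3 python3 python3-pip gcc git lshw
pip3 install torch torchvision numpy pandas --upgrade
```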
2. Script Modifications (Key Configuration)
Adjust the configuration parameters at the top of the script for your cluster:
```bash
NETWORK_TARGET_IP="192.168.1.101"  # Target node IP for cross-node network tests (required; otherwise the cross-node test is skipped)
STRESS_DURATION=3600               # Stress test duration (default 1 hour; 86400 = 24 hours is recommended for delivery testing)
STORAGE_TEST_SIZE="100G"           # Storage test data size (adjust to free disk space to avoid running out)
```
3. Execution Steps
(1) Single-node test
```bash
# 1. Download the script (or create the file manually and paste in the contents)
wget https://xxx/gpu_cluster_delivery_test.sh  # Replace with the actual script URL, or create the file by hand
chmod +x gpu_cluster_delivery_test.sh

# 2. Run the script in the background so a dropped SSH session does not interrupt the test
nohup ./gpu_cluster_delivery_test.sh &

# 3. Follow the test progress
tail -f /tmp/gpu_test/gpu_delivery_test.log
```
(2) Cluster-wide batch test (with Ansible)
To run the test on every node in the cluster, distribute and launch the script with Ansible:
```bash
# 1. Edit the Ansible inventory (inventory.ini)
[gpu_cluster]
node1 ansible_host=192.168.1.100
node2 ansible_host=192.168.1.101
node3 ansible_host=192.168.1.102

# 2. Distribute the script to all nodes
ansible gpu_cluster -m copy -a "src=./gpu_cluster_delivery_test.sh dest=/root/ mode=0755" -i inventory.ini

# 3. Launch the script on all nodes (redirect output so nohup returns immediately)
ansible gpu_cluster -m shell -a "nohup /root/gpu_cluster_delivery_test.sh > /dev/null 2>&1 &" -i inventory.ini

# 4. Collect the test logs from all nodes
ansible gpu_cluster -m fetch -a "src=/tmp/gpu_test/gpu_delivery_test.log dest=./cluster_test_logs/" -i inventory.ini
```
III. Script Coverage and Additional Notes
1. Covered Test Dimensions
| Test Dimension | Core Test Items | Tools / Methods |
|---|---|---|
| Basic hardware verification | Detection and configuration check of CPU / memory / GPU / storage / NICs | lscpu, nvidia-smi, lsblk |
| GPU performance | Single-GPU compute (MatMul), GPU memory bandwidth, multi-GPU NCCL communication | nvcc, PyTorch, nccl-tests |
| Network interconnect | Local NIC bandwidth, cross-node bandwidth (iperf3) | iperf3 |
| Stability | Full-load stress testing of GPU / CPU / memory / storage | gpu-burn, stress-ng, fio |
| Software compatibility | PyTorch availability, simple model training | PyTorch, MNIST dataset |
2. Tests Not Covered (Add Manually as Needed)
- Cluster-level distributed training (e.g., fine-tuning LLaMA across nodes on 8 GPUs): depends on the customer's actual workload; the script only covers single-node multi-GPU testing.
- InfiniBand network testing: if the cluster uses IB NICs, replace iperf3 with ib_write_bw/ib_write_lat (the script targets Ethernet by default); see the sketch after this list.
- K8s / GPU Operator integration testing: deploy K8s and the GPU Operator manually, then run kubectl describe nodes to verify GPU scheduling.
- MLPerf benchmarking: for a strict comparison against industry baselines, run the MLPerf suite separately (datasets and configuration files must be downloaded in advance).
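A minimal sketch of a point-to-point IB check with the perftest tools (ib_write_bw/ib_write_lat); the device name mlx5_0 and the target IP 192.168.1.101 are placeholders for your environment:

```bash
# Install the perftest suite (package name may differ by distribution)
apt install -y perftest

# On the target node: start the bandwidth server (run one perftest at a time)
ib_write_bw -d mlx5_0

# On the node under test: run the client against the target node's IP
ib_write_bw -d mlx5_0 192.168.1.101

# Repeat with ib_write_lat on both sides to measure latency instead of bandwidth
```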
3. Pass/Fail Criteria
After the script finishes, review /tmp/gpu_test/gpu_delivery_test.log and focus on the following (a quick log-scan sketch follows this list):
- No ERROR entries in the log; WARNING entries may be ignored depending on the scenario (e.g., the multi-GPU test is skipped when NCCL is not installed).
- GPU temperature ≤ 90°C during the stress test, with no thermal throttling or shutdown.
- GPU memory bandwidth / compute performance within 10% of the official specification.
- Cross-node network bandwidth ≥ 90% of the nominal rate (e.g., ≥ 90 Gbps on 100G Ethernet).
- No process crashes or nodes going offline during the stability test.
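A minimal sketch of a quick post-run check, assuming the default log path used by the script; the nvidia-smi query reads live telemetry rather than parsing the log:

```bash
LOG=/tmp/gpu_test/gpu_delivery_test.log

# Any ERROR entry is a hard failure; WARNING entries are listed for manual review
grep -n "ERROR" "${LOG}" && echo "FAIL: errors found in log" || echo "OK: no ERROR entries"
grep -n "WARNING" "${LOG}" || true

# Spot-check GPU temperature and active throttle reasons during/after the stress run
nvidia-smi --query-gpu=index,temperature.gpu,clocks_throttle_reasons.active --format=csv
```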
IV. Extension Suggestions
- Customization: add test items for the customer's workload (e.g., Stable Diffusion image generation, large-model inference latency).
- Report automation: append log-parsing logic to the end of the script to generate an HTML test report (requires pandas and jinja2).
- Failure alerting: integrate email or a DingTalk bot to send an alert automatically when a test fails; a webhook sketch follows this list.
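A minimal sketch of the alerting idea, assuming a DingTalk custom-robot webhook; the access token in DINGTALK_WEBHOOK is a placeholder, and the robot's keyword/signature settings may require adjusting the message content:

```bash
# Hypothetical webhook URL; replace the access_token with the robot's real token
DINGTALK_WEBHOOK="https://oapi.dingtalk.com/robot/send?access_token=xxx"
LOG=/tmp/gpu_test/gpu_delivery_test.log

# Send a text alert only when the log contains ERROR entries
if grep -q "ERROR" "${LOG}"; then
    curl -s "${DINGTALK_WEBHOOK}" \
        -H 'Content-Type: application/json' \
        -d "{\"msgtype\": \"text\", \"text\": {\"content\": \"GPU delivery test FAILED on $(hostname), see ${LOG}\"}}"
fi
```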
