当前位置：首页 > news >正文

基于 Python 的跨平台硬件诊断工具

news 2025/7/19 8:18:30

本文基于 Python 实现了一个名为 DiamondDiagnoser 的诊断类，用于收集系统信息、PyTorch 运行时信息，以及 NVIDIA GPU 的详细状态（包括温度、利用率、显存使用情况等）。

"""
DIAMOND - Deep Insight And Monitoring of Accelerated Neural Devices
跨平台硬件诊断工具 v2.1
"""
import argparse
import json
import os
import platform
import subprocess
import sys
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple

try:
    import torch
    import chardet
    from colorama import Fore, Style, init
except ImportError as e:
    print(f"缺少依赖库: {e}")
    sys.exit(1)

# 初始化颜色输出
init(autoreset=True)


class DiamondDiagnoser:
    """Collect platform, PyTorch and NVIDIA GPU diagnostics into one report.

    The report is a plain dict with the keys ``meta``, ``system``, ``gpu`` and
    ``errors``; ``hardware`` and ``compatibility`` are added by the advanced
    checks when they succeed. Collection is best-effort: failures are appended
    to ``report["errors"]`` instead of being raised.
    """

    # Minimum NVIDIA driver major version required by each CUDA major version.
    _MIN_DRIVER_FOR_CUDA = {13: 580, 12: 525, 11: 450, 10: 410}

    def __init__(self, output_format: str = "text"):
        """Create an empty report.

        Args:
            output_format: ``"text"`` for the colored console report or
                ``"json"`` for a JSON dump (see ``print_report``).
        """
        self.output_format = output_format
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # tzinfo is dropped so the string keeps its "...Z" shape instead of
        # gaining a "+00:00" suffix.
        generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
        self.report = {
            "meta": {
                "tool": "DIAMOND",
                "version": "2.1",
                "generated_at": generated_at
            },
            "system": {},
            "gpu": {},
            "errors": []
        }

    def _log_error(self, context: str, error: Exception):
        """Append a structured error entry to ``report["errors"]``.

        Args:
            context: Short description of the step that failed.
            error: The exception that was caught.
        """
        error_entry = {
            # The trailing "Z" means UTC, so format gmtime() rather than the
            # local time that time.strftime() uses by default.
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "context": context,
            "type": type(error).__name__,
            "message": str(error)
        }
        self.report["errors"].append(error_entry)

    def _run_cmd(self, cmd: List[str], timeout: int = 15) -> Tuple[bool, str]:
        """Run an external command without a shell and capture its output.

        Args:
            cmd: Program and arguments as a list.
            timeout: Seconds to wait before the run is aborted.

        Returns:
            ``(True, output)`` on exit code 0, where ``output`` is the decoded,
            stripped stdout+stderr; otherwise ``(False, message)``. A non-zero
            exit code is reported in the message only; any other failure
            (missing binary, timeout, ...) is also recorded via ``_log_error``.
        """
        try:
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                timeout=timeout,
                check=True
            )

            # chardet only guesses the encoding; decode leniently so that a
            # wrong guess cannot turn a successful command into a failure.
            encoding = chardet.detect(result.stdout)['encoding'] or 'utf-8'
            return True, result.stdout.decode(encoding, errors='replace').strip()
        except subprocess.CalledProcessError as e:
            failed_output = (e.output or b"").decode(errors='replace')
            return False, f"Command failed (code {e.returncode}): {failed_output}"
        except Exception as e:
            self._log_error(f"执行命令 {' '.join(cmd)}", e)
            return False, str(e)

    def _get_platform_info(self):
        """Fill ``report["system"]`` with OS, Python and CUDA environment data."""
        try:
            self.report["system"] = {
                "platform": {
                    "system": platform.system(),
                    "release": platform.release(),
                    "version": platform.version(),
                    "machine": platform.machine()
                },
                "python": {
                    "version": platform.python_version(),
                    "implementation": platform.python_implementation(),
                    "compiler": platform.python_compiler()
                },
                "environment": {
                    "CUDA_HOME": os.getenv("CUDA_HOME"),
                    "CUDA_PATH": os.getenv("CUDA_PATH"),
                    "CUDA_VISIBLE_DEVICES": os.getenv("CUDA_VISIBLE_DEVICES")
                }
            }
        except Exception as e:
            self._log_error("收集系统信息", e)

    def _get_pytorch_info(self):
        """Fill ``report["gpu"]["pytorch"]`` with PyTorch/CUDA device details.

        Every device entry carries its own ``memory`` snapshot. The top-level
        ``memory`` key is kept and holds the sum over all devices, which on a
        single-GPU host equals that device's values.
        """
        try:
            torch_info = {
                "version": torch.__version__,
                "cuda_available": torch.cuda.is_available(),
                "devices": []
            }

            if torch_info["cuda_available"]:
                totals = {"allocated": 0, "reserved": 0, "cached": 0}
                for i in range(torch.cuda.device_count()):
                    prop = torch.cuda.get_device_properties(i)
                    allocated = torch.cuda.memory_allocated(i)
                    reserved = torch.cuda.memory_reserved(i)
                    # Live memory state of this device; "cached" is the part
                    # of the reserved pool that is not currently allocated.
                    memory = {
                        "allocated": allocated,
                        "reserved": reserved,
                        "cached": reserved - allocated
                    }
                    torch_info["devices"].append({
                        "index": i,
                        "name": prop.name,
                        "capability": f"{prop.major}.{prop.minor}",
                        "total_memory": prop.total_memory,
                        "multiprocessors": prop.multi_processor_count,
                        "memory": memory
                    })
                    for key, value in memory.items():
                        totals[key] += value

                if torch_info["devices"]:
                    torch_info["memory"] = totals

            self.report["gpu"]["pytorch"] = torch_info
        except Exception as e:
            self._log_error("获取PyTorch信息", e)

    @staticmethod
    def _format_smi_memory(total_mib: str, used_mib: str) -> Dict[str, str]:
        """Turn the nvidia-smi memory fields into display strings.

        The values are divided by 1024 and labelled "GB", as the report always
        has. Fields that are not integers (for example ``[N/A]``) yield "N/A"
        entries instead of raising.
        """
        try:
            total = int(total_mib)
            used = int(used_mib)
        except ValueError:
            return {"total": "N/A", "used": "N/A", "free": "N/A"}
        return {
            "total": f"{total / 1024:.1f} GB",
            "used": f"{used / 1024:.1f} GB",
            "free": f"{(total - used) / 1024:.1f} GB"
        }

    def _get_nvidia_smi(self):
        """Query ``nvidia-smi`` and fill ``report["gpu"]["nvidia"]``.

        The key is always written, even when the command or the parsing fails,
        so later steps can rely on ``detected``/``count``/``details`` existing.
        """
        gpu_list = []
        try:
            success, output = self._run_cmd(["nvidia-smi",
                                             "--query-gpu=index,name,pci.bus_id,driver_version,temperature.gpu,utilization.gpu,memory.total,memory.used",
                                             "--format=csv,noheader,nounits"])

            if success:
                for line in output.splitlines():
                    fields = [f.strip() for f in line.split(', ')]
                    # Lines without exactly the eight queried columns are skipped.
                    if len(fields) == 8:
                        gpu_list.append({
                            "index": fields[0],
                            "name": fields[1],
                            "pci_bus": fields[2],
                            "driver": fields[3],
                            "temperature": f"{fields[4]}°C",
                            "utilization": f"{fields[5]}%",
                            "memory": self._format_smi_memory(fields[6], fields[7])
                        })
        except Exception as e:
            self._log_error("解析NVIDIA-SMI", e)

        self.report["gpu"]["nvidia"] = {
            "detected": len(gpu_list) > 0,
            "count": len(gpu_list),
            "details": gpu_list
        }

    def _get_advanced_checks(self):
        """Run the optional PCI device scan and driver/CUDA compatibility check."""
        # PCI device scan (Linux only, via lspci)
        try:
            if platform.system() == "Linux":
                success, lspci = self._run_cmd(["lspci", "-nnk"])
                if success:
                    self.report["hardware"] = {
                        "pci_devices": [line.strip() for line in lspci.split('\n') if "VGA" in line or "3D" in line]
                    }
        except Exception as e:
            self._log_error("PCI设备检查", e)

        # Driver compatibility check
        try:
            # The nvidia section may be missing if this runs before
            # _get_nvidia_smi; treat that as "no GPU detected".
            nvidia_detected = self.report["gpu"].get("nvidia", {}).get("detected", False)
            if torch.cuda.is_available() and nvidia_detected:
                cuda_version = torch.version.cuda
                success, driver = self._run_cmd(["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"])
                if success:
                    # nvidia-smi prints one line per GPU; report the first
                    # non-empty one instead of a multi-line string.
                    driver_version = next(
                        (ln.strip() for ln in driver.splitlines() if ln.strip()), ""
                    )
                    self.report["compatibility"] = {
                        "cuda_version": cuda_version,
                        "driver_version": driver_version,
                        "compatible": self._check_driver_compatibility(cuda_version, driver_version)
                    }
        except Exception as e:
            self._log_error("驱动兼容性检查", e)

    def _check_driver_compatibility(self, cuda_ver: str, driver_ver: str) -> bool:
        """Check whether the driver major version is new enough for the CUDA major.

        Args:
            cuda_ver: CUDA version string such as ``"12.1"``.
            driver_ver: Driver version string such as ``"535.104.05"``.

        Returns:
            ``True`` if the driver major meets the minimum listed for the CUDA
            major (CUDA majors not in the table are treated as compatible);
            ``False`` if it does not, or if either version cannot be parsed.
        """
        try:
            # Only the major components are compared.
            cuda_major = int(cuda_ver.split('.')[0])
            driver_major = int(driver_ver.split('.')[0])
        except (AttributeError, IndexError, TypeError, ValueError):
            # e.g. cuda_ver is None on CPU-only builds, or the string is empty.
            return False

        return driver_major >= self._MIN_DRIVER_FOR_CUDA.get(cuda_major, 0)

    def generate_report(self):
        """Run every collection step in order and return the report dict."""
        self._get_platform_info()
        self._get_pytorch_info()
        self._get_nvidia_smi()
        self._get_advanced_checks()
        return self.report

    def _format_memory(self, bytes_val: int) -> str:
        """Format a byte count with the largest binary unit up to TiB."""
        value = bytes_val
        for unit in ['B', 'KiB', 'MiB', 'GiB']:
            if value < 1024:
                return f"{value:.2f} {unit}"
            value /= 1024
        return f"{value:.2f} TiB"

    def print_report(self):
        """Print the report to stdout as JSON or as a colored text summary.

        Sections whose data was never collected (because a step failed or
        ``generate_report`` was not called) are skipped or shown as
        "not available" instead of raising ``KeyError``.
        """
        if self.output_format == "json":
            # ensure_ascii=False keeps the non-ASCII error contexts readable.
            print(json.dumps(self.report, indent=2, ensure_ascii=False))
            return

        # Colored console output
        print(f"\n{Fore.BLUE}=== DIAMOND 硬件诊断报告 ==={Style.RESET_ALL}")
        print(f"{Fore.CYAN}生成时间:{Style.RESET_ALL} {self.report['meta']['generated_at']}")

        # System overview
        print(f"\n{Fore.YELLOW}◆ 系统概览{Style.RESET_ALL}")
        sys_info = self.report['system'].get('platform')
        py_info = self.report['system'].get('python')
        if sys_info:
            print(f"操作系统: {sys_info['system']} {sys_info['release']} ({sys_info['machine']})")
        if py_info:
            print(
                f"Python环境: {py_info['version']} ({py_info['compiler']})")

        # GPU status from nvidia-smi
        print(f"\n{Fore.YELLOW}◆ 加速器状态{Style.RESET_ALL}")
        nvidia_info = self.report["gpu"].get("nvidia", {})
        if nvidia_info.get("detected"):
            for gpu in nvidia_info["details"]:
                print(f"{Fore.GREEN}GPU {gpu['index']}: {gpu['name']}{Style.RESET_ALL}")
                print(f"├─ PCI总线: {gpu['pci_bus']}")
                print(f"├─ 驱动版本: {gpu['driver']}")
                print(f"├─ 温度: {gpu['temperature']}")
                print(f"├─ 利用率: {gpu['utilization']}")
                print(f"└─ 显存: {gpu['memory']['used']} / {gpu['memory']['total']} (剩余 {gpu['memory']['free']})")
        else:
            print(f"{Fore.RED}× 未检测到NVIDIA GPU{Style.RESET_ALL}")

        # PyTorch runtime
        print(f"\n{Fore.YELLOW}◆ PyTorch 运行时{Style.RESET_ALL}")
        torch_info = self.report["gpu"].get("pytorch", {})
        if torch_info.get("cuda_available"):
            print(f"{Fore.GREEN}✓ CUDA 可用 ({torch.version.cuda}){Style.RESET_ALL}")
            for idx, device in enumerate(torch_info["devices"]):
                print(f"设备 {idx}: {device['name']}")
                print(f"├─ 计算能力: {device['capability']}")
                print(f"├─ 流处理器: {device['multiprocessors']}")
                print(f"└─ 总显存: {self._format_memory(device['total_memory'])}")
        else:
            print(f"{Fore.RED}× PyTorch CUDA 不可用{Style.RESET_ALL}")

        # Compatibility verdict (only present when the check ran)
        if "compatibility" in self.report:
            print(f"\n{Fore.YELLOW}◆ 兼容性验证{Style.RESET_ALL}")
            compat = self.report["compatibility"]
            status = f"{Fore.GREEN}✓ 兼容" if compat["compatible"] else f"{Fore.RED}× 不兼容"
            print(f"{status}{Style.RESET_ALL}")
            print(f"CUDA版本: {compat['cuda_version']}")
            print(f"驱动版本: {compat['driver_version']}")

        # Error log
        if self.report["errors"]:
            print(f"\n{Fore.RED}◆ 错误日志 ({len(self.report['errors'])}){Style.RESET_ALL}")
            for error in self.report["errors"]:
                print(f"[{error['timestamp']}] {error['context']}: {error['message']}")


if __name__ == "__main__":
    # Command-line entry point: collect the report, then print it and/or save it.
    parser = argparse.ArgumentParser(description="DIAMOND - 深度硬件诊断工具")
    parser.add_argument("-f", "--format", choices=["text", "json"], default="text", help="输出格式")
    parser.add_argument("-o", "--output", help="保存报告到文件")
    args = parser.parse_args()

    diagnoser = DiamondDiagnoser(output_format=args.format)
    report = diagnoser.generate_report()

    if args.output:
        try:
            # Explicit UTF-8: the report contains non-ASCII text (°C, Chinese
            # error contexts) that the platform default encoding may not be
            # able to represent.
            with open(args.output, 'w', encoding='utf-8') as f:
                if args.format == "json":
                    json.dump(report, f, indent=2, ensure_ascii=False)
                else:
                    diagnoser.print_report()
                    # NOTE(review): the text branch stores the dict's repr, not
                    # the formatted console report; kept as-is for existing users.
                    f.write(str(report))
            print(f"{Fore.GREEN}报告已保存至: {args.output}{Style.RESET_ALL}")
        except Exception as e:
            # Top-level boundary: report the failure instead of a traceback.
            print(f"{Fore.RED}保存失败: {str(e)}{Style.RESET_ALL}")
    else:
        diagnoser.print_report()

查看全文

http://www.dtcms.com/a/48188.html