当前位置: 首页 > news >正文

基于 Python 的跨平台硬件诊断工具

本工具基于 Python 实现,通过 DiamondDiagnoser 类收集系统信息、PyTorch 运行时信息,以及 NVIDIA GPU 的详细状态(包括温度、利用率、显存使用情况等)。

"""
DIAMOND - Deep Insight And Monitoring of Accelerated Neural Devices
跨平台硬件诊断工具 v2.1
"""
import argparse
import contextlib
import io
import json
import os
import platform
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple

# Third-party dependencies. If any of them cannot be imported, print the
# ImportError and exit with status 1 instead of failing later with a traceback.
try:
    import torch
    import chardet
    from colorama import Fore, Style, init
except ImportError as e:
    print(f"缺少依赖库: {e}")
    sys.exit(1)

# Initialise colorama's coloured output (called with autoreset=True).
init(autoreset=True)


class DiamondDiagnoser:
    """Collect and print a hardware/software diagnostic report.

    The report is a plain dict kept on ``self.report``. It always has the
    keys ``meta``, ``system``, ``gpu`` and ``errors``; ``hardware`` and
    ``compatibility`` are added by the advanced checks when the matching data
    could be collected. Collection steps never raise: failures are appended
    to ``report["errors"]`` instead.
    """

    # Minimum NVIDIA driver major version required per CUDA major version.
    _MIN_DRIVER_FOR_CUDA: Dict[int, int] = {12: 525, 11: 450, 10: 410}

    def __init__(self, output_format: str = "text"):
        """Create a diagnoser with an empty report.

        Args:
            output_format: ``"json"`` makes :meth:`print_report` dump the raw
                report as JSON; any other value selects the coloured text
                layout.
        """
        self.output_format = output_format
        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # tzinfo so the serialized form stays "<naive ISO timestamp>Z".
        generated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        self.report = {
            "meta": {
                "tool": "DIAMOND",
                "version": "2.1",
                "generated_at": generated_at.isoformat() + "Z"
            },
            "system": {},
            "gpu": {},
            "errors": []
        }

    def _log_error(self, context: str, error: Exception):
        """Append a structured entry describing *error* to ``report["errors"]``.

        Args:
            context: Short description of what was being attempted.
            error: The exception that was caught.
        """
        error_entry = {
            # The trailing "Z" claims UTC, so format gmtime rather than the
            # local time that time.strftime() uses by default.
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "context": context,
            "type": type(error).__name__,
            "message": str(error)
        }
        self.report["errors"].append(error_entry)

    @staticmethod
    def _decode_output(raw: bytes) -> str:
        """Decode command output using chardet's guess, never raising.

        Falls back to UTF-8 with replacement characters when the guessed
        encoding is unknown to Python or cannot decode the bytes.
        """
        if not raw:
            return ""
        encoding = (chardet.detect(raw) or {}).get('encoding') or 'utf-8'
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            return raw.decode('utf-8', errors='replace')

    def _run_cmd(self, cmd: List[str], timeout: int = 15) -> Tuple[bool, str]:
        """Run *cmd* without a shell and capture its combined stdout/stderr.

        Args:
            cmd: Program and arguments as a list.
            timeout: Seconds to wait before giving up.

        Returns:
            ``(True, output)`` on exit status 0, otherwise
            ``(False, message)``. Every failure is also recorded in the
            report's error list; this method does not raise.
        """
        try:
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                timeout=timeout,
                check=True
            )
        except subprocess.CalledProcessError as e:
            # Record non-zero exits too, so a failing tool is visible in the
            # report instead of silently looking like "nothing found".
            self._log_error(f"执行命令 {' '.join(cmd)}", e)
            detail = (e.output or b"").decode(errors='replace')
            return False, f"Command failed (code {e.returncode}): {detail}"
        except Exception as e:
            # Missing executable, timeout, permission problems, ...
            self._log_error(f"执行命令 {' '.join(cmd)}", e)
            return False, str(e)

        return True, self._decode_output(result.stdout).strip()

    def _get_platform_info(self):
        """Fill ``report["system"]`` with OS, Python and CUDA env details."""
        try:
            self.report["system"] = {
                "platform": {
                    "system": platform.system(),
                    "release": platform.release(),
                    "version": platform.version(),
                    "machine": platform.machine()
                },
                "python": {
                    "version": platform.python_version(),
                    "implementation": platform.python_implementation(),
                    "compiler": platform.python_compiler()
                },
                "environment": {
                    "CUDA_HOME": os.getenv("CUDA_HOME"),
                    "CUDA_PATH": os.getenv("CUDA_PATH"),
                    "CUDA_VISIBLE_DEVICES": os.getenv("CUDA_VISIBLE_DEVICES")
                }
            }
        except Exception as e:
            self._log_error("收集系统信息", e)

    def _get_pytorch_info(self):
        """Fill ``report["gpu"]["pytorch"]`` with PyTorch/CUDA device details.

        Each entry in ``devices`` carries its own ``memory`` snapshot. The
        top-level ``memory`` key is kept for existing consumers and holds the
        totals across all devices (identical to the per-device values on a
        single-GPU machine).
        """
        try:
            torch_info = {
                "version": torch.__version__,
                "cuda_available": torch.cuda.is_available(),
                "devices": []
            }

            if torch_info["cuda_available"]:
                totals = {"allocated": 0, "reserved": 0, "cached": 0}
                for i in range(torch.cuda.device_count()):
                    prop = torch.cuda.get_device_properties(i)
                    allocated = torch.cuda.memory_allocated(i)
                    reserved = torch.cuda.memory_reserved(i)
                    memory = {
                        "allocated": allocated,
                        "reserved": reserved,
                        # Reserved by the allocator but not currently in use.
                        "cached": reserved - allocated
                    }
                    torch_info["devices"].append({
                        "name": prop.name,
                        "capability": f"{prop.major}.{prop.minor}",
                        "total_memory": prop.total_memory,
                        "multiprocessors": prop.multi_processor_count,
                        "memory": memory
                    })
                    for key, value in memory.items():
                        totals[key] += value

                torch_info["memory"] = totals

            self.report["gpu"]["pytorch"] = torch_info
        except Exception as e:
            self._log_error("获取PyTorch信息", e)

    @staticmethod
    def _parse_mib(field: str) -> Optional[int]:
        """Return *field* as an integer, or None when it is not numeric."""
        try:
            return int(field)
        except ValueError:
            return None

    @staticmethod
    def _format_gb(mib: Optional[int]) -> str:
        """Render a MiB count as the report's "x.y GB" text, or "N/A"."""
        return "N/A" if mib is None else f"{mib / 1024:.1f} GB"

    @classmethod
    def _parse_nvidia_smi_csv(cls, output: str) -> List[Dict]:
        """Parse ``nvidia-smi --format=csv,noheader,nounits`` query output.

        Expects eight comma-separated fields per line, in the order requested
        by :meth:`_get_nvidia_smi`. Lines with a different field count are
        skipped. Non-numeric memory fields (e.g. ``[N/A]``) are reported as
        ``"N/A"`` instead of aborting the whole parse.
        """
        gpus = []
        for line in output.splitlines():
            fields = [f.strip() for f in line.split(', ')]
            if len(fields) != 8:
                continue

            total = cls._parse_mib(fields[6])
            used = cls._parse_mib(fields[7])
            free = None if total is None or used is None else total - used
            gpus.append({
                "index": fields[0],
                "name": fields[1],
                "pci_bus": fields[2],
                "driver": fields[3],
                "temperature": f"{fields[4]}°C",
                "utilization": f"{fields[5]}%",
                "memory": {
                    "total": cls._format_gb(total),
                    "used": cls._format_gb(used),
                    "free": cls._format_gb(free)
                }
            })
        return gpus

    def _get_nvidia_smi(self):
        """Fill ``report["gpu"]["nvidia"]`` from an ``nvidia-smi`` query.

        The key is always written, even when the query or the parsing fails,
        so later steps can rely on its presence.
        """
        gpu_list: List[Dict] = []
        try:
            success, output = self._run_cmd(["nvidia-smi",
                                             "--query-gpu=index,name,pci.bus_id,driver_version,temperature.gpu,utilization.gpu,memory.total,memory.used",
                                             "--format=csv,noheader,nounits"])
            if success:
                gpu_list = self._parse_nvidia_smi_csv(output)
        except Exception as e:
            self._log_error("解析NVIDIA-SMI", e)

        self.report["gpu"]["nvidia"] = {
            "detected": len(gpu_list) > 0,
            "count": len(gpu_list),
            "details": gpu_list
        }

    def _get_advanced_checks(self):
        """Run the optional PCI listing and driver/CUDA compatibility checks.

        Adds ``report["hardware"]`` (Linux only, when ``lspci`` succeeds) and
        ``report["compatibility"]`` (when CUDA is available and an NVIDIA GPU
        was detected).
        """
        # PCI device check
        try:
            if platform.system() == "Linux":
                success, lspci = self._run_cmd(["lspci", "-nnk"])
                if success:
                    self.report["hardware"] = {
                        "pci_devices": [line.strip() for line in lspci.splitlines()
                                        if "VGA" in line or "3D" in line]
                    }
        except Exception as e:
            self._log_error("PCI设备检查", e)

        # Driver compatibility check
        try:
            # The nvidia section may be absent if this method is called on its
            # own; treat that as "no GPU detected" rather than an error.
            nvidia = self.report["gpu"].get("nvidia") or {}
            if torch.cuda.is_available() and nvidia.get("detected"):
                cuda_version = torch.version.cuda
                success, driver = self._run_cmd(["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"])
                if success:
                    # nvidia-smi prints one line per GPU; keep only the first
                    # so multi-GPU hosts do not get a multi-line version.
                    lines = driver.splitlines()
                    driver_version = lines[0].strip() if lines else ""
                    self.report["compatibility"] = {
                        "cuda_version": cuda_version,
                        "driver_version": driver_version,
                        "compatible": self._check_driver_compatibility(cuda_version, driver_version)
                    }
        except Exception as e:
            self._log_error("驱动兼容性检查", e)

    def _check_driver_compatibility(self, cuda_ver: str, driver_ver: str) -> bool:
        """Return whether the driver major version is new enough for CUDA.

        Args:
            cuda_ver: CUDA version string such as ``"12.1"``.
            driver_ver: Driver version string such as ``"535.104.05"``.

        Returns:
            True when the driver major version meets the minimum for the CUDA
            major version. CUDA majors not in the table have no minimum and
            are reported compatible. False when either version cannot be
            parsed (including ``None``).
        """
        try:
            cuda_major = int(cuda_ver.split('.')[0])
            driver_major = int(driver_ver.split('.')[0])
        except (AttributeError, TypeError, ValueError):
            return False
        return driver_major >= self._MIN_DRIVER_FOR_CUDA.get(cuda_major, 0)

    def generate_report(self):
        """Run every collection step in order and return the report dict."""
        self._get_platform_info()
        self._get_pytorch_info()
        self._get_nvidia_smi()
        self._get_advanced_checks()
        return self.report

    def _format_memory(self, bytes_val: int) -> str:
        """Format a byte count with a binary unit (B, KiB, MiB, GiB, TiB)."""
        value = bytes_val
        for unit in ['B', 'KiB', 'MiB', 'GiB']:
            if value < 1024:
                return f"{value:.2f} {unit}"
            value /= 1024
        return f"{value:.2f} TiB"

    def print_report(self):
        """Print the report to stdout as JSON or as coloured text.

        Sections whose data is missing (for example because a collection step
        failed, or because :meth:`generate_report` has not been called) are
        reported as unavailable instead of raising ``KeyError``.
        """
        if self.output_format == "json":
            print(json.dumps(self.report, indent=2))
            return

        # Coloured console output
        print(f"\n{Fore.BLUE}=== DIAMOND 硬件诊断报告 ==={Style.RESET_ALL}")
        print(f"{Fore.CYAN}生成时间:{Style.RESET_ALL} {self.report['meta']['generated_at']}")

        # System information
        print(f"\n{Fore.YELLOW}◆ 系统概览{Style.RESET_ALL}")
        system = self.report.get("system") or {}
        sys_info = system.get("platform")
        py_info = system.get("python")
        if sys_info and py_info:
            print(f"操作系统: {sys_info['system']} {sys_info['release']} ({sys_info['machine']})")
            print(f"Python环境: {py_info['version']} ({py_info['compiler']})")
        else:
            print(f"{Fore.RED}× 系统信息不可用{Style.RESET_ALL}")

        gpu_section = self.report.get("gpu") or {}

        # GPU information
        print(f"\n{Fore.YELLOW}◆ 加速器状态{Style.RESET_ALL}")
        nvidia = gpu_section.get("nvidia") or {}
        if nvidia.get("detected"):
            for gpu in nvidia.get("details", []):
                print(f"{Fore.GREEN}GPU {gpu['index']}: {gpu['name']}{Style.RESET_ALL}")
                print(f"├─ PCI总线: {gpu['pci_bus']}")
                print(f"├─ 驱动版本: {gpu['driver']}")
                print(f"├─ 温度: {gpu['temperature']}")
                print(f"├─ 利用率: {gpu['utilization']}")
                print(f"└─ 显存: {gpu['memory']['used']} / {gpu['memory']['total']} (剩余 {gpu['memory']['free']})")
        else:
            print(f"{Fore.RED}× 未检测到NVIDIA GPU{Style.RESET_ALL}")

        # PyTorch information
        print(f"\n{Fore.YELLOW}◆ PyTorch 运行时{Style.RESET_ALL}")
        torch_info = gpu_section.get("pytorch") or {}
        if torch_info.get("cuda_available"):
            print(f"{Fore.GREEN}✓ CUDA 可用 ({torch.version.cuda}){Style.RESET_ALL}")
            for idx, device in enumerate(torch_info.get("devices", [])):
                print(f"设备 {idx}: {device['name']}")
                print(f"├─ 计算能力: {device['capability']}")
                print(f"├─ 流处理器: {device['multiprocessors']}")
                print(f"└─ 总显存: {self._format_memory(device['total_memory'])}")
        else:
            print(f"{Fore.RED}× PyTorch CUDA 不可用{Style.RESET_ALL}")

        # Compatibility check
        if "compatibility" in self.report:
            print(f"\n{Fore.YELLOW}◆ 兼容性验证{Style.RESET_ALL}")
            compat = self.report["compatibility"]
            status = f"{Fore.GREEN}✓ 兼容" if compat["compatible"] else f"{Fore.RED}× 不兼容"
            print(f"{status}{Style.RESET_ALL}")
            print(f"CUDA版本: {compat['cuda_version']}")
            print(f"驱动版本: {compat['driver_version']}")

        # Error log
        if self.report["errors"]:
            print(f"\n{Fore.RED}◆ 错误日志 ({len(self.report['errors'])}){Style.RESET_ALL}")
            for error in self.report["errors"]:
                print(f"[{error['timestamp']}] {error['context']}: {error['message']}")


# Command-line entry point: build the report, then print it and/or save it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="DIAMOND - 深度硬件诊断工具")
    parser.add_argument("-f", "--format", choices=["text", "json"], default="text", help="输出格式")
    parser.add_argument("-o", "--output", help="保存报告到文件")
    args = parser.parse_args()

    diagnoser = DiamondDiagnoser(output_format=args.format)
    report = diagnoser.generate_report()

    if args.output:
        try:
            if args.format == "json":
                # Explicit UTF-8 so the result does not depend on the locale.
                with open(args.output, 'w', encoding="utf-8") as f:
                    json.dump(report, f, indent=2)
            else:
                # Render the text report once: show it on the console as
                # before, and save the same text (not the dict repr) to the
                # file with the ANSI colour sequences removed.
                buffer = io.StringIO()
                try:
                    with contextlib.redirect_stdout(buffer):
                        diagnoser.print_report()
                finally:
                    # Show whatever was rendered, even if rendering failed
                    # part-way through.
                    print(buffer.getvalue(), end="")
                plain_text = re.sub(r"\x1b\[[0-9;]*m", "", buffer.getvalue())
                with open(args.output, 'w', encoding="utf-8") as f:
                    f.write(plain_text)
            print(f"{Fore.GREEN}报告已保存至: {args.output}{Style.RESET_ALL}")
        except Exception as e:
            print(f"{Fore.RED}保存失败: {str(e)}{Style.RESET_ALL}")
    else:
        diagnoser.print_report()

相关文章:

  • 刷题 | 牛客 - js入门15题(更ing)5/15知识点解答
  • ubuntu 启动不起来,光标闪烁 解决方法
  • 杰和科技工业整机AF208|防尘+静音+全天候运行
  • GPU/CUDA 发展编年史:从 3D 渲染到 AI 大模型时代
  • 谈谈 HTTPS 的工作原理,SSL / TLS 握手流程是什么?
  • RabbitMQ怎么实现延时支付?
  • C++:内联函数
  • Linux常用指令
  • VirtualBox虚拟机安装Mac OS启动后的系统设置
  • 指纹细节提取(Matlab实现)
  • Java 大视界 -- Java 大数据在智能教育考试评估与学情分析中的应用(112)
  • RV1126的OSD模块和SDL_TTF结合输出H264文件
  • Elasticsearch简单学习
  • 电子电路中,正负双电源供电的需求原因
  • excel 斜向拆分单元格
  • 第51天:Web开发-JavaEE应用SpringBoot栈身份验证JWT令牌Security鉴权安全绕过
  • Webpack、Vite区别知多少?
  • 单片机学习规划
  • Java学习——day14
  • 成功解决 “\ufeffimport sys“ SyntaxError: invalid character in identifier
  • 西部航空回应飞机上卖彩票:与重庆福彩合作,仅部分航班售卖
  • 神舟十九号航天员乘组平安抵京
  • 欢迎回家!神十九返回舱成功着陆
  • 陈文清:推进扫黑除恶常态化走深走实,有力回应人民群众对安居乐业的新期待
  • 总有黑眼圈是因为“虚”吗?怎么睡才能改善?
  • 北京公园使用指南