
Adapting NNI to TensorRT 10


Introduction

This article involves two frameworks, with the following versions:

  • NNI (Neural Network Intelligence): 3.0
  • TensorRT: 10.9.0.34

In the document Speed Up Quantized Model with TensorRT, NNI describes how to use TensorRT to accelerate models quantized by NNI. However, judging from the NNI source code at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/integrated_tensorrt.py:

  • https://github.com/microsoft/nni/blob/767ed7f22e1e588ce76cbbecb6c6a4a76a309805/nni/compression/quantization_speedup/integrated_tensorrt.py#L14
TRT8 = 8
  • https://github.com/microsoft/nni/blob/767ed7f22e1e588ce76cbbecb6c6a4a76a309805/nni/compression/quantization_speedup/integrated_tensorrt.py#L292
assert trt_version >= TRT8, "Version of TensorRT is too old, please update TensorRT to version >= 8.0"

NNI actually only supports the TensorRT 8 API. TensorRT has since moved on to version 10, and NNI is no longer maintained, so the adaptation of NNI to TensorRT 10 has to be done by hand.
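The first thing to trip on TensorRT 10 is the version check itself: if the major version is read naively from the version string, "10.9.0.34" is mistaken for version 1. A standalone illustration (the split-based parse is the form used in the patch below):

import tensorrt as trt

# With TensorRT 10.9.0.34 installed:
print(trt.__version__)                      # '10.9.0.34'
print(int(trt.__version__[0]))              # 1  -> a ">= 8" major-version check would fail
print(int(trt.__version__.split('.')[0]))   # 10 -> the real major version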

This article completes the adaptation with reference to:

  • the API Migration Guide
  • personal hands-on experience

Adaptation Process

Modifying integrated_tensorrt.py

integrated_tensorrt.py is located at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/integrated_tensorrt.py

  • L291
assert trt_version >= TRT8, "Version of TensorRT is too old, please update TensorRT to version >= 8.0"

Change to:

trt_version = int(trt.__version__.split('.')[0])
assert trt_version >= TRT8, "Version of TensorRT is too old, please update TensorRT to version >= 8.0"

Parsing the major version from the first dotted field means "10.9.0.34" yields 10, so the assertion passes on TensorRT 10.
  • L231-L232
builder.max_batch_size = input_shape[0]
trt_config.max_workspace_size = common.GiB(8)

Change to:

# builder.max_batch_size = input_shape[0]
# trt_config.max_workspace_size = common.GiB(8)
trt_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)

TensorRT 10 removes the implicit-batch max_batch_size and the max_workspace_size attribute: the batch size now comes from the explicit-batch network shape, and the workspace is capped through a memory pool limit (8 << 30 bytes is the same 8 GiB as common.GiB(8)).

  • L255
engine = builder.build_engine(network, trt_config)

Change to:

# engine = builder.build_engine(network, trt_config)
engine_data = builder.build_serialized_network(network, trt_config)
if not engine_data:
    raise RuntimeError("Failed to build serialized engine.")
runtime = trt.Runtime(TRT_LOGGER)
engine = runtime.deserialize_cuda_engine(engine_data)

builder.build_engine was removed in TensorRT 10; build_serialized_network returns a serialized plan that trt.Runtime deserializes back into an engine (see the version-compatible helper sketched after this list).
  • L354
engine_input_shape = self.engine.get_binding_shape(0)

Change to:

# engine_input_shape = self.engine.get_binding_shape(0)
engine_input_shape = self.engine.get_tensor_shape(self.engine.get_tensor_name(0))
  • L365
trt_outputs = common.do_inference_v2(self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream)

Change to:

trt_outputs = common.do_inference_v2(self.engine, self.context, self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream)

The engine is now passed in as well, because the reworked do_inference_v2 (see the trt_pycuda.py changes below) needs it to look up tensor names for set_tensor_address.
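As promised above, here is a minimal version-compatible wrapper around the engine-build step. It is a hypothetical helper sketched for illustration (not part of NNI); it assumes a TRT_LOGGER like the one integrated_tensorrt.py already defines, and simply dispatches between the TensorRT 8 build_engine call and the TensorRT 10 serialized-network path:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine_compat(builder, network, trt_config):
    # Illustrative helper: return an ICudaEngine on both TensorRT 8 and TensorRT 10.
    if hasattr(builder, "build_engine"):
        # TensorRT 8.x still exposes build_engine (deprecated); it is gone in TensorRT 10.
        return builder.build_engine(network, trt_config)
    # TensorRT 10 path: build a serialized plan, then deserialize it into an engine.
    plan = builder.build_serialized_network(network, trt_config)
    if plan is None:
        raise RuntimeError("Failed to build serialized engine.")
    return trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(plan)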

Modifying trt_pycuda.py

trt_pycuda.py is located at https://github.com/microsoft/nni/blob/master/nni/compression/quantization_speedup/trt_pycuda.py

  • L77
for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding))  # * engine.max_batch_size, batch size already in
    dtype = trt.nptype(engine.get_binding_dtype(binding))

Change to:

# ref to https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html
# modify the code for compatibility with TensorRT 10.0
"""
for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
"""
for i in range(engine.num_io_tensors):
    tensor_name = engine.get_tensor_name(i)
    size = trt.volume(engine.get_tensor_shape(tensor_name))
    dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))

  • L93
if engine.binding_is_input(binding):

Change to:

# if engine.binding_is_input(binding):
if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
  • L102
def do_inference_v2(context, bindings, inputs, outputs, stream):

Change to:

def do_inference_v2(engine, context, bindings, inputs, outputs, stream):
    # Ref to https://docs.nvidia.com/deeplearning/tensorrt/migration-guide/index.html
    # to set up tensor addresses
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
  • L110
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

Change to:

# context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
context.execute_async_v3(stream_handle=stream.handle)
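Putting the trt_pycuda.py changes together, the name-based buffer allocation and inference loop end up looking roughly like the sketch below. The HostDeviceMem class is a simplified stand-in for the helper NNI already defines in trt_pycuda.py, and the surrounding code in NNI may differ in details:

import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.driver as cuda
import tensorrt as trt

class HostDeviceMem:
    # Simplified stand-in for the HostDeviceMem helper in NNI's trt_pycuda.py
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

def allocate_buffers(engine):
    # Allocate host/device buffers per I/O tensor using the TensorRT 10 name-based API
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        size = trt.volume(engine.get_tensor_shape(name))
        dtype = trt.nptype(engine.get_tensor_dtype(name))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def do_inference_v2(engine, context, bindings, inputs, outputs, stream):
    # Bind device addresses by tensor name, then run with execute_async_v3
    for i in range(engine.num_io_tensors):
        context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    context.execute_async_v3(stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]

Addresses are bound by tensor name rather than by binding index, which is why do_inference_v2 now needs the engine: only the engine can map index i back to its tensor name.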

Experiment

We now quantize ResNet18 with NNI and verify the acceleration with TensorRT.

Experiment Setup

  • Platform information

This Benchmark is running on the following Hardware:
CPU Information:
CPU Brand: Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz
CPU Architecture: X86_64
CPU Count: 40

GPU Information:
GPU Device: Tesla T4
GPU Count: 1
CUDA Version: 12.1
GPU Memory Usage:
  Allocated: 0.00MB
  Cached: 0.00MB

This Benchmark is running on the following Software:
PyTorch Version: 2.4.1+cu121
ONNX Version: 1.17.0
ONNXRuntime Version: 1.19.2
TensorRT Version: 10.9.0.34
  • Code

import torch
import torch.nn.functional as F
import torch.nn as nn
import torchvision.models as models

model = models.resnet18(pretrained=True)

BATCH_SIZE = 32
NUM_CLASSES = 1000
INPUT_SHAPE = (BATCH_SIZE, 3, 32, 32)
OUTPUT_SHAPE = (BATCH_SIZE, NUM_CLASSES)

dummy_input = torch.randn(BATCH_SIZE, 3, 32, 32)
cnn_model_onnx_save_path = "resnet18_pytorch.onnx"
torch.onnx.export(
    model,
    dummy_input,  # e.g. shape=(1, 3, 224, 224)
    cnn_model_onnx_save_path,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    opset_version=11,
)

import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np
import torch.onnx
import torchvision.models as models
from torchvision.models import ResNet18_Weights
import torch
import time
import onnx
import onnxruntime as ort
import cpuinfo
import matplotlib.pyplot as plt


def build_engine(onnx_path, input_shape):
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("Failed to parse ONNX.")
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB
    # Pin the input to a single static shape via the optimization profile
    profile = builder.create_optimization_profile()
    input_tensor = network.get_input(0)
    profile.set_shape(input_tensor.name, input_shape, input_shape, input_shape)
    config.add_optimization_profile(profile)
    engine_data = builder.build_serialized_network(network, config)
    if not engine_data:
        raise RuntimeError("Failed to build serialized engine.")
    runtime = trt.Runtime(logger)
    engine = runtime.deserialize_cuda_engine(engine_data)
    return engine


# PyTorch inference
def pytorch_inference(model, dummy_input, num_runs=100):
    model.eval()
    # model.half()
    # dummy_input = dummy_input.half()
    with torch.no_grad():
        # Warmup
        for _ in range(10):
            _ = model(dummy_input)
        # Benchmark
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(num_runs):
            _ = model(dummy_input)
        torch.cuda.synchronize()
        end = time.time()
    return (end - start) / num_runs


# ONNX inference
def onnx_inference(
    onnx_path,
    dummy_input,
    num_runs=100,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
):
    session = ort.InferenceSession(onnx_path, providers=providers)
    print(f"[ONNX Inference]: Using providers: {session.get_providers()} Args: {providers}")
    # Warmup
    for _ in range(10):
        _ = session.run(None, {"input": dummy_input})
    # Benchmark
    start = time.time()
    for _ in range(num_runs):
        _ = session.run(None, {"input": dummy_input})
    end = time.time()
    return (end - start) / num_runs


# TensorRT inference
def tensorrt_inference(engine, d_input, dummy_input, num_runs=100):
    context = engine.create_execution_context()
    stream = cuda.Stream()
    # Allocate host and device memory for output
    output_shape = (BATCH_SIZE, 1000)  # Based on ResNet18 output shape
    h_output = cuda.pagelocked_empty(output_shape, dtype=np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create bindings
    # bindings = [int(d_input), int(d_output)]
    # Ref to https://forums.developer.nvidia.com/t/how-to-correctly-set-up-bindings-for-execute-async-v3/289924 to fix bugs about bindings
    context.set_tensor_address(engine.get_tensor_name(0), int(d_input))
    context.set_tensor_address(engine.get_tensor_name(1), int(d_output))
    # Warmup
    for _ in range(10):
        cuda.memcpy_htod_async(d_input, dummy_input, stream)
        context.execute_async_v3(stream_handle=stream.handle)
        stream.synchronize()
    # Benchmark
    start = time.time()
    for _ in range(num_runs):
        cuda.memcpy_htod_async(d_input, dummy_input, stream)
        context.execute_async_v3(stream.handle)
        stream.synchronize()
    end = time.time()
    avg_time = (end - start) / num_runs
    return avg_time


def quantized_tensorrt_inference(engine, dummy_tensor, num_runs=100):
    total_time = 0
    # Warmup
    for _ in range(10):
        output, time_span = engine.inference(dummy_tensor)
    # Benchmark
    for _ in range(num_runs):
        output, time_span = engine.inference(dummy_tensor)
        total_time += time_span
    avg_time = total_time / num_runs
    return avg_time


# Run benchmarks
# CPU Information
cpu_info = cpuinfo.get_cpu_info()
print("This Benchmark is running on the following Hardware:")
print("CPU Information:")
print(f"CPU Brand: {cpu_info['brand_raw']}")
print(f"CPU Architecture: {cpu_info['arch']}")
print(f"CPU Count: {cpu_info['count']}")

# GPU Information
print("\nGPU Information:")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"GPU Count: {torch.cuda.device_count()}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Memory Usage:")
    print(f"  Allocated: {torch.cuda.memory_allocated(0)/1024**2:.2f}MB")
    print(f"  Cached: {torch.cuda.memory_reserved(0)/1024**2:.2f}MB")
else:
    print("No GPU available")

print("This Benchmark is running on the following Software:")
print(f"PyTorch Version: {torch.__version__}")
print(f"ONNX Version: {onnx.__version__}")
print(f"ONNXRuntime Version: {ort.__version__}")
print(f"TensorRT Version: {trt.__version__}")

# 1. Build the engine
engine = build_engine(cnn_model_onnx_save_path, INPUT_SHAPE)

input_nbytes = int(np.prod(INPUT_SHAPE) * np.float32().nbytes)
d_input = cuda.mem_alloc(input_nbytes)
# Output dimension is assumed to be 1000
output_nbytes = int(np.prod(OUTPUT_SHAPE) * np.float32().nbytes)
d_output = cuda.mem_alloc(output_nbytes)
bindings = [int(d_input), int(d_output)]

dummy_input = np.random.rand(*INPUT_SHAPE).astype(np.float32)
# dummy_input_pytorch = torch.tensor(dummy_input).cuda()

tensorrt_time = tensorrt_inference(engine, d_input, dummy_input)
onnx_cpu_time = onnx_inference(
    cnn_model_onnx_save_path, dummy_input, providers=["CPUExecutionProvider"]
)
onnx_gpu_time = onnx_inference(
    cnn_model_onnx_save_path, dummy_input, providers=["CUDAExecutionProvider"]
)
onnx_tensorrt_time = onnx_inference(
    cnn_model_onnx_save_path,
    dummy_input,
    providers=["TensorrtExecutionProvider"],
)
dummy_input_pytorch = torch.tensor(dummy_input)
pytorch_cpu_time = pytorch_inference(model.cpu(), dummy_input_pytorch.cpu())
pytorch_gpu_time = pytorch_inference(model.cuda(), dummy_input_pytorch.cuda())

from tquant.quantization.quantizer import QuantizationManager
from tquant.quantization.utils import create_optimizer
from torchvision.datasets import CIFAR10
from torchvision import transforms

config_list = [
    {
        "op_types": ["Conv2d", "Linear"],
        "target_names": ["weight"],
        "quant_dtype": "int8",
        "quant_scheme": "affine",
        "granularity": "default",
    },
]

device = "cuda" if torch.cuda.is_available() else "cpu"
ptq_manager = QuantizationManager('ptq', model, config_list, device)

transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)
calibration_loader = torch.utils.data.DataLoader(
    dataset=CIFAR10(root="./data", train=True, download=True, transform=transform),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

optimizer = create_optimizer(model, optimizer_name="SGD", lr=0.001)
scheduler = None
quantized_model, calibration_config = ptq_manager.quantize(
    calibration_loader, optimizer, scheduler
)
print(calibration_config)

calib_data = None
for image, target in calibration_loader:
    calib_data = image.numpy()
    break

from nni.compression.quantization_speedup.calibrator import Calibrator

# TensorRT processes the calibration data in batches of BATCH_SIZE
calib = Calibrator(
    calib_data,
    "data/cache/calib_cache_file.cache",  # Replace with your own cache file path (absolute path)
    batch_size=BATCH_SIZE,
)

from nni.compression.quantization_speedup import ModelSpeedupTensorRT

quant_engine = ModelSpeedupTensorRT(
    model, input_shape=INPUT_SHAPE, config=calibration_config
)
quant_engine.compress_with_calibrator(calib)
quantize_tensorrt_inference_time = quantized_tensorrt_inference(
    quant_engine, dummy_input_pytorch
)

times = [
    pytorch_cpu_time * 1000,
    onnx_cpu_time * 1000,
    pytorch_gpu_time * 1000,
    onnx_gpu_time * 1000,
    onnx_tensorrt_time * 1000,
    tensorrt_time * 1000,
    quantize_tensorrt_inference_time * 1000,
]
labels = [
    "PyTorch CPU",
    "ONNX CPU",
    "PyTorch GPU",
    "ONNX GPU",
    "ONNX TensorRT",
    "TensorRT",
    "TensorRT(Quantized)",
]

plt.figure(figsize=(15, 10))
plt.bar(labels, times)
plt.title("Inference Time Comparison")
plt.ylabel("Time (ms)")
plt.grid(True, alpha=0.3)
for i, v in enumerate(times):
    plt.text(i, v + 0.1, f"{v:.2f}ms", ha="center")
plt.show()

import numpy as np
import seaborn as sns

# Create an n x n matrix where each cell is time_row / time_col
n = len(times)
comparison_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        comparison_matrix[i][j] = times[i] / times[j]

# Create heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(
    comparison_matrix,
    annot=True,
    fmt=".2f",
    xticklabels=labels,
    yticklabels=labels,
    cmap="YlOrRd",
)
plt.title("Speed Comparison Matrix (row/column)")
plt.xlabel("Framework (denominator)")
plt.ylabel("Framework (numerator)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Experiment Results

  • Inference time comparison chart
    (figure omitted)
  • Speedup heatmap
    (figure omitted)

As the figures show, the quantized ResNet18 runs noticeably faster (to a greater or lesser degree) than its floating-point counterpart on PyTorch CPU, ONNX CPU, PyTorch GPU, ONNX GPU, ONNX TensorRT, and plain TensorRT.

Conclusion

With that, the adaptation of NNI to TensorRT 10 is complete.
