当前位置：首页 > news >正文

CUDA编程 - 如何在 GPU 上使用 C++ 函数重载 - cppOverload

news 2025/7/6 0:05:06

这里写目录标题

一、完整代码与例程目的
二、代码拆解与复用
- - 2.1、函数重载：
  - 2.2、函数指针声明：
  - 2.3、函数指针赋值与内核启动：
  - 2.4、CUDA API调用：
  - - 2.4.1、cudaFuncSetCacheConfig：
    - 2.4.2、cudaFuncGetAttributes：
    - 2.4.3、数据回传与验证：

一、完整代码与例程目的

完整代码地址：https://github.com/NVIDIA/cuda-samples/tree/v11.8/Samples/0_Introduction/cppOverload

此示例演示如何在 GPU 上使用 C++ 函数重载。

完整代码

#define THREAD_N 256
#define N 1024
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))// Includes, system
#include <stdio.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include <helper_math.h>__global__ void simple_kernel(const int *pIn, int *pOut, int a) {__shared__ int sData[THREAD_N];int tid = threadIdx.x + blockDim.x * blockIdx.x;sData[threadIdx.x] = pIn[tid];__syncthreads();pOut[tid] = sData[threadIdx.x] * a + tid;;
}__global__ void simple_kernel(const int2 *pIn, int *pOut, int a) {__shared__ int2 sData[THREAD_N];int tid = threadIdx.x + blockDim.x * blockIdx.x;sData[threadIdx.x] = pIn[tid];__syncthreads();pOut[tid] = (sData[threadIdx.x].x + sData[threadIdx.x].y) * a + tid;;
}const char *sampleName = "C++ Function Overloading";#define OUTPUT_ATTR(attr)                                         \printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);       \printf("Constant Size: %d\n", (int)attr.constSizeBytes);        \printf("Local Size:    %d\n", (int)attr.localSizeBytes);        \printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \printf("Number of Registers: %d\n", attr.numRegs);              \printf("PTX Version: %d\n", attr.ptxVersion);                   \printf("Binary Version: %d\n", attr.binaryVersion);bool check_func1(int *hInput, int *hOutput, int a) {for (int i = 0; i < N; ++i) {int cpuRes = hInput[i] * a + i;if (hOutput[i] != cpuRes) {return false;}}return true;
}bool check_func2(int2 *hInput, int *hOutput, int a) {for (int i = 0; i < N; i++) {int cpuRes = (hInput[i].x + hInput[i].y) * a + i;if (hOutput[i] != cpuRes) {return false;}}return true;
}bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a) {for (int i = 0; i < N; i++) {if (hOutput[i] != (hInput1[i] + hInput2[i]) * a + i) {return false;}}return true;
}int main(int argc, const char *argv[]) {int *hInput = NULL;int *hOutput = NULL;int *dInput = NULL;int *dOutput = NULL;printf("%s starting...\n", sampleName);int deviceCount;checkCudaErrors(cudaGetDeviceCount(&deviceCount));printf("Device Count: %d\n", deviceCount);int deviceID = findCudaDevice(argc, argv);cudaDeviceProp prop;checkCudaErrors(cudaGetDeviceProperties(&prop, deviceID));if (prop.major < 2) {printf("ERROR: cppOverload requires GPU devices with compute SM 2.0 or ""higher.\n");printf("Current GPU device has compute SM%d.%d, Exiting...", prop.major,prop.minor);exit(EXIT_WAIVED);}checkCudaErrors(cudaSetDevice(deviceID));// Allocate device memorycheckCudaErrors(cudaMalloc(&dInput, sizeof(int) * N * 2));checkCudaErrors(cudaMalloc(&dOutput, sizeof(int) * N));// Allocate host memorycheckCudaErrors(cudaMallocHost(&hInput, sizeof(int) * N * 2));checkCudaErrors(cudaMallocHost(&hOutput, sizeof(int) * N));for (int i = 0; i < N * 2; i++) {hInput[i] = i;}// Copy data from host to devicecheckCudaErrors(cudaMemcpy(dInput, hInput, sizeof(int) * N * 2, cudaMemcpyHostToDevice));// Test C++ overloadingbool testResult = true;bool funcResult = true;int a = 1;void (*func1)(const int *, int *, int);void (*func2)(const int2 *, int *, int);void (*func3)(const int *, const int *, int *, int);struct cudaFuncAttributes attr;// overload function 1func1 = simple_kernel;memset(&attr, 0, sizeof(attr));checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));OUTPUT_ATTR(attr);(*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));funcResult = check_func1(hInput, hOutput, a);printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n",funcResult ? "PASSED" : "FAILED");testResult &= funcResult;// overload function 2func2 = simple_kernel;memset(&attr, 0, sizeof(attr));checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));OUTPUT_ATTR(attr);(*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));funcResult = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n",funcResult ? "PASSED" : "FAILED");testResult &= funcResult;// overload function 3func3 = simple_kernel;memset(&attr, 0, sizeof(attr));checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));OUTPUT_ATTR(attr);(*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));funcResult = check_func3(&hInput[0], &hInput[N], hOutput, a);printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) ""%s\n\n",funcResult ? "PASSED" : "FAILED");testResult &= funcResult;checkCudaErrors(cudaFree(dInput));checkCudaErrors(cudaFree(dOutput));checkCudaErrors(cudaFreeHost(hOutput));checkCudaErrors(cudaFreeHost(hInput));checkCudaErrors(cudaDeviceSynchronize());exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

二、代码拆解与复用

2.1、函数重载：

三个同名的__global__ 函数 simple_kernel 通过不同的参数列表实现重载：

__global__ void simple_kernel(const int *pIn, int *pOut, int a) {__shared__ int sData[THREAD_N];int tid = threadIdx.x + blockDim.x * blockIdx.x;sData[threadIdx.x] = pIn[tid];__syncthreads();pOut[tid] = sData[threadIdx.x] * a + tid;;
}__global__ void simple_kernel(const int2 *pIn, int *pOut, int a) {__shared__ int2 sData[THREAD_N];int tid = threadIdx.x + blockDim.x * blockIdx.x;sData[threadIdx.x] = pIn[tid];__syncthreads();pOut[tid] = (sData[threadIdx.x].x + sData[threadIdx.x].y) * a + tid;;
}

(const int*, int*, int)

(const int2*, int*, int)

(const int*, const int*, int*, int)

2.2、函数指针声明：

void (*func1)(const int *, int *, int); // 指向第一个内核的指针

void (*func2)(const int2 *, int *, int); // 指向第二个内核的指针

void (*func3)(const int *, const int *, int *, int); // 指向第三个内核的指针

 void (*func1)(const int *, int *, int);void (*func2)(const int2 *, int *, int);void (*func3)(const int *, const int *, int *, int);

这些指针根据参数类型匹配对应的重载内核。

2.3、函数指针赋值与内核启动：

通过func1 = simple_kernel;将函数指针指向特定重载版本，编译器根据指针类型自动选择匹配的内核。

启动内核时使用(*func1)<<<...>>>(...)，解引用函数指针并传递执行配置（网格/块大小）和参数。

语法：将CUDA内核函数地址赋值给函数指针
用意：利用C++函数重载特性，根据函数指针func1的类型void () (const int,int*,int)，
编译器会自动选择参数匹配的simple_kernel重载版本
特殊点：CUDA内核函数地址需要通过 & 获取，但编译器允许隐式转换（此处*func1 等价于 simple_kernel ）

2.4、CUDA API调用：

2.4.1、cudaFuncSetCacheConfig：

设置内核的共享内存缓存偏好：

cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared);

API作用：为内核函数配置L1缓存/共享内存策略
参数： cudaFuncCachePreferShared：优先分配共享内存（48KB共享内存 + 16KB L1缓存）
优化目的：针对该内核需要大量共享内存访问的特性，通过调整缓存分配策略提升性能

2.4.2、cudaFuncGetAttributes：

获取内核属性（如寄存器使用量、共享内存大小等）。

cudaFuncGetAttributes(&attr, *func1);

struct cudaFuncAttributes {size_t sharedSizeBytes;    // 每个Block使用的共享内存size_t constSizeBytes;     // 常量内存使用量size_t localSizeBytes;      // 局部内存使用量int maxThreadsPerBlock;    // 该函数支持的最大线程数/Blockint numRegs;               // 寄存器使用量int ptxVersion;            // PTX版本int binaryVersion;         // 二进制版本int cacheModeCA;           // 缓存模式
};

调试用途：

验证寄存器使用是否符合预期
检查共享内存是否超限（例如超过48KB会启动失败）
适配不同计算能力的设备

2.4.3、数据回传与验证：

cudaMemcpy(hOutput, dOutput, ..., cudaMemcpyDeviceToHost);
funcResult = check_func1(...);
testResult &= funcResult;

安全操作：

cudaMemcpy 同步传输数据，确保内核执行完成
check_func1 自定义函数验证计算结果正确性

累积验证：

testResult &= …通过按位与运算累积所有测试用例的结果，只有所有测试通过时最终结果才为真

查看全文

http://www.dtcms.com/a/175079.html

C++学习知识点汇总

前端正则学习记录

Winform（12.控件讲解）

解决Hyper-V无法启动Debian 12虚拟机

Android Retrofit框架分析(三)：自动切换回主线程；bulid的过程；create方法+ServiceMethod源码了解

Webview通信系统学习指南

通过Config批量注入对象到Spring IoC容器

Qt开发经验 --- 避坑指南（4）

十分钟了解 @MapperScan

LeetCode 热题 100 22. 括号生成

大学之大：隆德大学2025.5.6

JSON 转换为 Word 文档

SLAM算法工程师面经大全：2025年面试真题解析与实战指南

个人Unity自用面经（未完）

Three.js 基础与实践

JavaSE核心知识点01基础语法01-04（数组）

QQMUSIC测试报告

双目标清单——AI与思维模型【96】

智能机器人赋能小天互连IM系统，打造高效办公新生态

cephadm部署ceph集群

Flowable7.x学习笔记（二十）查看流程办理进度图

从零开始学习人工智能Day6-Python3标准库概览

【AI提示词】六顶思考帽工具专家

智能学习空间的范式革新：基于AI驱动的自习室系统架构与应用研究

使用DevTools工具调试前端页面，便捷脚本，鸿蒙调试webView

PyTorch常用命令详解：助力深度学习开发

day18-API（常见API，对象克隆）

yolov8 输出数据解释

游戏的TypeScript(6)TypeScript的元编程

Spring 中四种常见初始化方法，对比 static {} 和 @PostConstruct 在并发，Spring 加载顺序大致为： JVM 加载类

这里写目录标题

一、完整代码与例程目的

二、代码拆解与复用

​ 2.1、函数重载​​：

​ 2.2、函数指针声明​​：

​ 2.3、函数指针赋值与内核启动​​：

​ 2.4、CUDA API调用​​：