
Overview

This article is aimed at readers who already have a CUDA background and need a quick implementation of layernorm backward. For a detailed treatment of the math and the optimization details, please see the articles in the reference links; this post focuses on the code itself. Feedback and corrections are welcome.

The theory and optimization techniques of layernorm_bwd have already been explained in detail by others (see the reference links), so they are not repeated here; this post simply reproduces the commonly used layernorm_bwd optimizations in code.

1. layernorm_bwd algorithm and CPU implementation

  • layernorm_bwd formula derivation (the original derivation figures are summarized below):
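The original post embedded the derivation as images, which are not reproduced here; the following summary is reconstructed from the reference implementation below. For one row of length H = hidden_dim, let \mu be the saved mean, r the saved reciprocal standard deviation (rstd), \hat{x}_j = (x_j - \mu)\, r the normalized input, \gamma the weight, \beta the bias, and dy_j the upstream gradient (doutput). The gradients accumulated per row are:

d\hat{x}_j = \gamma_j \, dy_j

\frac{\partial L}{\partial \beta_j} \mathrel{+}= dy_j, \qquad \frac{\partial L}{\partial \gamma_j} \mathrel{+}= \hat{x}_j \, dy_j

\frac{\partial L}{\partial x_j} \mathrel{+}= r \left( d\hat{x}_j \;-\; \frac{1}{H}\sum_{k=1}^{H} d\hat{x}_k \;-\; \hat{x}_j \cdot \frac{1}{H}\sum_{k=1}^{H} d\hat{x}_k \, \hat{x}_k \right)

In the code below, d\hat{x}_j is dnorm_i, \hat{x}_j is norm_bti, the two averages are dnorm_mean and dnorm_norm_mean, and r is rstd_val.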

template<typename T, typename T_ACC>
void layernorm_backward_cpu(T* dinput, T* dweight, T* dbias, T* doutput,
                            T* input, T* weight, T_ACC* mean, T_ACC* rstd,
                            const int batch, const int seq_len, const int hidden_dim)
{
    for (int b = 0; b < batch; b++) {
        for (int i = 0; i < seq_len; i++) {
            const T* doutput_offset = doutput + b * seq_len * hidden_dim + i * hidden_dim;
            T* dinput_offset = dinput + b * seq_len * hidden_dim + i * hidden_dim;
            const T* input_offset = input + b * seq_len * hidden_dim + i * hidden_dim;
            const T_ACC mean_val = mean[b * seq_len + i];
            const T_ACC rstd_val = rstd[b * seq_len + i];
            T dnorm_mean = 0.0f;
            T dnorm_norm_mean = 0.0f;
            for (int j = 0; j < hidden_dim; j++) {
                T norm_bti = (input_offset[j] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
                T dnorm_i = weight[j] * doutput_offset[j];
                dnorm_mean += dnorm_i;
                dnorm_norm_mean += dnorm_i * norm_bti;
            }
            dnorm_mean = dnorm_mean / static_cast<T>(hidden_dim);
            dnorm_norm_mean = dnorm_norm_mean / static_cast<T>(hidden_dim);
            for (int j = 0; j < hidden_dim; j++) {
                T norm_bti = (input_offset[j] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
                T dnorm_i = weight[j] * doutput_offset[j];
                // gradient to bias
                dbias[j] += doutput_offset[j];
                // gradient to weight
                dweight[j] += norm_bti * doutput_offset[j];
                // gradient to input
                T dval = 0.0f;
                dval += dnorm_i;
                dval -= dnorm_mean;
                dval -= norm_bti * dnorm_norm_mean;
                dval *= rstd_val;
                dinput_offset[j] += dval;
            }
        }
    }
}
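A minimal host-side usage sketch (not from the original post; the sizes and buffer names are hypothetical). Two points are worth noting from the signature: mean/rstd are per-row statistics of shape [batch * seq_len] saved by the forward pass, and the function accumulates with +=, so the gradient buffers must be zero-initialized:

#include <vector>

void run_layernorm_backward_cpu_example() {
    // hypothetical problem size, matching the benchmark in section 3
    const int batch = 16, seq_len = 64, hidden_dim = 2048;
    const int rows = batch * seq_len;
    std::vector<float> input(rows * hidden_dim), doutput(rows * hidden_dim), weight(hidden_dim);
    std::vector<float> mean(rows), rstd(rows);          // per-row statistics saved by the forward pass
    std::vector<float> dinput(rows * hidden_dim, 0.0f); // gradient buffers must start at zero,
    std::vector<float> dweight(hidden_dim, 0.0f);       // because the function accumulates with +=
    std::vector<float> dbias(hidden_dim, 0.0f);
    // ... fill input/doutput/weight and run the forward pass to obtain mean/rstd ...
    layernorm_backward_cpu<float, float>(dinput.data(), dweight.data(), dbias.data(),
                                         doutput.data(), input.data(), weight.data(),
                                         mean.data(), rstd.data(), batch, seq_len, hidden_dim);
}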

2. layernorm_bwd CUDA optimization methods and implementations

2.1 layernorm_bwd_v1

  • Approach: in v1 each thread handles one row, i.e. there are batch*seq_len threads in total and each thread loops over the hidden_dim elements of its row.
template<typename T, typename T_ACC>
__global__ void layernorm_backward_kernel1(T* dinput, T* dweight, T* dbias, const T* doutput,
        T* input, T* weight, T_ACC* mean, T_ACC* rstd,
        const int batch, const int seq_len, const int hidden_dim)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < batch * seq_len) {
        const T* doutput_offset = doutput + idx * hidden_dim;
        T* dinput_offset = dinput + idx * hidden_dim;
        const T* input_offset = input + idx * hidden_dim;
        const T_ACC mean_val = mean[idx];
        const T_ACC rstd_val = rstd[idx];
        T dnorm_mean = 0.0f;
        T dnorm_norm_mean = 0.0f;
        for (int i = 0; i < hidden_dim; i++) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            dnorm_mean += dnorm_i;
            dnorm_norm_mean += dnorm_i * norm_bti;
        }
        dnorm_mean = dnorm_mean / static_cast<T>(hidden_dim);
        dnorm_norm_mean = dnorm_norm_mean / static_cast<T>(hidden_dim);
        for (int i = 0; i < hidden_dim; i++) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            // gradient to bias
            atomicAdd(&(dbias[i]), doutput_offset[i]);
            // gradient to weight
            atomicAdd(&(dweight[i]), norm_bti * doutput_offset[i]);
            // gradient to input
            T dval = 0.0f;
            dval += dnorm_i;
            dval -= dnorm_mean;
            dval -= norm_bti * dnorm_norm_mean;
            dval *= rstd_val;
            dinput_offset[i] += dval;
        }
    }
}
	dim3 block(256, 1);
	dim3 grid((batch * seq_len + block.x - 1) / block.x, 1);  // round up so every row gets a thread
	util::print_cuda_cfg(grid, block);
	layernorm_backward_kernel1<T, T_ACC><<<grid, block>>>(dinput_gpu, dweight_gpu, dbias_gpu, doutput_gpu,
	        input_gpu, weight_gpu, mean_gpu, rstd_gpu, batch, seq_len, hidden_dim);

2.2 layernorm_bwd_v2

  • Approach: in v2 each warp handles one row, i.e. there are batch*seq_len warps in total and each warp loops over the hidden_dim elements of its row; inside the warp, the two row-wise sums are reduced with warp shuffle instructions.
template <typename T>
__device__ T warpReduceSum(T val) {
#pragma unroll
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        val += __shfl_xor_sync(0xFFFFFFFF, val, offset);
    }
    return val;
}

template<typename T, typename T_ACC>
__global__ void layernorm_backward_kernel2(T* dinput, T* dweight, T* dbias, const T* doutput,
        T* input, T* weight, T_ACC* mean, T_ACC* rstd,
        const int batch, const int seq_len, const int hidden_dim)
{
    int tx = threadIdx.x;
    int by = blockIdx.y;
    if (by < batch * seq_len) {
        const T* doutput_offset = doutput + by * hidden_dim;
        T* dinput_offset = dinput + by * hidden_dim;
        const T* input_offset = input + by * hidden_dim;
        const T_ACC mean_val = mean[by];
        const T_ACC rstd_val = rstd[by];
        T dnorm_mean = 0.0f;
        T dnorm_norm_mean = 0.0f;
        for (int i = tx; i < hidden_dim; i += blockDim.x) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            dnorm_mean += dnorm_i;
            dnorm_norm_mean += dnorm_i * norm_bti;
        }
        dnorm_mean = warpReduceSum<T>(dnorm_mean);
        dnorm_norm_mean = warpReduceSum<T>(dnorm_norm_mean);
        dnorm_mean = dnorm_mean / static_cast<T>(hidden_dim);
        dnorm_norm_mean = dnorm_norm_mean / static_cast<T>(hidden_dim);
        for (int i = tx; i < hidden_dim; i += blockDim.x) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            // gradient to bias
            atomicAdd(&(dbias[i]), doutput_offset[i]);
            // gradient to weight
            atomicAdd(&(dweight[i]), norm_bti * doutput_offset[i]);
            // gradient to input
            T dval = 0.0f;
            dval += dnorm_i;
            dval -= dnorm_mean;
            dval -= norm_bti * dnorm_norm_mean;
            dval *= rstd_val;
            dinput_offset[i] += dval;
        }
    }
}
	dim3 block(32, 1);
	dim3 grid(1, batch * seq_len);
	layernorm_backward_kernel2<T, T_ACC><<<grid, block>>>(dinput_gpu, dweight_gpu, dbias_gpu, doutput_gpu,
	        input_gpu, weight_gpu, mean_gpu, rstd_gpu, batch, seq_len, hidden_dim);

2.3 layernorm_bwd_v3

  • Approach: like v2, one row is still handled by 32 threads, but this version stages doutput in shared memory (smem) to avoid repeated accesses to global memory.
template <typename T>
__device__ T warpReduceSum(T val) {
#pragma unroll
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        val += __shfl_xor_sync(0xFFFFFFFF, val, offset);
    }
    return val;
}

template<typename T, typename T_ACC>
__global__ void layernorm_backward_kernel3(T* dinput, T* dweight, T* dbias, const T* doutput,
        T* input, T* weight, T_ACC* mean, T_ACC* rstd,
        const int batch, const int seq_len, const int hidden_dim)
{
    int tx = threadIdx.x;
    int by = blockIdx.y;
    extern __shared__ unsigned char tmp_smem[];
    T *smem = reinterpret_cast<T *>(tmp_smem);
    if (by < batch * seq_len) {
        const T* doutput_offset = doutput + by * hidden_dim;
        T* dinput_offset = dinput + by * hidden_dim;
        const T* input_offset = input + by * hidden_dim;
        const T_ACC mean_val = mean[by];
        const T_ACC rstd_val = rstd[by];
        T dnorm_mean = 0.0f;
        T dnorm_norm_mean = 0.0f;
        for (int i = tx; i < hidden_dim; i += blockDim.x) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            dnorm_mean += dnorm_i;
            dnorm_norm_mean += dnorm_i * norm_bti;
        }
        dnorm_mean = warpReduceSum<T>(dnorm_mean);
        dnorm_norm_mean = warpReduceSum<T>(dnorm_norm_mean);
        dnorm_mean = dnorm_mean / static_cast<T>(hidden_dim);
        dnorm_norm_mean = dnorm_norm_mean / static_cast<T>(hidden_dim);
        for (int i = tx; i < hidden_dim; i += blockDim.x) {
            smem[tx] = doutput_offset[i];
            __syncthreads();
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * smem[tx];
            // gradient to bias
            atomicAdd(&(dbias[i]), smem[tx]);
            // gradient to weight
            atomicAdd(&(dweight[i]), norm_bti * smem[tx]);
            // gradient to input
            T dval = 0.0f;
            dval += dnorm_i;
            dval -= dnorm_mean;
            dval -= norm_bti * dnorm_norm_mean;
            dval *= rstd_val;
            dinput_offset[i] += dval;
        }
    }
}
	dim3 block(32, 1);
	dim3 grid(1, batch * seq_len);
	size_t smem_size = sizeof(T) * block.x;
	layernorm_backward_kernel3<T, T_ACC><<<grid, block, smem_size>>>(dinput_gpu, dweight_gpu, dbias_gpu, doutput_gpu,
	        input_gpu, weight_gpu, mean_gpu, rstd_gpu, batch, seq_len, hidden_dim);

2.4 layernorm_bwd_v4

  • Approach: building on v3, v4 uses 1024 threads to loop over one row, replacing the warp-level reduction with a block-level reduction (blockReduceSum).
// WARP_SIZE is assumed to be a macro defined elsewhere in the original source (32 on NVIDIA GPUs)
template <typename T>
__device__ T warpReduceSum(T val) {
#pragma unroll
    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
        val += __shfl_xor_sync(0xFFFFFFFF, val, offset);
    }
    return val;
}

template<typename T>
__device__ __inline__ T blockReduceSum(T val) {
    __shared__ T shared[WARP_SIZE];
    __shared__ T ret;
    int warp_id = threadIdx.x / WARP_SIZE;
    int lane_id = threadIdx.x % WARP_SIZE;
    val = warpReduceSum(val);
    if (lane_id == 0) {
        shared[warp_id] = val;
    }
    __syncthreads();
    val = (threadIdx.x < WARP_SIZE) ? shared[threadIdx.x] : (T)(0.0f);
    val = warpReduceSum(val);
    if (threadIdx.x == 0) {
        ret = val;
    }
    __syncthreads();
    return ret;
}

template<typename T, typename T_ACC>
__global__ void layernorm_backward_kernel4(T* dinput, T* dweight, T* dbias, const T* doutput,
        T* input, T* weight, T_ACC* mean, T_ACC* rstd,
        const int batch, const int seq_len, const int hidden_dim)
{
    int tx = threadIdx.x;
    int by = blockIdx.y;
    extern __shared__ unsigned char tmp_smem[];
    T *smem = reinterpret_cast<T *>(tmp_smem);
    if (by < batch * seq_len) {
        const T* doutput_offset = doutput + by * hidden_dim;
        T* dinput_offset = dinput + by * hidden_dim;
        const T* input_offset = input + by * hidden_dim;
        const T_ACC mean_val = mean[by];
        const T_ACC rstd_val = rstd[by];
        T dnorm_mean = 0.0f;
        T dnorm_norm_mean = 0.0f;
        for (int i = tx; i < hidden_dim; i += blockDim.x) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            dnorm_mean += dnorm_i;
            dnorm_norm_mean += dnorm_i * norm_bti;
        }
        dnorm_mean = blockReduceSum<T>(dnorm_mean);
        dnorm_norm_mean = blockReduceSum<T>(dnorm_norm_mean);
        dnorm_mean = dnorm_mean / static_cast<T>(hidden_dim);
        dnorm_norm_mean = dnorm_norm_mean / static_cast<T>(hidden_dim);
        for (int i = tx; i < hidden_dim; i += blockDim.x) {
            smem[tx] = doutput_offset[i];
            __syncthreads();
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * smem[tx];
            // gradient to bias
            atomicAdd(&(dbias[i]), smem[tx]);
            // gradient to weight
            atomicAdd(&(dweight[i]), norm_bti * smem[tx]);
            // gradient to input
            T dval = 0.0f;
            dval += dnorm_i;
            dval -= dnorm_mean;
            dval -= norm_bti * dnorm_norm_mean;
            dval *= rstd_val;
            dinput_offset[i] += dval;
        }
    }
}
 	dim3 block(1024, 1);
 	dim3 grid(1, batch * seq_len);
 	size_t smem_size = sizeof(T) * block.x;
 	util::print_cuda_cfg(grid, block);
 	layernorm_backward_kernel4<T, T_ACC><<<grid, block, smem_size>>>(dinput_gpu, dweight_gpu, dbias_gpu, doutput_gpu,
 	        input_gpu, weight_gpu, mean_gpu, rstd_gpu, batch, seq_len, hidden_dim);

2.5 Further layernorm_bwd optimizations

The performance bottleneck of v4 is the atomicAdd accumulation into dbias and dweight: every memory location of dbias and dweight is updated serially by batch * seq_len threads, which is expensive. Instead, a block(1024, 1) can process several rows: each block first accumulates, in registers, the smem[tx] and norm_bti * smem[tx] contributions of the rows it is responsible for, and only then adds these register-resident partial sums with atomicAdd. This reduces the number of threads that issue atomicAdd, cuts the serialized work, and improves performance. A sketch of this idea is given below.
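A minimal sketch of this idea (not part of the original post): it reuses blockReduceSum from the v4 code; the kernel name layernorm_backward_kernel5 and the tuning parameters ROWS_PER_BLOCK and COLS_PER_THREAD are hypothetical, and the sketch assumes hidden_dim <= blockDim.x * COLS_PER_THREAD.

template<typename T, typename T_ACC, int COLS_PER_THREAD = 2, int ROWS_PER_BLOCK = 16>
__global__ void layernorm_backward_kernel5(T* dinput, T* dweight, T* dbias, const T* doutput,
        const T* input, const T* weight, const T_ACC* mean, const T_ACC* rstd,
        const int batch, const int seq_len, const int hidden_dim)
{
    const int tx = threadIdx.x;
    const int row_begin = blockIdx.y * ROWS_PER_BLOCK;
    const int row_end = min(row_begin + ROWS_PER_BLOCK, batch * seq_len);

    // per-thread register accumulators, one slot per column this thread owns
    T dbias_acc[COLS_PER_THREAD];
    T dweight_acc[COLS_PER_THREAD];
    for (int c = 0; c < COLS_PER_THREAD; c++) { dbias_acc[c] = (T)0.0f; dweight_acc[c] = (T)0.0f; }

    for (int row = row_begin; row < row_end; row++) {
        const T* doutput_offset = doutput + row * hidden_dim;
        const T* input_offset = input + row * hidden_dim;
        T* dinput_offset = dinput + row * hidden_dim;
        const T_ACC mean_val = mean[row];
        const T_ACC rstd_val = rstd[row];

        // same two row-wise reductions as in kernel4
        T dnorm_mean = 0.0f;
        T dnorm_norm_mean = 0.0f;
        for (int i = tx; i < hidden_dim; i += blockDim.x) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            dnorm_mean += dnorm_i;
            dnorm_norm_mean += dnorm_i * norm_bti;
        }
        dnorm_mean = blockReduceSum<T>(dnorm_mean) / static_cast<T>(hidden_dim);
        dnorm_norm_mean = blockReduceSum<T>(dnorm_norm_mean) / static_cast<T>(hidden_dim);

        for (int i = tx, c = 0; i < hidden_dim; i += blockDim.x, c++) {
            T norm_bti = (input_offset[i] - static_cast<T>(mean_val)) * static_cast<T>(rstd_val);
            T dnorm_i = weight[i] * doutput_offset[i];
            // accumulate dbias/dweight contributions in registers instead of one atomicAdd per row
            dbias_acc[c] += doutput_offset[i];
            dweight_acc[c] += norm_bti * doutput_offset[i];
            // gradient to input is still written out row by row
            T dval = dnorm_i - dnorm_mean - norm_bti * dnorm_norm_mean;
            dinput_offset[i] += dval * rstd_val;
        }
    }

    // one atomicAdd per column per block (instead of one per column per row)
    for (int i = tx, c = 0; i < hidden_dim; i += blockDim.x, c++) {
        atomicAdd(&(dbias[i]), dbias_acc[c]);
        atomicAdd(&(dweight[i]), dweight_acc[c]);
    }
}

It would be launched with dim3 block(1024, 1) and dim3 grid(1, (batch * seq_len + ROWS_PER_BLOCK - 1) / ROWS_PER_BLOCK). For the benchmark shape in section 3 (batch * seq_len = 1024) and ROWS_PER_BLOCK = 16, each dbias/dweight element then receives 64 atomicAdds instead of 1024.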

3. Performance comparison of the layernorm_bwd versions

Data type and problem size: FP32, batch = 16, seq_len = 64, hidden_dim = 2048
Hardware: A100-SXM

layernorm_bwd version | cycle
layernorm_bwd_v1 | 7482424
layernorm_bwd_v2 | 251740
layernorm_bwd_v3 | 253976
layernorm_bwd_v4 | 98369

References

1. https://zhuanlan.zhihu.com/p/694974164 (layernorm CUDA code implementation)
2. https://www.jianshu.com/p/db89d62e1974 (layernorm backward formula derivation)