当前位置: 首页 > news >正文

天数智芯智铠100性能测试

1、背景介绍

最近新做了一个GPU项目,采用飞腾D3000+MR100组成GPU模块进行运算,MR100就是天数智芯智铠100的产品子卡。这里采用FFT运算对MR100芯片进行测试。

2、测试代码

代码采用cuda编程,和C++类似,如下:

#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
#include <cufft.h>// clang++ GPU_IFFT_test.cu -lcudart -lcufft -L/usr/local/corex/lib64 --cuda-gpu-arch=ivcore11 -o GPU_IFFT_testvoid testFFTandiFFT(int dataSize, int numTests) 
{const int batchSize = 1;cudaSetDevice(0);  // 选择要使用的GPU设备cufftComplex* d_data;cufftComplex* h_data;cufftHandle forwardPlan, inversePlan;h_data = (cufftComplex*)malloc(dataSize * batchSize * sizeof(cufftComplex));cudaMalloc((void**)&d_data, dataSize * batchSize * sizeof(cufftComplex));// 初始化数据for (int i = 0; i < dataSize * batchSize; ++i) {h_data[i].x = static_cast<float>(rand()) / 256;h_data[i].y = 0.0f;}// 创建FFT计划cufftPlan1d(&forwardPlan, dataSize, CUFFT_C2C, batchSize);cufftPlan1d(&inversePlan, dataSize, CUFFT_C2C, batchSize);// 预热for(int i = 0; i < 10; i++){cufftExecC2C(forwardPlan, d_data, d_data, CUFFT_FORWARD);}// 执行FFT和计时float totalForwardTime = 0.0f;for (int test = 0; test < numTests; ++test) {cudaMemcpy(d_data, h_data, dataSize * batchSize * sizeof(cufftComplex), cudaMemcpyHostToDevice);cudaEvent_t start, stop;cudaEventCreate(&start);cudaEventCreate(&stop);cudaEventRecord(start);cufftExecC2C(forwardPlan, d_data, d_data, CUFFT_FORWARD);cudaEventRecord(stop);cudaEventSynchronize(stop);float elapsedTime = 0.0f;cudaEventElapsedTime(&elapsedTime, start, stop);totalForwardTime += elapsedTime;cudaMemcpy(h_data, d_data, dataSize * batchSize * sizeof(cufftComplex), cudaMemcpyDeviceToHost);}// 计算FFT平均时间float averageForwardTime = totalForwardTime / numTests;std::cout << "Data size: " << dataSize << std::endl;std::cout << "Average Time for FFT: " << averageForwardTime << " ms" << " total times: " << numTests << std::endl;// 清理内存free(h_data);cudaFree(d_data);cufftDestroy(forwardPlan);cufftDestroy(inversePlan);
}int main() 
{const int minDataSize = 64;  // 2^6const int maxDataSize = 16777216;  // 2^24const int numTests = 1000; // Number of tests to run for each data sizefor (int dataSize = minDataSize; dataSize <= maxDataSize; dataSize *= 2) {testFFTandiFFT(dataSize, numTests);}return 0;
}

代码文件命名为GPU_IFFT_test.cu

安装好天数智芯的驱动和cuda环境,编译命令为:

clang++ GPU_IFFT_test.cu -lcudart -lcufft -L/usr/local/corex/lib64 --cuda-gpu-arch=ivcore11 -o GPU_IFFT_test

这样就可以编译出可执行文件了

3、测试结果

这里列出三种测试结果,前两种分别是MR100在不降频的模式下的结果以及降频模式下的结果,最后一列为D3000上运行FFT的结果,作为对比

序号FFT点数大小耗时(未降频1500M)耗时(降频1000M)D3000
16412.3us15.6us
212812.5us15.6us
325614.7us18.4us
451213.5us17.6us
5102413.6us17.7us11.690 us
6204814.7us19.4us24.915 us
7409614.9us19.7us54.451 us
8819216.2us21.2us118.238 us
91638420.2us27.8us293.759 us
103276825.5us33.5us707.804 us
116553626.6us33.9us1459.291 us
1213107228.1us35.2us3094.575 us
1326214432.3us42.6us7873.657 us
1452428849.5us69.1us18128.254 us
15104857691.2us131.1us42575.336 us
162097152270.7us360.5us116483.516 us
174194304680.6us867.9us
1883886082687.7us3640.8us
19167772166778.4us9036.9us

从上述表格可以看出,在FFT点数超过1K点后,GPU的FFT性能明显强于CPU。虽然D3000主频是2.5G,比GPU要高,但GPU明显更适合进行FFT这种并发性高的计算。

附:D3000上面运行的FFT代码如下,这里要用到飞腾的VSIPL函数库

#include "vsip.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <math.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <sys/unistd.h>#define MAX_N 4194304
#define VECTOR_NUM 50//views of blocks
vsip_cvview_f *view_x[VECTOR_NUM];
vsip_cvview_f *view_y[VECTOR_NUM];
vsip_cvview_f *view_z[VECTOR_NUM];
vsip_vview_f  *view_mag[VECTOR_NUM];
vsip_vview_f  *view_log[VECTOR_NUM];//user data
vsip_scalar_f *data_x;
vsip_scalar_f *data_y;
vsip_scalar_f *data_z;
vsip_scalar_f *data_mag;
vsip_scalar_f *data_log;//out-of-place fft / ifft plans
vsip_fft_f      *fft_op_plan;
vsip_fft_f      *ifft_op_plan;//in-place fft / ifft plans
vsip_fft_f      *fft_ip_plan;
vsip_fft_f      *ifft_ip_plan;//blocks of user data
vsip_cblock_f *block_x;
vsip_cblock_f *block_y;
vsip_cblock_f *block_z;
vsip_block_f  *block_mag;
vsip_block_f  *block_log;//vsip_randstate 	*state;int     place_mode  =1;//0: in-place; 1:out-of-place
int     test_mode   =1;//0:correction testing;!0:performance testingint 	begin_N		=32*1024;
int 	end_N		=512*1024;
//int 	begin_N		=1*1024;
//int 	end_N		=512*1024;
int 	begin_np	=19;
int 	end_np		=19;int     num_thread	=1;
int 	repeat		=1000;#define TEST_ARGS "pvN:r:n:T:h?"
void scan_test_args(int argc, char *argv[])
{int c;while ((c = getopt(argc, argv, TEST_ARGS)) != EOF){switch(c){case 'p':place_mode=0;break;case 'v':test_mode=0;break;case 'N':sscanf(optarg,"%d:%d",&begin_N,&end_N);break;case 'r':repeat=atoi(optarg);break;case 'n':sscanf(optarg,"%d:%d",&begin_np,&end_np);begin_N=(int)pow(2,(double)begin_np);end_N=(int)pow(2,(double)end_np);break;case 'T':num_thread=atoi(optarg);break;case 'h':case '?':default :printf("Unknown option:");exit(1);break;}}
}void initialize_test_data(int N, int vector_num)
{srand(0);int i,j;for(j=0;j<vector_num;j++){for(i=0;i<N;i++){data_x[2*i+0+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;data_x[2*i+1+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;data_y[2*i+0+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;data_y[2*i+1+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;}}
}void initialize(int N, int vector_num)
{vsip_init(NULL);data_x = malloc(2*vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_x==NULL){printf("malloc failed\n");}memset(data_x,0,2*vector_num*MAX_N);block_x = vsip_cblockbind_f(data_x, NULL, MAX_N*vector_num, VSIP_MEM_NONE);vsip_cblockadmit_f(block_x, 1);data_y= malloc(2*vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_y==NULL){printf("malloc failed\n");}memset(data_y,0,2*vector_num*MAX_N);block_y = vsip_cblockbind_f(data_y, NULL, MAX_N*vector_num, VSIP_MEM_NONE);vsip_cblockadmit_f(block_y, 1);initialize_test_data(N,vector_num);data_z = malloc(2*vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_z==NULL){printf("malloc failed\n");}memset(data_z,0,2*vector_num*MAX_N);block_z = vsip_cblockbind_f(data_z, NULL, MAX_N*vector_num, VSIP_MEM_NONE);vsip_cblockadmit_f(block_z, 1);data_mag = malloc(vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_mag==NULL){printf("malloc failed\n");}memset(data_mag,0, vector_num*MAX_N);block_mag = vsip_blockbind_f(data_mag, MAX_N*vector_num, VSIP_MEM_NONE);vsip_blockadmit_f(block_mag, 1);data_log = malloc(vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_log==NULL){printf("malloc failed\n");}memset(data_log,0, vector_num*MAX_N);block_log = vsip_blockbind_f(data_log, MAX_N*vector_num, VSIP_MEM_NONE);vsip_blockadmit_f(block_log, 1);int i;for(i=0;i<vector_num;i++){view_x[i] = vsip_cvbind_f(block_x, i*MAX_N, 1, N);view_y[i] = vsip_cvbind_f(block_y, i*MAX_N, 1, N);view_z[i] = vsip_cvbind_f(block_z, i*MAX_N, 1, N);view_mag[i] = vsip_vbind_f(block_mag, i*MAX_N, 1, N);view_log[i] = vsip_vbind_f(block_log, i*MAX_N, 1, N);}fft_op_plan = vsip_ccfftop_create_f(N,1.0,0,1,1);if(fft_op_plan==NULL){printf("create fft plan failed\n");}ifft_op_plan= vsip_ccfftop_create_f(N,1.0/N,1,1,1);if(ifft_op_plan==NULL){printf("create ifft plan failed\n");}fft_ip_plan = vsip_ccfftip_create_f(N,1.0,0,1,1);if(fft_ip_plan==NULL){printf("create fft plan failed\n");}ifft_ip_plan= vsip_ccfftip_create_f(N,1.0/N,1,1,1);if(ifft_ip_plan==NULL){printf("create ifft plan failed\n");}
}void finalize(int vector_num)
{vsip_scalar_f *ptrx1,*ptrx2;vsip_scalar_f *ptry1,*ptry2;vsip_scalar_f *ptrz1,*ptrz2;vsip_cblockrelease_f(block_x, 1, &ptrx1, &ptrx2);vsip_cblockrelease_f(block_y, 1, &ptry1, &ptry2);vsip_cblockrelease_f(block_z, 1, &ptrz1, &ptrz2);int i;int s;vsip_cblock_f *ptrb_x;vsip_cblock_f *ptrb_y;vsip_cblock_f *ptrb_z;ptrb_x = vsip_cvdestroy_f(view_x[0]);ptrb_y = vsip_cvdestroy_f(view_y[0]);ptrb_z = vsip_cvdestroy_f(view_z[0]);for(i=1;i<vector_num;i++){vsip_cvdestroy_f(view_x[i]);vsip_cvdestroy_f(view_y[i]);vsip_cvdestroy_f(view_z[i]);}vsip_cblockfind_f(ptrb_x, &ptrx1, &ptrx2);vsip_cblockfind_f(ptrb_y, &ptry1, &ptry2);vsip_cblockfind_f(ptrb_z, &ptrz1, &ptrz2);vsip_cblockdestroy_f(block_x);vsip_cblockdestroy_f(block_y);vsip_cblockdestroy_f(block_z);s = vsip_fft_destroy_f(ifft_op_plan);if(s!=0){printf("ifft_destroy ok\n");}s = vsip_fft_destroy_f(fft_op_plan);if(s!=0){printf("fft_destroy failed\n");}s = vsip_fft_destroy_f(ifft_ip_plan);if(s!=0){printf("ifft_destroy ok\n");}s = vsip_fft_destroy_f(fft_ip_plan);if(s!=0){printf("fft_destroy failed\n");}vsip_finalize((void *)0);free(data_x);free(data_y);free(data_z);}int main(int argc, char *argv[])
{scan_test_args(argc, argv);printf("\n\n--------------------------------------start caculating:------------------------------------\n");printf("num_thread = %i, repeat = %i \n\n", num_thread, repeat );int N,repeat_i,i;int j;double time_used_i;double time_used = 0;struct timeval tv_begin,tv_end;int length = 1024;printf("\n\n--------------------------------------FFT TEST-------------------------------------\n");for(j=0; j<12; j++){length = 1024 * (1<<j);initialize( length, VECTOR_NUM);
//FFT: out-of-placetime_used_i = 0;time_used = 0;for (repeat_i = 0;repeat_i<repeat;repeat_i++){gettimeofday(&tv_begin,0);vsip_ccfftop_f( fft_op_plan,view_x[ repeat_i % VECTOR_NUM ],view_y[ repeat_i % VECTOR_NUM ] );gettimeofday(&tv_end,0);time_used_i=(double)(tv_end.tv_sec-tv_begin.tv_sec)*1000000+(double)(tv_end.tv_usec-tv_begin.tv_usec);time_used += time_used_i/repeat;}printf("size:\t%10i\tfft-op\ttime_used:\t%16.3f us\n", length, time_used);finalize(VECTOR_NUM);}printf("\n--------------------------------------end caculating:------------------------------------\n\n");return 0;
}

http://www.dtcms.com/a/404022.html

相关文章:

  • datawhale玩转通义四大新模型 202509 第6次作业
  • 响应式品牌网站wordpress图片文字
  • 电子商城建设网站品牌建设实施纲要
  • 责任链设计模式详解
  • 学习2025.9.24
  • 视频 播放网站怎么做黄石企业网站建设
  • C#图像处理五大核心误区与解决方案‌
  • 珠海做网站公司哪家好国家住房与城乡建设部网站
  • 织梦做网站要多长时间免费那个网站
  • 新桥网站建设济南小程序网站制作
  • 一文读懂循环神经网络(RNN):原理、局限与LSTM解决方案
  • 以绿色为主的网站wordpress 单 中
  • Linux 企业级备份体系实战:cron/anacron/restic/rclone 对比与脚本总结
  • 做的网站太大怎么办神华科技 网站建设
  • 软件公司 网站建设费分录如何创建网站主页
  • 北京城乡和住房建设部网站济源新站seo关键词排名推广
  • 网站页脚的制作西部数码上传网站
  • 素材网站下载网店设计与装修
  • 《2025年AI产业发展十大趋势报告》六十七
  • 花生壳做网站需要备案西安制作标书的公司
  • 笔记本copilot按键映射为右Ctrl键的方法
  • 网站建设与维护高职网易企业邮箱登入路口
  • 廊坊网站建设制作制作企业网站html
  • JavaScript——DOM补充
  • ICPC Central Russia Regional Contest, 2024 补题
  • 【系统架构设计(36)】网络规划与设计
  • SQL 注入风险与解决方案实战解析
  • 企业如何做网站推广出名的包装设计
  • 【STM32】位带操作
  • 供需网站开发做网站怎么发布