天数智芯智铠100性能测试
1、背景介绍
最近新做了一个GPU项目,采用飞腾D3000+MR100组成GPU模块进行运算,MR100就是天数智芯智铠100的产品子卡。这里采用FFT运算对MR100芯片进行测试。
2、测试代码
代码采用cuda编程,和C++类似,如下:
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
#include <cufft.h>// clang++ GPU_IFFT_test.cu -lcudart -lcufft -L/usr/local/corex/lib64 --cuda-gpu-arch=ivcore11 -o GPU_IFFT_testvoid testFFTandiFFT(int dataSize, int numTests)
{const int batchSize = 1;cudaSetDevice(0); // 选择要使用的GPU设备cufftComplex* d_data;cufftComplex* h_data;cufftHandle forwardPlan, inversePlan;h_data = (cufftComplex*)malloc(dataSize * batchSize * sizeof(cufftComplex));cudaMalloc((void**)&d_data, dataSize * batchSize * sizeof(cufftComplex));// 初始化数据for (int i = 0; i < dataSize * batchSize; ++i) {h_data[i].x = static_cast<float>(rand()) / 256;h_data[i].y = 0.0f;}// 创建FFT计划cufftPlan1d(&forwardPlan, dataSize, CUFFT_C2C, batchSize);cufftPlan1d(&inversePlan, dataSize, CUFFT_C2C, batchSize);// 预热for(int i = 0; i < 10; i++){cufftExecC2C(forwardPlan, d_data, d_data, CUFFT_FORWARD);}// 执行FFT和计时float totalForwardTime = 0.0f;for (int test = 0; test < numTests; ++test) {cudaMemcpy(d_data, h_data, dataSize * batchSize * sizeof(cufftComplex), cudaMemcpyHostToDevice);cudaEvent_t start, stop;cudaEventCreate(&start);cudaEventCreate(&stop);cudaEventRecord(start);cufftExecC2C(forwardPlan, d_data, d_data, CUFFT_FORWARD);cudaEventRecord(stop);cudaEventSynchronize(stop);float elapsedTime = 0.0f;cudaEventElapsedTime(&elapsedTime, start, stop);totalForwardTime += elapsedTime;cudaMemcpy(h_data, d_data, dataSize * batchSize * sizeof(cufftComplex), cudaMemcpyDeviceToHost);}// 计算FFT平均时间float averageForwardTime = totalForwardTime / numTests;std::cout << "Data size: " << dataSize << std::endl;std::cout << "Average Time for FFT: " << averageForwardTime << " ms" << " total times: " << numTests << std::endl;// 清理内存free(h_data);cudaFree(d_data);cufftDestroy(forwardPlan);cufftDestroy(inversePlan);
}int main()
{const int minDataSize = 64; // 2^6const int maxDataSize = 16777216; // 2^24const int numTests = 1000; // Number of tests to run for each data sizefor (int dataSize = minDataSize; dataSize <= maxDataSize; dataSize *= 2) {testFFTandiFFT(dataSize, numTests);}return 0;
}
代码文件命名为GPU_IFFT_test.cu
安装好天数智芯的驱动和cuda环境,编译命令为:
clang++ GPU_IFFT_test.cu -lcudart -lcufft -L/usr/local/corex/lib64 --cuda-gpu-arch=ivcore11 -o GPU_IFFT_test
这样就可以编译出可执行文件了
3、测试结果
这里列出三种测试结果,前两种分别是MR100在不降频的模式下的结果以及降频模式下的结果,最后一列为D3000上运行FFT的结果,作为对比
序号 | FFT点数大小 | 耗时(未降频1500M) | 耗时(降频1000M) | D3000 |
1 | 64 | 12.3us | 15.6us | |
2 | 128 | 12.5us | 15.6us | |
3 | 256 | 14.7us | 18.4us | |
4 | 512 | 13.5us | 17.6us | |
5 | 1024 | 13.6us | 17.7us | 11.690 us |
6 | 2048 | 14.7us | 19.4us | 24.915 us |
7 | 4096 | 14.9us | 19.7us | 54.451 us |
8 | 8192 | 16.2us | 21.2us | 118.238 us |
9 | 16384 | 20.2us | 27.8us | 293.759 us |
10 | 32768 | 25.5us | 33.5us | 707.804 us |
11 | 65536 | 26.6us | 33.9us | 1459.291 us |
12 | 131072 | 28.1us | 35.2us | 3094.575 us |
13 | 262144 | 32.3us | 42.6us | 7873.657 us |
14 | 524288 | 49.5us | 69.1us | 18128.254 us |
15 | 1048576 | 91.2us | 131.1us | 42575.336 us |
16 | 2097152 | 270.7us | 360.5us | 116483.516 us |
17 | 4194304 | 680.6us | 867.9us | |
18 | 8388608 | 2687.7us | 3640.8us | |
19 | 16777216 | 6778.4us | 9036.9us |
从上述表格可以看出,在FFT点数超过1K点后,GPU的FFT性能明显强于CPU。虽然D3000主频是2.5G,比GPU要高,但GPU明显更适合进行FFT这种并发性高的计算。
附:D3000上面运行的FFT代码如下,这里要用到飞腾的VSIPL函数库
#include "vsip.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include <math.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <sys/unistd.h>#define MAX_N 4194304
#define VECTOR_NUM 50//views of blocks
vsip_cvview_f *view_x[VECTOR_NUM];
vsip_cvview_f *view_y[VECTOR_NUM];
vsip_cvview_f *view_z[VECTOR_NUM];
vsip_vview_f *view_mag[VECTOR_NUM];
vsip_vview_f *view_log[VECTOR_NUM];//user data
vsip_scalar_f *data_x;
vsip_scalar_f *data_y;
vsip_scalar_f *data_z;
vsip_scalar_f *data_mag;
vsip_scalar_f *data_log;//out-of-place fft / ifft plans
vsip_fft_f *fft_op_plan;
vsip_fft_f *ifft_op_plan;//in-place fft / ifft plans
vsip_fft_f *fft_ip_plan;
vsip_fft_f *ifft_ip_plan;//blocks of user data
vsip_cblock_f *block_x;
vsip_cblock_f *block_y;
vsip_cblock_f *block_z;
vsip_block_f *block_mag;
vsip_block_f *block_log;//vsip_randstate *state;int place_mode =1;//0: in-place; 1:out-of-place
int test_mode =1;//0:correction testing;!0:performance testingint begin_N =32*1024;
int end_N =512*1024;
//int begin_N =1*1024;
//int end_N =512*1024;
int begin_np =19;
int end_np =19;int num_thread =1;
int repeat =1000;#define TEST_ARGS "pvN:r:n:T:h?"
void scan_test_args(int argc, char *argv[])
{int c;while ((c = getopt(argc, argv, TEST_ARGS)) != EOF){switch(c){case 'p':place_mode=0;break;case 'v':test_mode=0;break;case 'N':sscanf(optarg,"%d:%d",&begin_N,&end_N);break;case 'r':repeat=atoi(optarg);break;case 'n':sscanf(optarg,"%d:%d",&begin_np,&end_np);begin_N=(int)pow(2,(double)begin_np);end_N=(int)pow(2,(double)end_np);break;case 'T':num_thread=atoi(optarg);break;case 'h':case '?':default :printf("Unknown option:");exit(1);break;}}
}void initialize_test_data(int N, int vector_num)
{srand(0);int i,j;for(j=0;j<vector_num;j++){for(i=0;i<N;i++){data_x[2*i+0+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;data_x[2*i+1+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;data_y[2*i+0+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;data_y[2*i+1+2*j*MAX_N] = (float)rand()/(float)RAND_MAX ;}}
}void initialize(int N, int vector_num)
{vsip_init(NULL);data_x = malloc(2*vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_x==NULL){printf("malloc failed\n");}memset(data_x,0,2*vector_num*MAX_N);block_x = vsip_cblockbind_f(data_x, NULL, MAX_N*vector_num, VSIP_MEM_NONE);vsip_cblockadmit_f(block_x, 1);data_y= malloc(2*vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_y==NULL){printf("malloc failed\n");}memset(data_y,0,2*vector_num*MAX_N);block_y = vsip_cblockbind_f(data_y, NULL, MAX_N*vector_num, VSIP_MEM_NONE);vsip_cblockadmit_f(block_y, 1);initialize_test_data(N,vector_num);data_z = malloc(2*vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_z==NULL){printf("malloc failed\n");}memset(data_z,0,2*vector_num*MAX_N);block_z = vsip_cblockbind_f(data_z, NULL, MAX_N*vector_num, VSIP_MEM_NONE);vsip_cblockadmit_f(block_z, 1);data_mag = malloc(vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_mag==NULL){printf("malloc failed\n");}memset(data_mag,0, vector_num*MAX_N);block_mag = vsip_blockbind_f(data_mag, MAX_N*vector_num, VSIP_MEM_NONE);vsip_blockadmit_f(block_mag, 1);data_log = malloc(vector_num*MAX_N*sizeof(vsip_scalar_f));if(data_log==NULL){printf("malloc failed\n");}memset(data_log,0, vector_num*MAX_N);block_log = vsip_blockbind_f(data_log, MAX_N*vector_num, VSIP_MEM_NONE);vsip_blockadmit_f(block_log, 1);int i;for(i=0;i<vector_num;i++){view_x[i] = vsip_cvbind_f(block_x, i*MAX_N, 1, N);view_y[i] = vsip_cvbind_f(block_y, i*MAX_N, 1, N);view_z[i] = vsip_cvbind_f(block_z, i*MAX_N, 1, N);view_mag[i] = vsip_vbind_f(block_mag, i*MAX_N, 1, N);view_log[i] = vsip_vbind_f(block_log, i*MAX_N, 1, N);}fft_op_plan = vsip_ccfftop_create_f(N,1.0,0,1,1);if(fft_op_plan==NULL){printf("create fft plan failed\n");}ifft_op_plan= vsip_ccfftop_create_f(N,1.0/N,1,1,1);if(ifft_op_plan==NULL){printf("create ifft plan failed\n");}fft_ip_plan = vsip_ccfftip_create_f(N,1.0,0,1,1);if(fft_ip_plan==NULL){printf("create fft plan failed\n");}ifft_ip_plan= vsip_ccfftip_create_f(N,1.0/N,1,1,1);if(ifft_ip_plan==NULL){printf("create ifft plan failed\n");}
}void finalize(int vector_num)
{vsip_scalar_f *ptrx1,*ptrx2;vsip_scalar_f *ptry1,*ptry2;vsip_scalar_f *ptrz1,*ptrz2;vsip_cblockrelease_f(block_x, 1, &ptrx1, &ptrx2);vsip_cblockrelease_f(block_y, 1, &ptry1, &ptry2);vsip_cblockrelease_f(block_z, 1, &ptrz1, &ptrz2);int i;int s;vsip_cblock_f *ptrb_x;vsip_cblock_f *ptrb_y;vsip_cblock_f *ptrb_z;ptrb_x = vsip_cvdestroy_f(view_x[0]);ptrb_y = vsip_cvdestroy_f(view_y[0]);ptrb_z = vsip_cvdestroy_f(view_z[0]);for(i=1;i<vector_num;i++){vsip_cvdestroy_f(view_x[i]);vsip_cvdestroy_f(view_y[i]);vsip_cvdestroy_f(view_z[i]);}vsip_cblockfind_f(ptrb_x, &ptrx1, &ptrx2);vsip_cblockfind_f(ptrb_y, &ptry1, &ptry2);vsip_cblockfind_f(ptrb_z, &ptrz1, &ptrz2);vsip_cblockdestroy_f(block_x);vsip_cblockdestroy_f(block_y);vsip_cblockdestroy_f(block_z);s = vsip_fft_destroy_f(ifft_op_plan);if(s!=0){printf("ifft_destroy ok\n");}s = vsip_fft_destroy_f(fft_op_plan);if(s!=0){printf("fft_destroy failed\n");}s = vsip_fft_destroy_f(ifft_ip_plan);if(s!=0){printf("ifft_destroy ok\n");}s = vsip_fft_destroy_f(fft_ip_plan);if(s!=0){printf("fft_destroy failed\n");}vsip_finalize((void *)0);free(data_x);free(data_y);free(data_z);}int main(int argc, char *argv[])
{scan_test_args(argc, argv);printf("\n\n--------------------------------------start caculating:------------------------------------\n");printf("num_thread = %i, repeat = %i \n\n", num_thread, repeat );int N,repeat_i,i;int j;double time_used_i;double time_used = 0;struct timeval tv_begin,tv_end;int length = 1024;printf("\n\n--------------------------------------FFT TEST-------------------------------------\n");for(j=0; j<12; j++){length = 1024 * (1<<j);initialize( length, VECTOR_NUM);
//FFT: out-of-placetime_used_i = 0;time_used = 0;for (repeat_i = 0;repeat_i<repeat;repeat_i++){gettimeofday(&tv_begin,0);vsip_ccfftop_f( fft_op_plan,view_x[ repeat_i % VECTOR_NUM ],view_y[ repeat_i % VECTOR_NUM ] );gettimeofday(&tv_end,0);time_used_i=(double)(tv_end.tv_sec-tv_begin.tv_sec)*1000000+(double)(tv_end.tv_usec-tv_begin.tv_usec);time_used += time_used_i/repeat;}printf("size:\t%10i\tfft-op\ttime_used:\t%16.3f us\n", length, time_used);finalize(VECTOR_NUM);}printf("\n--------------------------------------end caculating:------------------------------------\n\n");return 0;
}