CUDA Learning -- Experiencing GPU Performance
Learning source: 2 CUDA Python--并行计算基础-卷积计算以及共享内存_哔哩哔哩_bilibili
Speed comparison: processing a single image on the CPU vs. the GPU
import cv2
from numba import cuda
import time
import math
@cuda.jit()
def process_gpu(img, channels):
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    # tx and ty are this thread's (row, column) position in the image.
    # cuda.blockIdx.x / cuda.blockIdx.y: this block's x and y coordinates within the grid.
    # cuda.blockDim.x / cuda.blockDim.y: the number of threads per block along x and y.
    # cuda.threadIdx.x / cuda.threadIdx.y: this thread's x and y coordinates within its block.
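    # Worked example (illustrative numbers): with a 16 x 16 block, the thread at
    # blockIdx.x = 2, threadIdx.x = 5 handles row tx = 2 * 16 + 5 = 37.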
    # Guard: the grid may be slightly larger than the image when its dimensions
    # are not multiples of the block size, so skip out-of-range threads.
    if tx >= img.shape[0] or ty >= img.shape[1]:
        return
    for c in range(channels):
        color = img[tx, ty][c] * 2.0 + 30
        if color > 255:
            img[tx, ty][c] = 255
        elif color < 0:
            img[tx, ty][c] = 0
        else:
            img[tx, ty][c] = color
def process_cpu(img, dst):
    rows, cols, channels = img.shape
    for i in range(rows):
        for j in range(cols):
            for c in range(3):
                color = img[i, j][c] * 2.0 + 30
                if color > 255:
                    dst[i, j][c] = 255
                elif color < 0:
                    dst[i, j][c] = 0
                else:
                    dst[i, j][c] = color
if __name__ == "__main__":
    img = cv2.imread('test.jpg')
    rows, cols, channels = img.shape
    # rows: image height (number of rows).
    # cols: image width (number of columns).
    dst_cpu = img.copy()
    dst_gpu = img.copy()

    # CPU version
    start_cpu = time.time()  # timestamp before the CPU run
    process_cpu(img, dst_cpu)
    end_cpu = time.time()
    time_cpu = end_cpu - start_cpu
    # print("cpu process time: " + str(time_cpu))

    # GPU version
    dimg = cuda.to_device(img)  # copy the image data to the GPU
    threadsperblock = (16, 16)  # number of threads in each block
    blockspergrid_x = int(math.ceil(rows / threadsperblock[0]))  # blocks needed along x
    blockspergrid_y = int(math.ceil(cols / threadsperblock[1]))  # blocks needed along y
    # math.ceil returns the smallest integer greater than or equal to its argument
    # (rounding up), so the grid always covers the whole image.
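    # Example (illustrative numbers): for a 1000 x 750 image with 16 x 16 blocks,
    # blockspergrid_x = ceil(1000 / 16) = 63 and blockspergrid_y = ceil(750 / 16) = 47,
    # so the grid contains 63 * 47 blocks of 256 threads each.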
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    cuda.synchronize()
    start_gpu = time.time()
    # Launch the kernel with the grid and block sizes defined above.
    process_gpu[blockspergrid, threadsperblock](dimg, channels)
    cuda.synchronize()
    end_gpu = time.time()
    dst_gpu = dimg.copy_to_host()
    time_gpu = end_gpu - start_gpu

    # Save the results
    cv2.imwrite("result.jpg", dst_cpu)
    cv2.imwrite("result2.jpg", dst_gpu)
    print("GPU process time: " + str(time_gpu))
    print("CPU process time: " + str(time_cpu))
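One caveat about the timing above: the first launch of a @cuda.jit kernel also triggers JIT compilation, so time_gpu includes compile time, not just kernel execution. A minimal sketch of a fairer measurement, assuming the same variables as in the script above, is to do a warm-up launch on a throwaway buffer before starting the timer:

warmup = cuda.to_device(img)  # throwaway copy so the real image is not transformed twice
process_gpu[blockspergrid, threadsperblock](warmup, channels)  # first call compiles the kernel
cuda.synchronize()
start_gpu = time.time()
process_gpu[blockspergrid, threadsperblock](dimg, channels)
cuda.synchronize()
time_gpu = time.time() - start_gpu  # now measures only kernel execution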
Matrix multiplication: comparing the speed of a CPU implementation, a plain GPU implementation, and a GPU implementation using shared memory
from numba import cuda,float32
import numba
import numpy
import math
import time
TPB=16
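# TPB (threads per block) is the tile width: each block contains TPB x TPB = 256
# threads, and the shared-memory kernel below works on TPB x TPB tiles of A and B.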
@numba.jit(nopython=True)
def matmul_cpu(A, B, C):
    for y in range(B.shape[1]):
        for x in range(A.shape[0]):
            tmp = 0
            for k in range(A.shape[1]):
                tmp += A[x, k] * B[k, y]
            C[x, y] = tmp
@cuda.jit
def matmul_gpu(A, B, C):
    row, col = cuda.grid(2)
    # cuda.grid(2) returns this thread's absolute (row, col) position in the 2D grid.
    if row < C.shape[0] and col < C.shape[1]:
        tmp = 0
        for k in range(A.shape[1]):
            tmp += A[row, k] * B[k, col]
        C[row, col] = tmp
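# Note: cuda.grid(2) is just shorthand for the manual index computation used in
# process_gpu earlier; it is equivalent to
#     row = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
#     col = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y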
@cuda.jit
def matmul_shard_men(A, B, C):
    # Per-block shared-memory tiles of A and B, visible to every thread in the block.
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
    x, y = cuda.grid(2)
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    if x >= C.shape[0] or y >= C.shape[1]:
        # Out-of-range threads exit early. This is safe here because the matrix sizes
        # below are exact multiples of TPB, so no thread actually skips cuda.syncthreads().
        return
    tmp = 0
    for i in range(int(A.shape[1] / TPB)):
        # Each thread loads one element of the current A tile and one of the B tile.
        sA[tx, ty] = A[x, ty + i * TPB]
        sB[tx, ty] = B[tx + i * TPB, y]
        cuda.syncthreads()  # wait until the whole tile has been loaded
        for j in range(TPB):
            tmp += sA[tx, j] * sB[j, ty]
        cuda.syncthreads()  # wait before the tile is overwritten in the next iteration
    C[x, y] = tmp
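# Why shared memory helps: in matmul_gpu every thread streams an entire row of A and
# column of B from global memory, while matmul_shard_men loads each 16 x 16 tile into
# shared memory once and reuses it. With the matrices below (A is 3200 x 1600, B is
# 1600 x 3200) the tile loop runs 1600 / 16 = 100 times, so each thread issues about
# 100 * 2 global loads instead of 1600 * 2.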
A = numpy.full((TPB * 200,TPB * 100),3,numpy.float64)
B = numpy.full((TPB * 100,TPB * 200), 4,numpy.float64)
C_cpu = numpy.full((A.shape[0], B.shape[1]), 0, numpy.float64)
print("Start processing in CPU")
start_cpu = time.time()
matmul_cpu(A,B,C_cpu)
end_cpu = time.time()
time_cpu = (end_cpu-start_cpu)
# GPU versions
A_global_men = cuda.to_device(A)
B_global_men = cuda.to_device(B)
C_global_men = cuda.device_array((A.shape[0],B.shape[1]))
C_shared_men = cuda.device_array((A.shape[0],B.shape[1]))
threadsperblock = (TPB,TPB)
blockspergrid_x = int(math.ceil(A.shape[0]/threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1]/threadsperblock[1]))
blockspergrid = (blockspergrid_x,blockspergrid_y)
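# With these shapes the grid is exactly (3200 / 16, 3200 / 16) = (200, 200) blocks,
# so every launched thread maps to a valid element of C.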
print("start processing in GPU")
start_gpu = time.time()
matmul_gpu[blockspergrid,threadsperblock](A_global_men,B_global_men,C_global_men)
cuda.synchronize()
end_gpu = time.time()
time_gpu = (end_gpu-start_gpu)
print("CPU time: " + str(time_cpu))
print("GPU time (global memory): " + str(time_gpu))
C_global_gpu = C_global_men.copy_to_host()
start_gpu_shared = time.time()
matmul_shard_men[blockspergrid,threadsperblock](A_global_men,B_global_men,C_shared_men)
cuda.synchronize()
end_gpu_shared = time.time()
time_gpu_shared = (end_gpu_shared-start_gpu_shared)
print("GPU time (shared memory): " + str(time_gpu_shared))
C_shared_gpu = C_shared_men.copy_to_host()
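# A quick sanity check one might add: all three implementations should produce the
# same result (within floating-point tolerance).
print(numpy.allclose(C_cpu, C_global_gpu))
print(numpy.allclose(C_cpu, C_shared_gpu))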