当前位置：首页 > news >正文

CUDA学习--体验GPU性能

news 2025/7/6 22:59:05

学习来源：2 CUDA Python--并行计算基础-卷积计算以及共享内存_哔哩哔哩_bilibili

处理一张图片时 CPU 与 GPU 的速度对比

import cv2
from numba import  cuda
import time
import math
@cuda.jit()
def process_gpu(img,channels):
    """CUDA kernel: brighten one pixel per thread, in place.

    Each processed channel value ``v`` of the pixel is replaced by
    ``v * 2.0 + 30`` clamped to the range [0, 255].

    Args:
        img: device array indexed as ``img[row, col, channel]``; it is both
            the input and the output of the kernel.
        channels: number of channels to process for each pixel.
    """
    # Global coordinates of this thread: (index of its block in the grid) *
    # (threads per block along that axis) + (index of the thread in its block).
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    # The launch grid is rounded up to whole blocks, so threads in the last
    # row/column of blocks can fall outside the image; they must do nothing
    # instead of touching memory past the end of the array.
    if tx >= img.shape[0] or ty >= img.shape[1]:
        return

    for c in range(channels):
        color = img[tx, ty, c] * 2.0 + 30
        if color > 255:
            img[tx, ty, c] = 255
        elif color < 0:
            img[tx, ty, c] = 0
        else:
            img[tx, ty, c] = color
def process_cpu(img,dst):
    """Brighten ``img`` pixel by pixel on the CPU, writing the result to ``dst``.

    Each channel value ``v`` is mapped to ``v * 2.0 + 30`` and clamped to
    the range [0, 255].

    Args:
        img: source array indexed as ``img[row, col, channel]``; only read.
        dst: destination array with the same shape as ``img``; written in place.
    """
    rows, cols, channels = img.shape
    for i in range(rows):
        for j in range(cols):
            # Iterate over the channels the image actually has rather than a
            # hard-coded 3, so other channel counts are handled as well.
            for c in range(channels):
                color = img[i, j, c] * 2.0 + 30
                if color > 255:
                    dst[i, j, c] = 255
                elif color < 0:
                    dst[i, j, c] = 0
                else:
                    dst[i, j, c] = color



if __name__ == "__main__":
    # Benchmark: apply the same brightening to one image on the CPU and on
    # the GPU, save both results and print the two elapsed times.
    img = cv2.imread('test.jpg')
    # cv2.imread signals failure by returning None instead of raising, so
    # check explicitly to get a readable error.
    if img is None:
        raise FileNotFoundError("cannot read image 'test.jpg'")
    # rows: image height (number of rows); cols: image width (number of columns).
    rows, cols, channels = img.shape
    dst_cpu = img.copy()

    # ---- CPU ----
    start_cpu = time.perf_counter()
    process_cpu(img, dst_cpu)
    time_cpu = time.perf_counter() - start_cpu

    # ---- GPU ----
    threadsperblock = (16, 16)  # threads in each block along x and y
    # Round up so the grid covers the whole image even when its size is not
    # a multiple of the block size.
    blockspergrid_x = int(math.ceil(rows / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(cols / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    # Warm-up launch on a scratch copy: the first call of a Numba kernel
    # triggers JIT compilation, which must not be counted as processing time.
    # A separate copy is needed because the kernel modifies its input in place.
    warmup = cuda.to_device(img)
    process_gpu[blockspergrid, threadsperblock](warmup, channels)
    cuda.synchronize()

    dimg = cuda.to_device(img)  # copy the image to the GPU
    cuda.synchronize()
    start_gpu = time.perf_counter()
    process_gpu[blockspergrid, threadsperblock](dimg, channels)
    # Kernel launches are asynchronous; wait for completion before stopping the clock.
    cuda.synchronize()
    time_gpu = time.perf_counter() - start_gpu
    dst_gpu = dimg.copy_to_host()

    # save
    # NOTE(review): "rusult.jpg" looks like a typo for "result.jpg"; kept so
    # the output path stays unchanged.
    cv2.imwrite("rusult.jpg", dst_cpu)
    cv2.imwrite("result2.jpg", dst_gpu)
    print(time_gpu)
    print(time_cpu)

矩阵乘法：分别用 CPU、GPU 全局内存以及 GPU 共享内存计算的速度对比

from numba import cuda,float32
import numba
import  numpy
import  math
import  time

# Threads per block along each axis; also used as the edge length of the
# square shared-memory tiles in matmul_shard_men.
TPB=16
@numba.jit(nopython=True)
def matmul_cpu(A,B,C):
    """Compute the matrix product of ``A`` and ``B`` into ``C`` with plain loops.

    Args:
        A: left operand, indexed ``A[row, k]``.
        B: right operand, indexed ``B[k, col]``.
        C: output, indexed ``C[row, col]``; overwritten in place.
    """
    n_rows = A.shape[0]
    n_inner = A.shape[1]
    n_cols = B.shape[1]
    for col in range(n_cols):
        for row in range(n_rows):
            # Dot product of row `row` of A with column `col` of B.
            acc = 0
            for k in range(n_inner):
                acc += A[row, k] * B[k, col]
            C[row, col] = acc

@cuda.jit
def matmul_gpu(A,B,C):
    """CUDA kernel: each thread computes one element of ``C = A @ B``.

    Args:
        A: left operand, indexed ``A[row, k]``.
        B: right operand, indexed ``B[k, col]``.
        C: output, indexed ``C[row, col]``; written in place.
    """
    # cuda.grid(2) gives this thread's (row, col) position in the 2-D grid.
    row, col = cuda.grid(2)
    # Threads that land outside C have nothing to compute.
    if row >= C.shape[0] or col >= C.shape[1]:
        return
    acc = 0
    inner = A.shape[1]
    for k in range(inner):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

@cuda.jit
def matmul_shard_men(A,B,C):
    """CUDA kernel: ``C = A @ B`` computed tile by tile through shared memory.

    The inner dimension is walked in chunks of ``TPB``. For each chunk the
    threads of a block cooperatively copy a TPB x TPB tile of ``A`` and of
    ``B`` into shared arrays, then every thread accumulates its partial dot
    product from those tiles.

    The kernel must be launched with a block size of (TPB, TPB), because the
    shared tiles are TPB x TPB and are indexed by the thread indices.
    The tiles are declared float32, so operands are narrowed to float32 when
    they are staged.

    Args:
        A: left operand, indexed ``A[x, k]``.
        B: right operand, indexed ``B[k, y]``.
        C: output, indexed ``C[x, y]``; written in place.
    """
    sA = cuda.shared.array(shape=(TPB,TPB),dtype=float32)
    sB = cuda.shared.array(shape=(TPB,TPB),dtype=float32)
    x,y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    inner = A.shape[1]
    # Ceiling division: keep the last, partially filled tile when the inner
    # dimension is not a multiple of TPB.
    n_tiles = (inner + TPB - 1) // TPB

    # No early return for out-of-range threads: every thread of the block has
    # to reach the syncthreads() barriers below. Such threads stage zeros and
    # skip only the final store.
    tmp = 0.0
    for i in range(n_tiles):
        ka = ty + i*TPB  # column of A this thread stages
        kb = tx + i*TPB  # row of B this thread stages
        # Out-of-range positions are zero-filled so they add nothing to the sum.
        if x < A.shape[0] and ka < inner:
            sA[tx,ty] = A[x,ka]
        else:
            sA[tx,ty] = 0.0
        if kb < B.shape[0] and y < B.shape[1]:
            sB[tx,ty] = B[kb,y]
        else:
            sB[tx,ty] = 0.0
        # Wait until both tiles are completely loaded.
        cuda.syncthreads()
        for j in range(TPB):
            tmp += sA[tx,j]*sB[j,ty]
        # Wait until every thread is done with the tiles before overwriting them.
        cuda.syncthreads()
    if x < C.shape[0] and y < C.shape[1]:
        C[x,y] = tmp

# Benchmark: multiply two constant matrices with the CPU loop, the plain GPU
# kernel and the shared-memory GPU kernel, printing the elapsed time of each.
A = numpy.full((TPB * 200,TPB * 100),3,numpy.float64)
B = numpy.full((TPB * 100,TPB * 200), 4,numpy.float64)
C_cpu = numpy.full((A.shape[0], B.shape[1]), 0, numpy.float64)

threadsperblock = (TPB,TPB)
# Round up so the grid covers all of C.
blockspergrid_x = int(math.ceil(A.shape[0]/threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1]/threadsperblock[1]))
blockspergrid = (blockspergrid_x,blockspergrid_y)

# Warm-up: the first call of a Numba-compiled function triggers JIT
# compilation. Run each function once on small arrays of the same dtype and
# dimensionality so the timings below measure execution only.
_warm_a = numpy.full((TPB, TPB), 3, numpy.float64)
_warm_b = numpy.full((TPB, TPB), 4, numpy.float64)
_warm_c = numpy.full((TPB, TPB), 0, numpy.float64)
matmul_cpu(_warm_a, _warm_b, _warm_c)
_warm_da = cuda.to_device(_warm_a)
_warm_db = cuda.to_device(_warm_b)
_warm_dc = cuda.device_array((TPB, TPB))
matmul_gpu[(1, 1), threadsperblock](_warm_da, _warm_db, _warm_dc)
matmul_shard_men[(1, 1), threadsperblock](_warm_da, _warm_db, _warm_dc)
cuda.synchronize()

print("Start processing in CPU")
start_cpu = time.perf_counter()
matmul_cpu(A,B,C_cpu)
end_cpu = time.perf_counter()
time_cpu = (end_cpu-start_cpu)

#Start in gpu
A_global_men = cuda.to_device(A)
B_global_men = cuda.to_device(B)

C_global_men = cuda.device_array((A.shape[0],B.shape[1]))
C_shared_men = cuda.device_array((A.shape[0],B.shape[1]))

print("start processing in GPU")
start_gpu = time.perf_counter()
matmul_gpu[blockspergrid,threadsperblock](A_global_men,B_global_men,C_global_men)
# Kernel launches are asynchronous; wait for completion before stopping the clock.
cuda.synchronize()
end_gpu = time.perf_counter()
time_gpu = (end_gpu-start_gpu)


print(time_cpu)
print(time_gpu)
C_global_gpu = C_global_men.copy_to_host()

start_gpu_shared = time.perf_counter()
matmul_shard_men[blockspergrid,threadsperblock](A_global_men,B_global_men,C_shared_men)
cuda.synchronize()
end_gpu_shared = time.perf_counter()


time_gpu_shared = (end_gpu_shared-start_gpu_shared)
print(time_gpu_shared)
C_shared_gpu = C_shared_men.copy_to_host()

查看全文

http://www.dtcms.com/a/113081.html