共享内存与Thread的同步
-
给定一个大小为 3072*3072 的数组, 其中每个元素都是整数. 我们的目标是计算所有元素的立方和, 即把每个元素的立方相加, 得到最终的结果. 首先, 我们用 PyCUDA 的基础知识写出一个可以运行的程序. (注意: 下面的 kernel 虽然命名为 sumOfSquares, 但它实际累加的是 num[i]*num[i]*num[i], 也就是每个元素的立方.)
import time import numpy as np import pycuda.autoinit import pycuda.driver as cuda from pycuda.compiler import SourceModule mod = SourceModule(""" __global__ void sumOfSquares(int* num, int *result, size_t N) { int index = threadIdx.x + blockIdx.x * blockDim.x; int stride = blockDim.x * gridDim.x; int sum = 0; for (int i = index; i < N; i += stride) { sum += num[i]*num[i]*num[i]; } result[index] = sum; } """) def test(N, np_seed): np.random.seed(np_seed) a = np.random.randint(1, 10, N) N = np.int32(N) thread_size = 256 # block_size = int((N + thread_size - 1) / thread_size) # power = len(str(block_size)) - 2 # block_size = int(block_size / (10**power)) block_size = 32 b = np.empty(thread_size*block_size, dtype=np.int32) sumOfSquares = mod