CUDA Vector Addition
#include <iostream>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 1000000
// CPU reference implementation
void vector_add_cpu(int* a, int* b, int* c, int n)
{
    for (int i = 0; i < n; ++i)
    {
        c[i] = a[i] + b[i];
    }
}
// 1. Single block, single thread vector add
__global__ void vector_add_gpu_1(int* a, int* b, int* c, int n)
{
    for (int i = 0; i < n; ++i)
    {
        c[i] = a[i] + b[i];
    }
}
// 2. Single block, multiple threads vector add
__global__ void vector_add_gpu_2(int* a, int* b, int* c, int n)
{
    int tid = threadIdx.x;       // thread index within the block
    const int t_n = blockDim.x;  // total number of threads in the block
    while (tid < n)
    {
        c[tid] = a[tid] + b[tid];
        tid += t_n;
    }
}
// 3. Multiple blocks, multiple threads vector add
__global__ void vector_add_gpu_3(int* a, int* b, int* c, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
    const int t_n = gridDim.x * blockDim.x;          // stride = total number of threads in the grid
    //printf("gridDim.x = %d\n", gridDim.x);
    //printf("blockDim.x = %d\n", blockDim.x);
    //printf("t_n = %d\n", t_n);
    while (tid < n)
    {
        c[tid] = a[tid] + b[tid];
        tid += t_n;
    }
}
int main()
{
    static int a[N], b[N], c[N]; // static storage: three arrays of N ints would overflow the stack
    int* dev_a, * dev_b, * dev_c;
    for (int i = 0; i < N; ++i)  // initialize arrays a and b
    {
        a[i] = i;
        b[i] = i;
    }
    cudaMalloc(&dev_a, sizeof(int) * N);
    cudaMemcpy(dev_a, a, sizeof(int) * N, cudaMemcpyHostToDevice);
    cudaMalloc(&dev_b, sizeof(int) * N);
    cudaMemcpy(dev_b, b, sizeof(int) * N, cudaMemcpyHostToDevice);
    cudaMalloc(&dev_c, sizeof(int) * N); // output buffer; no host-to-device copy needed
    clock_t t = clock();
    int loops = 100;
    for (int i = 0; i < loops; i++)
    {
        vector_add_cpu(a, b, c, N);
    }
    clock_t t0 = clock();
    std::cout << 1000.0 * (t0 - t) / CLOCKS_PER_SEC << "ms" << std::endl;
    for (int i = 0; i < loops; i++)
    {
        vector_add_gpu_1<<<1, 1>>>(dev_a, dev_b, dev_c, N);
    }
    cudaDeviceSynchronize(); // kernel launches are asynchronous; wait before reading the clock
    clock_t t1 = clock();
    std::cout << 1000.0 * (t1 - t0) / CLOCKS_PER_SEC << "ms" << std::endl;
    for (int i = 0; i < loops; i++)
    {
        vector_add_gpu_2<<<1, 4>>>(dev_a, dev_b, dev_c, N);
    }
    cudaDeviceSynchronize();
    clock_t t2 = clock();
    std::cout << 1000.0 * (t2 - t1) / CLOCKS_PER_SEC << "ms" << std::endl;
    for (int i = 0; i < loops; i++)
    {
        vector_add_gpu_3<<<2, 4>>>(dev_a, dev_b, dev_c, N);
    }
    cudaDeviceSynchronize();
    clock_t t3 = clock();
    std::cout << 1000.0 * (t3 - t2) / CLOCKS_PER_SEC << "ms" << std::endl;
    cudaMemcpy(c, dev_c, sizeof(int) * N, cudaMemcpyDeviceToHost);
    //for (int i = 0; i < N; ++i)
    //{
    //    printf("%d + %d = %d \n", a[i], b[i], c[i]);
    //}
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
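Kernel launches are asynchronous, so the host-side clock() readings above are only meaningful after a cudaDeviceSynchronize(). An alternative is to time the work on the device itself with CUDA events. Below is a minimal standalone sketch that reuses a copy of the vector_add_gpu_3 kernel; the <<<256, 256>>> launch configuration is an arbitrary illustration choice, and the inputs are left uninitialized because only the timing is of interest:

#include <cstdio>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 1000000

// Same grid-stride kernel as vector_add_gpu_3 above.
__global__ void vector_add_gpu_3(int* a, int* b, int* c, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int t_n = gridDim.x * blockDim.x;
    while (tid < n)
    {
        c[tid] = a[tid] + b[tid];
        tid += t_n;
    }
}

int main()
{
    int* dev_a, * dev_b, * dev_c;
    cudaMalloc(&dev_a, sizeof(int) * N);
    cudaMalloc(&dev_b, sizeof(int) * N);
    cudaMalloc(&dev_c, sizeof(int) * N);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);                 // enqueued in the default stream
    vector_add_gpu_3<<<256, 256>>>(dev_a, dev_b, dev_c, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);             // block the host until the kernel has finished

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds
    printf("vector_add_gpu_3: %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}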
Calling the cuBLAS library:
#include <iostream>
#include <time.h>
#include "cublas_v2.h"
#include "cuda_runtime.h"
#define N 1000000
int main()
{
    static float a[N], b[N], c[N]; // static storage: three arrays of N floats would overflow the stack
    float* dev_a, * dev_b;
    for (int i = 0; i < N; ++i)    // initialize arrays a and b
    {
        float tmp = 1.0f * i;
        a[i] = tmp;
        b[i] = tmp;
    }
    cublasHandle_t handle;    // declare the handle
    cublasCreate_v2(&handle); // create the handle
    cudaMalloc(&dev_a, sizeof(float) * N);
    cudaMalloc(&dev_b, sizeof(float) * N);
    float alpha = 1.0f;
    cublasSetVector(N, sizeof(float), a, 1, dev_a, 1); // H2D: host to device
    cublasSetVector(N, sizeof(float), b, 1, dev_b, 1);
    clock_t t0 = clock();
    for (size_t i = 0; i < 10000; i++)
    {
        cublasSaxpy_v2(handle, N, &alpha, dev_a, 1, dev_b, 1); // vector add: dev_b = alpha * dev_a + dev_b
    }
    cudaDeviceSynchronize(); // cuBLAS kernels run asynchronously with respect to the host
    clock_t t1 = clock();
    std::cout << 1000.0 * (t1 - t0) / CLOCKS_PER_SEC << "ms" << std::endl;
    cublasGetVector(N, sizeof(float), dev_b, 1, c, 1); // D2H: device to host
    cudaFree(dev_a);
    cudaFree(dev_b);
    cublasDestroy(handle); // destroy the handle
    //for (int i = 0; i < N; ++i)
    //{
    //    printf("%f + %f = %f \n", a[i], b[i], c[i]);
    //}
    return 0;
}
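cublasSaxpy updates y in place (y = alpha * x + y), so after the 10000-iteration loop above dev_b holds b + 10000 * a rather than a single sum, which is worth remembering if the commented verification loop is re-enabled. Below is a minimal single-call sketch with host-side verification; the 4-element vectors and their values are arbitrary illustration choices:

#include <cstdio>
#include "cublas_v2.h"
#include "cuda_runtime.h"

int main()
{
    const int n = 4;
    float x[n] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float y[n] = { 10.0f, 20.0f, 30.0f, 40.0f };

    float* dev_x = nullptr;
    float* dev_y = nullptr;
    cudaMalloc(&dev_x, n * sizeof(float));
    cudaMalloc(&dev_y, n * sizeof(float));

    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSetVector(n, sizeof(float), x, 1, dev_x, 1); // H2D
    cublasSetVector(n, sizeof(float), y, 1, dev_y, 1);

    float alpha = 1.0f;
    cublasSaxpy(handle, n, &alpha, dev_x, 1, dev_y, 1); // dev_y = alpha * dev_x + dev_y

    float result[n];
    cublasGetVector(n, sizeof(float), dev_y, 1, result, 1); // D2H
    for (int i = 0; i < n; ++i)
    {
        printf("%.1f + %.1f = %.1f\n", x[i], y[i], result[i]); // expect 11, 22, 33, 44
    }

    cublasDestroy(handle);
    cudaFree(dev_x);
    cudaFree(dev_y);
    return 0;
}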
CUDA Vector Dot Product
#include <stdio.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
const int N = 2048;
const int threadnum = 32; // use 32 threads per block
/* CPU dot product */
template <typename T>
void dot_cpu(T* a, T* b, T* c, int n)
{
    double dTemp = 0;
    for (int i = 0; i < n; ++i)
    {
        dTemp += a[i] * b[i];
    }
    *c = dTemp;
}
/* Single block, interleaved-addressing (divergent) reduction */
template <typename T>
__global__ void dot_gpu_1(T* a, T* b, T* c, int n)
{
    __shared__ T tmp[threadnum];
    const int tid = threadIdx.x; // thread index within the block
    const int t_n = blockDim.x;  // total number of threads in the block
    int nTid = tid;
    double dTemp = 0.0;
    while (nTid < n)
    {
        dTemp += a[nTid] * b[nTid];
        nTid += t_n;
    }
    tmp[tid] = dTemp; // store each thread's partial sum in shared memory
    __syncthreads();  // wait until every thread has written its partial sum
    int i = 2, j = 1;
    while (i <= threadnum)
    {
        if (tid % i == 0)
        {
            tmp[tid] += tmp[tid + j];
        }
        __syncthreads();
        i *= 2;
        j *= 2;
    }
    if (0 == tid)
    {
        c[0] = tmp[0];
    }
}
/* Single block, sequential-addressing reduction (work done by the low-numbered threads) */
template <typename T>
__global__ void dot_gpu_2(T* a, T* b, T* c, int n)
{
    __shared__ T tmp[threadnum];
    const int nThreadIdX = threadIdx.x; // thread index within the block
    const int nBlockDimX = blockDim.x;  // total number of threads in the block
    int nTid = nThreadIdX;
    double dTemp = 0.0;
    while (nTid < n)
    {
        dTemp += a[nTid] * b[nTid];
        nTid += nBlockDimX;
    }
    tmp[nThreadIdX] = dTemp; // store each thread's partial sum in shared memory
    __syncthreads();         // wait until every thread has written its partial sum
    int i = threadnum / 2;
    while (i != 0)
    {
        if (nThreadIdX < i)
        {
            tmp[nThreadIdX] += tmp[nThreadIdX + i];
        }
        __syncthreads(); // every thread must reach this barrier, so it stays outside the if
        i /= 2;
    }
    if (0 == nThreadIdX)
    {
        c[0] = tmp[0];
    }
}
/* Multiple blocks, multiple threads dot product */
template <typename T>
__global__ void dot_gpu_3(T* a, T* b, T* c, int n)
{
    __shared__ T aTmp[threadnum];
    const int nThreadIdX = threadIdx.x;                  // thread index within the block
    const int nStep = gridDim.x * blockDim.x;            // stride = total number of threads in the grid
    int nTidIdx = blockIdx.x * blockDim.x + threadIdx.x; // global thread index
    double dTemp = 0.0;
    while (nTidIdx < n)
    {
        dTemp += a[nTidIdx] * b[nTidIdx];
        nTidIdx += nStep;
    }
    aTmp[nThreadIdX] = dTemp; // store each thread's partial sum in this block's shared memory
    __syncthreads();          // wait until every thread has written its partial sum
    int i = threadnum / 2;
    while (i != 0)
    {
        if (nThreadIdX < i)
        {
            aTmp[nThreadIdX] += aTmp[nThreadIdX + i];
        }
        __syncthreads(); // barrier for the whole block on every reduction step
        i /= 2;
    }
    if (0 == nThreadIdX)
    {
        c[blockIdx.x] = aTmp[0]; // one partial sum per block; a second reduction is still needed
    }
}
int main()
{
    float a[N], b[N];
    float c = 0;
    for (int i = 0; i < N; ++i) // initialize arrays a and b
    {
        a[i] = i * 1.0f;
        b[i] = 1.0f;
    }
    float* d_a = 0, * d_b = 0, * d_c = 0;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc(&d_c, sizeof(float)); // one float is enough for the single-block launches below
    dot_cpu(a, b, &c, N);
    //dot_gpu_1<<<1, threadnum>>>(d_a, d_b, d_c, N);
    //dot_gpu_2<<<1, threadnum>>>(d_a, d_b, d_c, N);
    //dot_gpu_3<<<1, threadnum>>>(d_a, d_b, d_c, N);
    //cudaMemcpy(&c, d_c, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << c << std::endl;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
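In the multi-block kernel dot_gpu_3, each block writes its partial sum to c[blockIdx.x], so launching it with more than one block requires a result buffer of gridDim.x floats and a final reduction on the host (or in a second kernel). Below is a minimal sketch of that pattern, using a non-templated float copy of dot_gpu_3; the block count of 8 is an arbitrary illustration choice:

#include <cstdio>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

const int N = 2048;
const int threadnum = 32;
const int blocknum = 8; // illustration choice

// Same grid-stride load plus shared-memory reduction as dot_gpu_3 above.
__global__ void dot_gpu_3(const float* a, const float* b, float* c, int n)
{
    __shared__ float tmp[threadnum];
    const int tid = threadIdx.x;
    const int step = gridDim.x * blockDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    while (idx < n)
    {
        sum += a[idx] * b[idx];
        idx += step;
    }
    tmp[tid] = sum;
    __syncthreads();
    for (int i = threadnum / 2; i != 0; i /= 2)
    {
        if (tid < i)
            tmp[tid] += tmp[tid + i];
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = tmp[0]; // one partial sum per block
}

int main()
{
    float a[N], b[N];
    for (int i = 0; i < N; ++i) { a[i] = i * 1.0f; b[i] = 1.0f; }

    float *d_a, *d_b, *d_partial;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_partial, blocknum * sizeof(float)); // one slot per block
    cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);

    dot_gpu_3<<<blocknum, threadnum>>>(d_a, d_b, d_partial, N);

    float partial[blocknum];
    cudaMemcpy(partial, d_partial, blocknum * sizeof(float), cudaMemcpyDeviceToHost);
    float dot = 0.0f;
    for (int i = 0; i < blocknum; ++i) dot += partial[i]; // final reduction on the host
    printf("dot = %f (expected %f)\n", dot, (N - 1) * N / 2.0f);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_partial);
    return 0;
}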