前言
cuda标准库简单示例
1、Thrust
以前缀和作为例子
1.1 vector 实现
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <stdio.h>
int main(void)
{
int N = 10;
thrust::device_vector<int> x(N, 0);
thrust::device_vector<int> y(N, 0);
for (int i = 0; i < x.size(); ++i)
{
x[i] = i + 1;
}
thrust::inclusive_scan(x.begin(), x.end(), y.begin());
for (int i = 0; i < y.size(); ++i)
{
printf("%d ", (int) y[i]);
}
printf("\n");
return 0;
}
1.2 指针
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
#include <stdio.h>
int main(void)
{
int N = 10;
int *x, *y;
cudaMalloc((void **)&x, sizeof(int) * N);
cudaMalloc((void **)&y, sizeof(int) * N);
int *h_x = (int*) malloc(sizeof(int) * N);
for (int i = 0; i < N; ++i)
{
h_x[i] = i + 1;
}
cudaMemcpy(x, h_x, sizeof(int) * N, cudaMemcpyHostToDevice);
thrust::inclusive_scan(thrust::device, x, x + N, y);
int *h_y = (int*) malloc(sizeof(int) * N);
cudaMemcpy(h_y, y, sizeof(int) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i)
{
printf("%d ", h_y[i]);
}
printf("\n");
cudaFree(x);
cudaFree(y);
free(h_x);
free(h_y);
return 0;
}
2、cuBLAS
矩阵乘法
#include "error.cuh"
#include <stdio.h>
#include <cublas_v2.h>
void print_matrix(int R, int C, double* A, const char* name);
int main(void)
{
int M = 2;
int K = 3;
int N = 2;
int MK = M * K;
int KN = K * N;
int MN = M * N;
double *h_A = (double*) malloc(sizeof(double) * MK);
double *h_B = (double*) malloc(sizeof(double) * KN);
double *h_C = (double*) malloc(sizeof(double) * MN);
for (int i = 0; i < MK; i++)
{
h_A[i] = i;
}
print_matrix(M, K, h_A, "A");
for (int i = 0; i < KN; i++)
{
h_B[i] = i;
}
print_matrix(K, N, h_B, "B");
for (int i = 0; i < MN; i++)
{
h_C[i] = 0;
}
double *g_A, *g_B, *g_C;
CHECK(cudaMalloc((void **)&g_A, sizeof(double) * MK));
CHECK(cudaMalloc((void **)&g_B, sizeof(double) * KN));
CHECK(cudaMalloc((void **)&g_C, sizeof(double) * MN));
cublasSetVector(MK, sizeof(double), h_A, 1, g_A, 1);
cublasSetVector(KN, sizeof(double), h_B, 1, g_B, 1);
cublasSetVector(MN, sizeof(double), h_C, 1, g_C, 1);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0;
double beta = 0.0;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K, &alpha, g_A, M, g_B, K, &beta, g_C, M);
cublasDestroy(handle);
cublasGetVector(MN, sizeof(double), g_C, 1, h_C, 1);
print_matrix(M, N, h_C, "C = A x B");
free(h_A);
free(h_B);
free(h_C);
CHECK(cudaFree(g_A));
CHECK(cudaFree(g_B));
CHECK(cudaFree(g_C));
return 0;
}
void print_matrix(int R, int C, double* A, const char* name)
{
printf("%s = \n", name);
for (int r = 0; r < R; ++r)
{
for (int c = 0; c < C; ++c)
{
printf("%10.6f", A[c * R + r]);
}
printf("\n");
}
}
3、cuSolver
矩阵本征值
#include "error.cuh"
#include <stdio.h>
#include <stdlib.h>
#include <cusolverDn.h>
int main(void)
{
int N = 2;
int N2 = N * N;
cuDoubleComplex *A_cpu = (cuDoubleComplex *)
malloc(sizeof(cuDoubleComplex) * N2);
for (int n = 0; n < N2; ++n)
{
A_cpu[0].x = 0;
A_cpu[1].x = 0;
A_cpu[2].x = 0;
A_cpu[3].x = 0;
A_cpu[0].y = 0;
A_cpu[1].y = 1;
A_cpu[2].y = -1;
A_cpu[3].y = 0;
}
cuDoubleComplex *A;
CHECK(cudaMalloc((void**)&A, sizeof(cuDoubleComplex) * N2));
CHECK(cudaMemcpy(A, A_cpu, sizeof(cuDoubleComplex) * N2,
cudaMemcpyHostToDevice));
double *W_cpu = (double*) malloc(sizeof(double) * N);
double *W;
CHECK(cudaMalloc((void**)&W, sizeof(double) * N));
cusolverDnHandle_t handle = NULL;
cusolverDnCreate(&handle);
cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
int lwork = 0;
cusolverDnZheevd_bufferSize(handle, jobz, uplo,
N, A, N, W, &lwork);
cuDoubleComplex* work;
CHECK(cudaMalloc((void**)&work,
sizeof(cuDoubleComplex) * lwork));
int* info;
CHECK(cudaMalloc((void**)&info, sizeof(int)));
cusolverDnZheevd(handle, jobz, uplo, N, A, N, W,
work, lwork, info);
cudaMemcpy(W_cpu, W, sizeof(double) * N,
cudaMemcpyDeviceToHost);
printf("Eigenvalues are:\n");
for (int n = 0; n < N; ++n)
{
printf("%g\n", W_cpu[n]);
}
cusolverDnDestroy(handle);
free(A_cpu);
free(W_cpu);
CHECK(cudaFree(A));
CHECK(cudaFree(W));
CHECK(cudaFree(work));
CHECK(cudaFree(info));
return 0;
}
4、cuRAND
4.1 均匀分布
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
void output_results(int N, double *g_x);
int main(void)
{
curandGenerator_t generator;
curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(generator, 1234);
int N = 100000;
double *g_x; cudaMalloc((void **)&g_x, sizeof(double) * N);
curandGenerateUniformDouble(generator, g_x, N);
double *x = (double*) calloc(N, sizeof(double));
cudaMemcpy(x, g_x, sizeof(double) * N, cudaMemcpyDeviceToHost);
cudaFree(g_x);
output_results(N, x);
free(x);
return 0;
}
void output_results(int N, double *x)
{
FILE *fid = fopen("x1.txt", "w");
for(int n = 0; n < N; n++)
{
fprintf(fid, "%g\n", x[n]);
}
fclose(fid);
}
4.2 正态分布
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
void output_results(int N, double *g_x);
int main(void)
{
curandGenerator_t generator;
curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(generator, 1234);
int N = 100000;
double *g_x; cudaMalloc((void **)&g_x, sizeof(double) * N);
curandGenerateNormalDouble(generator, g_x, N, 0.0, 1.0);
double *x = (double*) calloc(N, sizeof(double));
cudaMemcpy(x, g_x, sizeof(double) * N, cudaMemcpyDeviceToHost);
cudaFree(g_x);
output_results(N, x);
free(x);
return 0;
}
void output_results(int N, double *x)
{
FILE *fid = fopen("x2.txt", "w");
for(int n = 0; n < N; n++)
{
fprintf(fid, "%g\n", x[n]);
}
fclose(fid);
}