一、硬件信息查询:
#include <stdio.h>
// Lists every CUDA device reported by the runtime and prints, for each one,
// its name, memory clock rate, memory bus width and the peak memory
// bandwidth derived from those two values.
// Returns 0.
int main() {
    // Zero-initialized so that the loop below is simply skipped, instead of
    // iterating over an indeterminate count, if cudaGetDeviceCount fails
    // without writing to nDevices.
    int nDevices = 0;
    cudaGetDeviceCount(&nDevices);
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("Device Number: %d\n", i);
        printf(" Device name: %s\n", prop.name);
        printf(" Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
        // Clock in kHz times bus width in bytes (bits / 8), divided by 1e6,
        // gives GB/s. The factor 2.0 presumably accounts for
        // double-data-rate memory -- confirm for the target hardware.
        printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
    return 0;
}
二、错误处置:
1、代码段一:
#include <stdio.h>
// Lists every CUDA device reported by the runtime, checking the status code
// of each runtime call and printing the runtime's error description when a
// call fails.
// Returns 0 when every query succeeded and 1 otherwise.
int main() {
    // Zero-initialized so that a failed device-count query leaves the loop
    // below with nothing to iterate over.
    int nDevices = 0;
    cudaError_t err = cudaGetDeviceCount(&nDevices);
    if (err != cudaSuccess) printf("%s\n", cudaGetErrorString(err));
    bool failed = (err != cudaSuccess);
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        cudaError_t propErr = cudaGetDeviceProperties(&prop, i);
        if (propErr != cudaSuccess) {
            // prop was not filled in; report the error and move on to the
            // next device rather than printing its contents.
            printf("%s\n", cudaGetErrorString(propErr));
            failed = true;
            continue;
        }
        printf("Device Number: %d\n", i);
        printf(" Device name: %s\n", prop.name);
        printf(" Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
        // Clock in kHz times bus width in bytes (bits / 8), divided by 1e6,
        // gives GB/s. The factor 2.0 presumably accounts for
        // double-data-rate memory -- confirm for the target hardware.
        printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
    return failed ? 1 : 0;
}
与第一部分的代码相比,这段代码的改动之处如下:
// Keep the status code returned by the runtime call instead of discarding it.
cudaError_t err = cudaGetDeviceCount(&nDevices);
// For any status other than cudaSuccess, print the runtime's description of
// the error.
if (err != cudaSuccess)
printf("%s\n", cudaGetErrorString(err));
2、代码段二:
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// SAXPY kernel: each thread updates at most one element, y[k] = a*x[k] + y[k].
//   n : number of elements to process
//   a : scalar multiplier
//   x : input array, read only by this kernel
//   y : input/output array, updated in place
// The element index is derived from the x components of the block and thread
// indices only; threads whose index is n or greater do nothing, so the launch
// may contain more threads than there are elements.
__global__
void saxpy(int n,float a,float *x,float *y)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;  // surplus thread beyond the end of the arrays
    }
    y[idx] = a * x[idx] + y[idx];
}
// Terminates the program with a labelled message when a CUDA runtime call
// that the rest of the program depends on (allocation, copy, event setup)
// reports an error. Resources are not released here; the process exits.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        printf("%s failed: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Runs SAXPY (y = 2*x + y) over 1M floats on the GPU, times the kernel with
// CUDA events, checks the result on the host and prints the maximum error
// and the effective memory bandwidth.
// Returns 0 on success, 1 if an allocation or the kernel failed.
int main(void){
    const int N = 1 << 20; // 1M elements.
    const size_t bytes = N * sizeof(float);

    // Separate host (x, y) and device (d_x, d_y) buffers; data moves between
    // them through explicit cudaMemcpy calls.
    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {
        printf("Host allocation failed\n");
        free(x);
        free(y);
        return 1;
    }
    float *d_x = NULL;
    float *d_y = NULL;
    checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
    checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(x -> d_x)");
    checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(y -> d_y)");

    cudaEventRecord(start);
    saxpy<<< (N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
    // Errors in the launch itself (e.g. a bad configuration) are available
    // immediately after the launch statement.
    cudaError_t errSync = cudaGetLastError();
    // Record the stop event directly behind the kernel, before the host
    // blocks, so the measured interval covers the kernel and not the time
    // the host spends waiting in cudaDeviceSynchronize.
    cudaEventRecord(stop);
    // Errors raised while the kernel executes only surface once the host
    // synchronizes with the device.
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
    const bool kernelOk = (errSync == cudaSuccess) && (errAsync == cudaSuccess);

    // Only report results when the kernel actually ran; otherwise the device
    // buffer and the timing are meaningless.
    if (kernelOk) {
        checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy(d_y -> y)");
        cudaEventSynchronize(stop);
        float milliseconds = 0;
        checkCuda(cudaEventElapsedTime(&milliseconds, start, stop),
                  "cudaEventElapsedTime");

        // Check for errors (all values should be 4.0f: 2.0f*1.0f + 2.0f).
        float maxError = 0.0f;
        for (int i = 0; i < N; i++)
            maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));
        printf("Max error: %f . \n", maxError);
        // Per element the kernel reads x and y and writes y: 3 accesses of
        // 4 bytes each. Bytes per millisecond divided by 1e6 gives GB/s.
        printf("Effective Bandwidth (GB/s): %f .\n", N*4*3/milliseconds/1e6);
    }

    // Host buffers came from malloc and must be released with free;
    // cudaFree is only for the device allocations.
    free(x);
    free(y);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return kernelOk ? 0 : 1;
}