// CUDA basics: vector (array) addition and matrix addition
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid because
// each thread walks the data with a stride equal to the total number of
// launched threads; the kernel does not require gridDim.x * blockDim.x >= N.
//
// Parameters:
//   A, B  input vectors, readable by the device, at least N floats each
//   C     output vector, writable by the device, at least N floats
//         (C may be the same buffer as A or B; each element is read before
//         the same element is written, by the same thread)
//   N     number of elements; N <= 0 is a no-op
__global__ void VecAdd(const float* A, const float* B, float* C, int N) {
    if (N <= 0)
        return;  // guard before the cast below: a negative N would become a huge size_t

    // Index arithmetic is done in size_t so blockIdx.x * blockDim.x cannot
    // wrap around for large grids.
    const size_t count  = (size_t)N;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
        C[i] = A[i] + B[i];
}
// Evaluate a CUDA runtime call and abort with file/line and the runtime's
// error string if it did not return cudaSuccess. Guarded so that a second
// definition elsewhere in the translation unit does not clash.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

// Demo driver: fills two 5-element host vectors with pseudo-random digits,
// adds them on the GPU with VecAdd, and prints A, B and the sum C, one
// vector per line.
// Returns 0 on success; terminates with EXIT_FAILURE if a host allocation or
// any CUDA call fails.
int main() {
    const int N = 5;
    const size_t size = N * sizeof(float);

    // Allocate input/output vectors in host memory.
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        free(h_A);  // free(NULL) is a no-op, so this is safe for partial failure
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // rand() is intentionally left unseeded, so every run uses the same inputs.
    for (int i = 0; i < N; i++) {
        h_A[i] = (float)(rand() % 10);
        h_B[i] = (float)(rand() % 10);
    }

    // Allocate vectors in device memory.
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    // Copy the inputs from host memory to device memory.
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Invoke the kernel: ceil-div so the grid supplies at least N threads.
    const int threadsPerBlock = 16 * 16;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch does not return a status; an invalid configuration is only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Copy the result back. This blocking copy waits for the kernel, so a
    // fault during kernel execution is reported here as well.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Print A, B and C, space-separated, one vector per line.
    for (int i = 0; i < N; i++) {
        printf("%f%c", h_A[i], i == N - 1 ? '\n' : ' ');
    }
    for (int i = 0; i < N; i++) {
        printf("%f%c", h_B[i], i == N - 1 ? '\n' : ' ');
    }
    for (int i = 0; i < N; i++) {
        printf("%f%c", h_C[i], i == N - 1 ? '\n' : ' ');
    }

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Free host memory.
    free(h_A);
    free(h_B);
    free(h_C);

#ifdef _WIN32
    // Keep the console window open when started from an IDE. "pause" is a
    // cmd.exe built-in, so the call is limited to Windows builds.
    system("pause");
#endif
    return 0;
}
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Element-wise addition of two row-major N x M matrices:
//   C[r * M + c] = A[r * M + c] + B[r * M + c]   for r in [0, N), c in [0, M).
//
// Launch layout: 2-D grid of 2-D blocks. The x dimension walks columns and the
// y dimension walks rows, so threads that are adjacent in x touch adjacent
// addresses. Both dimensions use a stride equal to the number of launched
// threads along that axis, so the result is the same for any grid/block
// shape; the grid does not have to cover the matrix.
//
// Parameters:
//   A, B  input matrices, readable by the device, at least N * M floats each
//   C     output matrix, writable by the device, at least N * M floats
//         (C may be the same buffer as A or B; each element is read before
//         the same element is written, by the same thread)
//   N     number of rows;    N <= 0 is a no-op
//   M     number of columns; M <= 0 is a no-op
__global__ void MatAdd(const float* A, const float* B, float* C, int N, int M) {
    if (N <= 0 || M <= 0)
        return;  // guard before the casts below: negative sizes would become huge size_t values

    // size_t arithmetic keeps row * cols + col from overflowing a 32-bit int.
    const size_t rows = (size_t)N;
    const size_t cols = (size_t)M;
    const size_t colStride = (size_t)gridDim.x * blockDim.x;
    const size_t rowStride = (size_t)gridDim.y * blockDim.y;

    for (size_t row = (size_t)blockIdx.y * blockDim.y + threadIdx.y; row < rows; row += rowStride) {
        for (size_t col = (size_t)blockIdx.x * blockDim.x + threadIdx.x; col < cols; col += colStride) {
            const size_t idx = row * cols + col;
            C[idx] = A[idx] + B[idx];
        }
    }
}
// Evaluate a CUDA runtime call and abort with file/line and the runtime's
// error string if it did not return cudaSuccess. Guarded so that a second
// definition elsewhere in the translation unit does not clash.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

// Demo driver: fills two 5 x 3 row-major host matrices with pseudo-random
// digits, adds them on the GPU with MatAdd, and prints A, B and the sum C,
// each followed by a separator line.
// Returns 0 on success; terminates with EXIT_FAILURE if a host allocation or
// any CUDA call fails.
int main() {
    const int N = 5;  // rows
    const int M = 3;  // columns
    const int count = N * M;
    const size_t size = (size_t)count * sizeof(float);

    // Allocate input/output matrices in host memory.
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", size);
        free(h_A);  // free(NULL) is a no-op, so this is safe for partial failure
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // rand() is intentionally left unseeded, so every run uses the same inputs.
    for (int i = 0; i < count; i++) {
        h_A[i] = (float)(rand() % 10);
        h_B[i] = (float)(rand() % 10);
    }

    // Allocate matrices in device memory.
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    // Copy the inputs from host memory to device memory.
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Invoke the kernel: ceil-div per axis so the grid supplies at least N
    // threads along x and at least M threads along y.
    dim3 threadPerBlock(16, 16);
    dim3 numBlocks((N + threadPerBlock.x - 1) / threadPerBlock.x,
                   (M + threadPerBlock.y - 1) / threadPerBlock.y);
    MatAdd<<<numBlocks, threadPerBlock>>>(d_A, d_B, d_C, N, M);
    // A launch does not return a status; an invalid configuration is only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Copy the result back. This blocking copy waits for the kernel, so a
    // fault during kernel execution is reported here as well.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Print A, B and C row by row, each matrix followed by a separator.
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf("%f%c", h_A[i * M + j], j == M - 1 ? '\n' : ' ');
        }
    }
    printf("**************\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf("%f%c", h_B[i * M + j], j == M - 1 ? '\n' : ' ');
        }
    }
    printf("**************\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf("%f%c", h_C[i * M + j], j == M - 1 ? '\n' : ' ');
        }
    }
    printf("**************\n");

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Free host memory.
    free(h_A);
    free(h_B);
    free(h_C);

#ifdef _WIN32
    // Keep the console window open when started from an IDE. "pause" is a
    // cmd.exe built-in, so the call is limited to Windows builds.
    system("pause");
#endif
    return 0;
}