#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N 3 //类似数组的行
#define M 5 //类似数组的列
#define GridSize 16
#define BlockSize 16
#include<iostream>
using namespace std;
/*
 * Fills an N x M pitched float matrix so that the element at row j,
 * column i holds its 1-based row-major position: j * M + i + 1.
 *
 * d_matrix : device pointer to the first row of the pitched allocation.
 * pitch    : distance between the starts of consecutive rows, in BYTES.
 *
 * Launch layout: both loops are grid-stride loops (rows over the y
 * dimension, columns over the x dimension), so any 1D or 2D grid/block
 * shape covers every element. No shared memory is used.
 *
 * The value is computed from the element's indices rather than from a
 * per-thread running counter: such a counter only counts the elements that
 * one particular thread visits, so the stored values would change with the
 * launch configuration.
 */
__global__ void kernel(float * d_matrix, size_t pitch) {
    const int rowStride = blockDim.y * gridDim.y;
    const int colStride = blockDim.x * gridDim.x;

    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < N; j += rowStride)
    {
        // Rows are `pitch` bytes apart, so the row offset is applied on a
        // char* before casting back to float*.
        float* row_d_matrix = (float*)((char*)d_matrix + j * pitch);

        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += colStride)
        {
            row_d_matrix[i] = (float)(j * M + i + 1);
        }
    }
}
// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the failing operation.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demonstrates cudaMallocPitch / cudaMemcpy2D:
 *   1. allocates an N x M pitched float matrix on the device,
 *   2. fills a densely packed host buffer with 0 .. M*N-1 and prints it,
 *   3. copies it to the device, runs `kernel` on it, copies it back,
 *   4. prints the buffer again.
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    float *d_matrix = NULL;
    float *dc_matrix = new float[M*N];
    size_t pitch = 0;

    // Host rows are tightly packed: M floats per row, no padding.
    const size_t hostRowBytes = M * sizeof(float);

    checkCuda(cudaMallocPitch(&d_matrix, &pitch, hostRowBytes, N), "cudaMallocPitch");

    for (int i = 0; i < M*N; i++)
        dc_matrix[i] = i;
    for (int i = 0; i < M*N; i++)
        printf("%.2f ", dc_matrix[i]);
    printf("\n");

    // Destination rows are `pitch` bytes apart, source rows `hostRowBytes` apart.
    checkCuda(cudaMemcpy2D(d_matrix, pitch, dc_matrix, hostRowBytes,
                           hostRowBytes, N, cudaMemcpyHostToDevice),
              "cudaMemcpy2D (host to device)");

    kernel << <GridSize, BlockSize >> >(d_matrix, pitch);
    // A launch reports configuration errors only through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel launch");

    // This copy is issued after the kernel; the status of the copy is checked
    // so that a failure here is reported rather than printing stale host data.
    checkCuda(cudaMemcpy2D(dc_matrix, hostRowBytes, d_matrix, pitch,
                           hostRowBytes, N, cudaMemcpyDeviceToHost),
              "cudaMemcpy2D (device to host)");

    for (int i = 0; i < M*N; i++)
        printf("%.2f ", dc_matrix[i]);

    checkCuda(cudaFree(d_matrix), "cudaFree");
    // The buffer came from new[], so it must be released with delete[];
    // passing it to free() is undefined behaviour.
    delete[] dc_matrix;
    return 0;
}
// Source article: "A detailed guide to using cudaMemcpy2D and cudaMallocPitch in CUDA"
// (web page note: latest recommended article published 2025-03-16 18:29:22)