CUDA内存(四) 全局内存

全局内存

合并访问

struc_add.h

#pragma once

#include "Global.h"
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define NUM_ELEMENTS 4096

typedef struct  
{
	u32 a;
	u32 b;
	u32 c;
	u32 d;
} INTERLEAVED_T;

typedef INTERLEAVED_T INTERLEAVED_ARRAY_T[NUM_ELEMENTS];
typedef u32 ARRAY_MEMBER_T[NUM_ELEMENTS];

typedef struct
{
	ARRAY_MEMBER_T a;
	ARRAY_MEMBER_T b;
	ARRAY_MEMBER_T c;
	ARRAY_MEMBER_T d;
} NON_INTERLEAVED_T;

//************************************
// Method:     add_test_non_interleaved_cpu
// Brief:	   struct 内容非交叉 CPU
// Access:     public 
// Returns:    __host__ float
// Qualifier: 
// Param(I/O): NON_INTERLEAVED_T * const host_dest_ptr
// Param(I/O): const NON_INTERLEAVED_T * const host_src_ptr
// Param(I/O): const u32 iter
// Param(I/O): const u32 num_elements
//************************************
__host__ float add_test_non_interleaved_cpu(
	NON_INTERLEAVED_T * const host_dest_ptr,
	const NON_INTERLEAVED_T * const host_src_ptr,
	const u32 iter,
	const u32 num_elements);
//************************************
// Method:     add_test_interleaved_cpu
// Brief:	   struct 内容交叉 CPU
// Access:     public 
// Returns:    __host__ float
// Qualifier: 
// Param(I/O): INTERLEAVED_T * const host_dest_ptr
// Param(I/O): const INTERLEAVED_T * const host_src_ptr
// Param(I/O): const u32 iter
// Param(I/O): const u32 num_elements
//************************************
__host__ float add_test_interleaved_cpu(INTERLEAVED_T * const host_dest_ptr, const INTERLEAVED_T * const host_src_ptr, const u32 iter, const u32 num_elements);

//************************************
// Method:     add_kernel_interleaved
// Brief:	   struct 内容交叉 GPU 
// Access:     public 
// Returns:    __global__ void
// Qualifier: 
// Param(I/O): INTERLEAVED_T * const dest_ptr
// Param(I/O): const INTERLEAVED_T * const src_ptr
// Param(I/O): const u32 iter
// Param(I/O): const u32 num_elements
//************************************
__global__ void add_kernel_interleaved(INTERLEAVED_T* const dest_ptr, const INTERLEAVED_T * const src_ptr, const u32 iter, const u32 num_elements);

//************************************
// Method:     add_kernel_non_interleaved
// Brief:	   struct 内容非交叉 GPU
// Access:     public 
// Returns:    __global__ void
// Qualifier: 
// Param(I/O): NON_INTERLEAVED_T * const dest_ptr
// Param(I/O): const NON_INTERLEAVED_T * const src_ptr
// Param(I/O): const u32 iter
// Param(I/O): const u32 num_elements
//************************************
__global__ void add_kernel_non_interleaved(NON_INTERLEAVED_T * const dest_ptr, const NON_INTERLEAVED_T * const src_ptr, const u32 iter, const u32 num_elements);

//************************************
// Method:     add_test_interleaved
// Brief:	   struct 内容交叉 GPU host
// Access:     public 
// Returns:    __host__ float
// Qualifier: 
// Param(I/O): INTERLEAVED_T * const host_dest_ptr
// Param(I/O): const INTERLEAVED_T * const host_src_ptr
// Param(I/O): const u32 iter
// Param(I/O): const u32 num_elements
//************************************
__host__ float add_test_interleaved(INTERLEAVED_T * const host_dest_ptr, const INTERLEAVED_T * const host_src_ptr, const u32 iter, const u32 num_elements);

//************************************
// Method:     add_test_non_interleaved
// Brief:	   struct 内容非交叉 GPU host
// Access:     public 
// Returns:    __host__ float
// Qualifier: 
// Param(I/O): NON_INTERLEAVED_T * const dest_ptr
// Param(I/O): const NON_INTERLEAVED_T * const src_ptr
// Param(I/O): const u32 iter
// Param(I/O): const u32 num_elements
//************************************
__host__ float add_test_non_interleaved(NON_INTERLEAVED_T * const host_dest_ptr, const NON_INTERLEAVED_T * const host_src_ptr, const u32 iter, const u32 num_elements);

struct_add.cu

#include "struct_add.h"
#include <timer.h>

__host__ float add_test_non_interleaved_cpu(NON_INTERLEAVED_T * const host_dest_ptr, const NON_INTERLEAVED_T * const host_src_ptr, const u32 iter, const u32 num_elements)
{
	StartTimer();

	for (u32 tid = 0; tid < num_elements;tid++)
	{
		for (u32 i = 0; i < iter;i++)
		{
			host_dest_ptr->a[tid] += host_src_ptr->a[tid];
			host_dest_ptr->b[tid] += host_src_ptr->b[tid];
			host_dest_ptr->c[tid] += host_src_ptr->c[tid];
			host_dest_ptr->d[tid] += host_src_ptr->d[tid];
		}
	}
	const float delta = GetTimer();
	return delta;
}

__host__ float add_test_interleaved_cpu(INTERLEAVED_T * const host_dest_ptr, const INTERLEAVED_T * const host_src_ptr, const u32 iter, const u32 num_elements)
{
	StartTimer();
	for (u32 tid = 0; tid < num_elements;tid++)
	{
		for (u32 i = 0; i < iter;i++)
		{
			host_dest_ptr[tid].a += host_src_ptr[tid].a;
			host_dest_ptr[tid].b += host_src_ptr[tid].b;
			host_dest_ptr[tid].c += host_src_ptr[tid].c;
			host_dest_ptr[tid].d += host_src_ptr[tid].d;
		}
	}
	const float delta = GetTimer();
	return delta;
}
__global__ void add_kernel_interleaved(INTERLEAVED_T* const dest_ptr, const INTERLEAVED_T * const src_ptr, const u32 iter, const u32 num_elements)
{
	const u32 tid = blockIdx.x * blockDim.x + threadIdx.x;
	if (tid < num_elements)
	{
		for (u32 i = 0; i < iter; i++)
		{
			dest_ptr[tid].a += src_ptr[tid].a;
			dest_ptr[tid].b += src_ptr[tid].b;
			dest_ptr[tid].c += src_ptr[tid].c;
			dest_ptr[tid].d += src_ptr[tid].d;
		}
	}
}

__global__ void add_kernel_non_interleaved(NON_INTERLEAVED_T * const dest_ptr, const NON_INTERLEAVED_T * const src_ptr, const u32 iter, const u32 num_elements)
{
	const u32 tid = blockIdx.x * blockDim.x + threadIdx.x;
	if (tid < num_elements)
	{
		for (u32 i = 0; i < iter;i++)
		{
			dest_ptr->a[tid] += src_ptr->a[tid];
			dest_ptr->b[tid] += src_ptr->b[tid];
			dest_ptr->c[tid] += src_ptr->c[tid];
			dest_ptr->d[tid] += src_ptr->d[tid];
		}
	}
}

__host__ float add_test_interleaved(INTERLEAVED_T * const host_dest_ptr, const INTERLEAVED_T * const host_src_ptr, const u32 iter, const u32 num_elements)
{
	const u32 num_threads = 256;
	const u32 num_blocks = (num_elements + num_threads - 1) / num_threads;
	const size_t num_bytes = sizeof(INTERLEAVED_T)*num_elements;
	INTERLEAVED_T * device_dest_ptr;
	INTERLEAVED_T * device_src_ptr;
	CUDA_CALL(cudaMalloc((void**)&device_src_ptr, num_bytes));
	CUDA_CALL(cudaMalloc((void**)&device_dest_ptr, num_bytes));

	cudaEvent_t kernel_start, kernel_stop;
	cudaEventCreate(&kernel_start, 0);
	cudaEventCreate(&kernel_stop, 0);
	cudaStream_t test_stream;
	CUDA_CALL(cudaStreamCreate(&test_stream));
	CUDA_CALL(cudaMemcpy(device_src_ptr, host_src_ptr, num_bytes, cudaMemcpyHostToDevice));
	CUDA_CALL(cudaEventRecord(kernel_start, 0));
	
	add_kernel_interleaved << <num_blocks, num_threads >> >(device_dest_ptr, device_src_ptr, iter, num_elements);
	CUDA_CALL(cudaEventRecord(kernel_stop, 0));
	CUDA_CALL(cudaEventSynchronize(kernel_stop));
	float delta = 0.0f;
	CUDA_CALL(cudaEventElapsedTime(&delta, kernel_start, kernel_stop));

	CUDA_CALL(cudaFree(device_src_ptr));
	CUDA_CALL(cudaFree(device_dest_ptr));
	CUDA_CALL(cudaEventDestroy(kernel_start));
	CUDA_CALL(cudaEventDestroy(kernel_stop));
	CUDA_CALL(cudaStreamDestroy(test_stream));

	return delta;

}

__host__ float add_test_non_interleaved(NON_INTERLEAVED_T * const host_dest_ptr, const NON_INTERLEAVED_T * const host_src_ptr, const u32 iter, const u32 num_elements)
{
	const u32 num_threads = 256;
	const u32 num_blocks = (num_elements + num_threads - 1) / num_threads;
	// 应该是 NON_INTERLEAVED_T
	const size_t num_bytes = sizeof(NON_INTERLEAVED_T);// *num_elements;
	NON_INTERLEAVED_T * device_dest_ptr;
	NON_INTERLEAVED_T * device_src_ptr;
	CUDA_CALL(cudaMalloc((void**)&device_src_ptr, num_bytes));
	CUDA_CALL(cudaMalloc((void**)&device_dest_ptr, num_bytes));

	cudaEvent_t kernel_start, kernel_stop;
	cudaEventCreate(&kernel_start, 0);
	cudaEventCreate(&kernel_stop, 0);
	cudaStream_t test_stream;
	CUDA_CALL(cudaStreamCreate(&test_stream));
	CUDA_CALL(cudaMemcpy(device_src_ptr, host_src_ptr, num_bytes, cudaMemcpyHostToDevice));
	CUDA_CALL(cudaEventRecord(kernel_start, 0));

	add_kernel_non_interleaved << <num_blocks, num_threads >> >(device_dest_ptr, device_src_ptr, iter, num_elements);
	CUDA_CALL(cudaEventRecord(kernel_stop, 0));
	CUDA_CALL(cudaEventSynchronize(kernel_stop));
	float delta = 0.0f;
	CUDA_CALL(cudaEventElapsedTime(&delta, kernel_start, kernel_stop));

	CUDA_CALL(cudaFree(device_src_ptr));
	CUDA_CALL(cudaFree(device_dest_ptr));
	CUDA_CALL(cudaEventDestroy(kernel_start));
	CUDA_CALL(cudaEventDestroy(kernel_stop));
	CUDA_CALL(cudaStreamDestroy(test_stream));

	return delta;
}

struct_add_main.cpp

/* CPU和GPU全局内存, 分别用交叉和非交叉方式来计算struct元素之和.*/
#include "Global.h"
#include "struct_add.h"

int main()
{
	getCpuInfo();
	getGpuInfo();
	// 数据初始化
	INTERLEAVED_T * host_src_interleaved_ptr = NULL;
	INTERLEAVED_T * host_dest_interleaved_ptr = NULL;
	host_src_interleaved_ptr = (INTERLEAVED_T *)malloc(sizeof(INTERLEAVED_T)*NUM_ELEMENTS);
	host_dest_interleaved_ptr = (INTERLEAVED_T *)malloc(sizeof(INTERLEAVED_T)*NUM_ELEMENTS);
	NON_INTERLEAVED_T host_src_non_interleaved_ptr;
	NON_INTERLEAVED_T host_dest_non_interleaved_ptr;
	for (int i = 0; i < NUM_ELEMENTS;i++)
	{
		host_src_interleaved_ptr[i].a = rand() % 4;
		host_src_interleaved_ptr[i].b = rand() % 4;
		host_src_interleaved_ptr[i].c = rand() % 4;
		host_src_interleaved_ptr[i].d = rand() % 4;

		host_src_non_interleaved_ptr.a[i] = rand() % 4;
		host_src_non_interleaved_ptr.b[i] = rand() % 4;
		host_src_non_interleaved_ptr.c[i] = rand() % 4;
		host_src_non_interleaved_ptr.d[i] = rand() % 4;

	}
	const u32 iter = 10000;
	// CPU 非交叉
	float delta_non_interleaved_cpu = add_test_non_interleaved_cpu(&host_dest_non_interleaved_ptr, &host_src_non_interleaved_ptr, iter, NUM_ELEMENTS);
	// CPU 交叉
	float delta_interleaved_cpu = add_test_interleaved_cpu(host_dest_interleaved_ptr, host_src_interleaved_ptr, iter, NUM_ELEMENTS);
	// GPU 非交叉
	float delta_non_interleaved_gpu = add_test_non_interleaved(&host_dest_non_interleaved_ptr, &host_src_non_interleaved_ptr, iter, NUM_ELEMENTS);
	delta_non_interleaved_gpu = add_test_non_interleaved(&host_dest_non_interleaved_ptr, &host_src_non_interleaved_ptr, iter, NUM_ELEMENTS);
	// GPU 交叉
	float delta_interleaved_gpu = add_test_interleaved(host_dest_interleaved_ptr, host_src_interleaved_ptr, iter, NUM_ELEMENTS);
	delta_interleaved_gpu = add_test_interleaved(host_dest_interleaved_ptr, host_src_interleaved_ptr, iter, NUM_ELEMENTS);

	// 输出
	printf("耗时统计:\n");
	printf("CPU\t非交叉: \t%.3f ms\n", delta_non_interleaved_cpu);
	printf("CPU\t交叉: \t\t%.3f ms\n", delta_interleaved_cpu);
	printf("GPU\t非交叉: \t%.3f ms\n", delta_non_interleaved_gpu);
	printf("GPU\t交叉: \t\t%.3f ms\n", delta_interleaved_gpu);
	return 0;
}

测试结果

相对来说, CPU的交叉形式比CPU的非交叉形式快.
GPU相反, 非交叉比交叉快.
但是, 测试结果的差异没有书上的大.
自己平台的测试, 6.0计算能力

ShaneCook 结果 page 164

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值