// Layered encapsulation (分层封装)
#include <cstdlib>
#include <iostream>
// Brings namespace std into scope; main uses cout/endl unqualified.
using namespace std;
// Device-side global; KernelTest adds it to every element it writes.
__device__ int offset = 1 ;
/**
 * Aggregate that class A embeds and passes by value to KernelTest.
 *
 * It has no constructors, so it can be brace-initialised positionally;
 * keep the member order stable.
 */
struct A_device {
    int mH;             // set to 0 by A's constructor; not read in the visible code
    int mW;             // set to 0 by A's constructor; not read in the visible code
    int* m_d_Dynamic;   // buffer obtained from cudaMalloc in A::allocMem
    int mDynamicSize;   // size of the buffer in BYTES (used for malloc/cudaMalloc/cudaMemcpy)

    // Declared __device__ to match the out-of-class definition. Without the
    // qualifier here the declaration and definition disagree about the
    // execution space, and the function dereferences m_d_Dynamic, which is
    // only valid on the device.
    __device__ int func(int);
};
/**
 * Host-side owner of a small host buffer (m_h_Dynamic) and its device mirror
 * (d.m_d_Dynamic).
 *
 * The constructor allocates both buffers through allocMem() and the
 * destructor releases them through freeMem(). `d` is the part that is handed
 * by value to the kernel in run_device().
 */
class A {
public:
    A_device d;
    int* m_h_Dynamic;

    // Initialises `d` member by member instead of with a GNU compound
    // literal, which is not standard C++. The buffer holds four ints.
    A() : d(), m_h_Dynamic(NULL) {
        d.mH = 0;
        d.mW = 0;
        d.m_d_Dynamic = NULL;
        d.mDynamicSize = sizeof(int) * 4;
        allocMem();
    }

    ~A() { freeMem(); }

    void run_device(int* h_sum, size_t MAXN);

    // Fills the four host entries and uploads d.mDynamicSize bytes to the
    // device buffer. Does nothing (besides reporting) when either buffer is
    // missing, and reports a failed upload instead of ignoring it.
    void setValue() {
        if (m_h_Dynamic == NULL || d.m_d_Dynamic == NULL) {
            std::cerr << "A::setValue: buffers are not allocated" << std::endl;
            return;
        }

        m_h_Dynamic[0] = 1000;
        m_h_Dynamic[1] = 10000;
        m_h_Dynamic[2] = 100000;
        m_h_Dynamic[3] = 1000000;

        const cudaError_t status = cudaMemcpy(d.m_d_Dynamic, m_h_Dynamic,
                                              d.mDynamicSize,
                                              cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            std::cerr << "A::setValue: cudaMemcpy failed: "
                      << cudaGetErrorString(status) << std::endl;
        }
    }

private:
    // Copying is disabled (declared, never defined): both buffers are
    // released in the destructor, so a member-wise copy would free them twice.
    A(const A&);
    A& operator=(const A&);

    void freeMem();
    void allocMem();
};
// Device-side accessor: returns the entry of m_d_Dynamic selected by i modulo 4.
// The period of 4 is hard-coded. For a negative i the remainder is negative
// as well, so callers are expected to pass non-negative indices.
__device__
int
A_device::func(int i) {
    const int slot = i % 4;
    return *(m_d_Dynamic + slot);
}
// Kernel: thread t writes sum[t] = t + d_insA.func(t) + offset, where t is threadIdx.x.
//
// d_insA is taken by value, so the host-side A_device (including the device
// pointer it carries) is copied into the kernel's arguments at launch.
//
// Only threadIdx.x is used for indexing and there is no bounds check: the
// caller must make sure `sum` has at least blockDim.x elements.
__global__ void KernelTest(int* sum, A_device d_insA) {
    const unsigned int tid = threadIdx.x;
    const unsigned int value = tid + d_insA.func(tid) + offset;
    sum[tid] = value;
}
// Releases the host buffer and the device buffer owned by this object.
// Both pointers are reset to NULL afterwards so that a second call is
// harmless (free(NULL) and cudaFree(NULL) are no-ops), and a failing
// cudaFree is reported instead of being silently dropped.
__host__
void
A::freeMem() {
    free(m_h_Dynamic);
    m_h_Dynamic = NULL;

    const cudaError_t status = cudaFree(d.m_d_Dynamic);
    if (status != cudaSuccess) {
        std::cerr << "A::freeMem: cudaFree failed: "
                  << cudaGetErrorString(status) << std::endl;
    }
    d.m_d_Dynamic = NULL;
}
// Allocates d.mDynamicSize bytes on the host (m_h_Dynamic) and on the device
// (d.m_d_Dynamic). A failed allocation is reported on stderr and leaves the
// corresponding pointer NULL, so later code can detect it.
void
A::allocMem() {
    m_h_Dynamic = static_cast<int*>(malloc(d.mDynamicSize));
    if (m_h_Dynamic == NULL && d.mDynamicSize > 0) {
        std::cerr << "A::allocMem: host allocation of " << d.mDynamicSize
                  << " bytes failed" << std::endl;
    }

    const cudaError_t status =
        cudaMalloc(reinterpret_cast<void**>(&d.m_d_Dynamic), d.mDynamicSize);
    if (status != cudaSuccess) {
        // Do not rely on what cudaMalloc left in the pointer on failure.
        d.m_d_Dynamic = NULL;
        std::cerr << "A::allocMem: cudaMalloc failed: "
                  << cudaGetErrorString(status) << std::endl;
    }
}
// Uploads the lookup values (setValue), runs KernelTest on a temporary device
// array of MAXN ints and copies the result into h_sum.
//
//   h_sum  host array with room for at least MAXN ints
//   MAXN   number of ints in h_sum; 0 is accepted and leaves h_sum untouched
//
// The kernel is launched with a single block of at most 128 threads, so only
// the first min(MAXN, 128) entries are computed; the remaining ones are 0.
// Every CUDA call is checked; the first failure is reported on stderr and the
// temporary device array is released on every path.
void
A::run_device(int* h_sum, size_t MAXN) {
    setValue();
    if (MAXN == 0) {
        return;  // nothing to compute; a launch with 0 threads would be invalid
    }

    const size_t bytes = MAXN * sizeof(int);
    int* d_sum = NULL;

    cudaError_t status = cudaMalloc(reinterpret_cast<void**>(&d_sum), bytes);
    if (status == cudaSuccess) {
        status = cudaMemset(d_sum, 0, bytes);
    }
    if (status == cudaSuccess) {
        // Never start more threads than d_sum has elements: the kernel writes
        // sum[threadIdx.x] without a bounds check.
        const unsigned int threads =
            MAXN < 128 ? static_cast<unsigned int>(MAXN) : 128u;
        KernelTest<<<1, threads>>>(d_sum, d);
        status = cudaGetLastError();  // launch-configuration errors
    }
    if (status == cudaSuccess) {
        // Blocking copy; errors raised while the kernel ran surface here.
        status = cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    }
    if (status != cudaSuccess) {
        std::cerr << "A::run_device: CUDA error: "
                  << cudaGetErrorString(status) << std::endl;
    }

    cudaFree(d_sum);  // no-op when d_sum is still NULL
}
// Entry point: runs the device computation for 128 elements and prints the
// results on one line, separated by spaces.
// The result array is zero-initialised so that a failed device run prints
// zeros rather than indeterminate stack contents.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const size_t kCount = 128;  // number of results; also the kernel's block size limit

    A h_insA;
    int h_sum[kCount] = { 0 };
    h_insA.run_device(h_sum, kCount);

    for (size_t i = 0; i < kCount; ++i) {
        std::cout << h_sum[i] << " ";
    }
    std::cout << std::endl;
    return 0;
}
// NOTE(review): from this line to the end of the visible source, the file
// repeats verbatim the definitions that already appear earlier (offset,
// A_device, A, A_device::func, KernelTest, A::freeMem, A::allocMem,
// A::run_device, main). Compiled as one translation unit this causes
// redefinition errors — confirm whether this second copy is a paste artifact.
// Brings namespace std into scope; main uses cout/endl unqualified.
using namespace std;
// Device-side global; KernelTest adds it to every element it writes.
__device__ int offset = 1 ;
/**
 * Aggregate that class A embeds and passes by value to KernelTest.
 *
 * It has no constructors, so it can be brace-initialised positionally;
 * keep the member order stable.
 */
struct A_device {
    int mH;             // set to 0 by A's constructor; not read in the visible code
    int mW;             // set to 0 by A's constructor; not read in the visible code
    int* m_d_Dynamic;   // buffer obtained from cudaMalloc in A::allocMem
    int mDynamicSize;   // size of the buffer in BYTES (used for malloc/cudaMalloc/cudaMemcpy)

    // Declared __device__ to match the out-of-class definition. Without the
    // qualifier here the declaration and definition disagree about the
    // execution space, and the function dereferences m_d_Dynamic, which is
    // only valid on the device.
    __device__ int func(int);
};
/**
 * Host-side owner of a small host buffer (m_h_Dynamic) and its device mirror
 * (d.m_d_Dynamic).
 *
 * The constructor allocates both buffers through allocMem() and the
 * destructor releases them through freeMem(). `d` is the part that is handed
 * by value to the kernel in run_device().
 */
class A {
public:
    A_device d;
    int* m_h_Dynamic;

    // Initialises `d` member by member instead of with a GNU compound
    // literal, which is not standard C++. The buffer holds four ints.
    A() : d(), m_h_Dynamic(NULL) {
        d.mH = 0;
        d.mW = 0;
        d.m_d_Dynamic = NULL;
        d.mDynamicSize = sizeof(int) * 4;
        allocMem();
    }

    ~A() { freeMem(); }

    void run_device(int* h_sum, size_t MAXN);

    // Fills the four host entries and uploads d.mDynamicSize bytes to the
    // device buffer. Does nothing (besides reporting) when either buffer is
    // missing, and reports a failed upload instead of ignoring it.
    void setValue() {
        if (m_h_Dynamic == NULL || d.m_d_Dynamic == NULL) {
            std::cerr << "A::setValue: buffers are not allocated" << std::endl;
            return;
        }

        m_h_Dynamic[0] = 1000;
        m_h_Dynamic[1] = 10000;
        m_h_Dynamic[2] = 100000;
        m_h_Dynamic[3] = 1000000;

        const cudaError_t status = cudaMemcpy(d.m_d_Dynamic, m_h_Dynamic,
                                              d.mDynamicSize,
                                              cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            std::cerr << "A::setValue: cudaMemcpy failed: "
                      << cudaGetErrorString(status) << std::endl;
        }
    }

private:
    // Copying is disabled (declared, never defined): both buffers are
    // released in the destructor, so a member-wise copy would free them twice.
    A(const A&);
    A& operator=(const A&);

    void freeMem();
    void allocMem();
};
// Device-side accessor: returns the entry of m_d_Dynamic selected by i modulo 4.
// The period of 4 is hard-coded. For a negative i the remainder is negative
// as well, so callers are expected to pass non-negative indices.
__device__
int
A_device::func(int i) {
    const int slot = i % 4;
    return *(m_d_Dynamic + slot);
}
// Kernel: thread t writes sum[t] = t + d_insA.func(t) + offset, where t is threadIdx.x.
//
// d_insA is taken by value, so the host-side A_device (including the device
// pointer it carries) is copied into the kernel's arguments at launch.
//
// Only threadIdx.x is used for indexing and there is no bounds check: the
// caller must make sure `sum` has at least blockDim.x elements.
__global__ void KernelTest(int* sum, A_device d_insA) {
    const unsigned int tid = threadIdx.x;
    const unsigned int value = tid + d_insA.func(tid) + offset;
    sum[tid] = value;
}
// Releases the host buffer and the device buffer owned by this object.
// Both pointers are reset to NULL afterwards so that a second call is
// harmless (free(NULL) and cudaFree(NULL) are no-ops), and a failing
// cudaFree is reported instead of being silently dropped.
__host__
void
A::freeMem() {
    free(m_h_Dynamic);
    m_h_Dynamic = NULL;

    const cudaError_t status = cudaFree(d.m_d_Dynamic);
    if (status != cudaSuccess) {
        std::cerr << "A::freeMem: cudaFree failed: "
                  << cudaGetErrorString(status) << std::endl;
    }
    d.m_d_Dynamic = NULL;
}
// Allocates d.mDynamicSize bytes on the host (m_h_Dynamic) and on the device
// (d.m_d_Dynamic). A failed allocation is reported on stderr and leaves the
// corresponding pointer NULL, so later code can detect it.
void
A::allocMem() {
    m_h_Dynamic = static_cast<int*>(malloc(d.mDynamicSize));
    if (m_h_Dynamic == NULL && d.mDynamicSize > 0) {
        std::cerr << "A::allocMem: host allocation of " << d.mDynamicSize
                  << " bytes failed" << std::endl;
    }

    const cudaError_t status =
        cudaMalloc(reinterpret_cast<void**>(&d.m_d_Dynamic), d.mDynamicSize);
    if (status != cudaSuccess) {
        // Do not rely on what cudaMalloc left in the pointer on failure.
        d.m_d_Dynamic = NULL;
        std::cerr << "A::allocMem: cudaMalloc failed: "
                  << cudaGetErrorString(status) << std::endl;
    }
}
// Uploads the lookup values (setValue), runs KernelTest on a temporary device
// array of MAXN ints and copies the result into h_sum.
//
//   h_sum  host array with room for at least MAXN ints
//   MAXN   number of ints in h_sum; 0 is accepted and leaves h_sum untouched
//
// The kernel is launched with a single block of at most 128 threads, so only
// the first min(MAXN, 128) entries are computed; the remaining ones are 0.
// Every CUDA call is checked; the first failure is reported on stderr and the
// temporary device array is released on every path.
void
A::run_device(int* h_sum, size_t MAXN) {
    setValue();
    if (MAXN == 0) {
        return;  // nothing to compute; a launch with 0 threads would be invalid
    }

    const size_t bytes = MAXN * sizeof(int);
    int* d_sum = NULL;

    cudaError_t status = cudaMalloc(reinterpret_cast<void**>(&d_sum), bytes);
    if (status == cudaSuccess) {
        status = cudaMemset(d_sum, 0, bytes);
    }
    if (status == cudaSuccess) {
        // Never start more threads than d_sum has elements: the kernel writes
        // sum[threadIdx.x] without a bounds check.
        const unsigned int threads =
            MAXN < 128 ? static_cast<unsigned int>(MAXN) : 128u;
        KernelTest<<<1, threads>>>(d_sum, d);
        status = cudaGetLastError();  // launch-configuration errors
    }
    if (status == cudaSuccess) {
        // Blocking copy; errors raised while the kernel ran surface here.
        status = cudaMemcpy(h_sum, d_sum, bytes, cudaMemcpyDeviceToHost);
    }
    if (status != cudaSuccess) {
        std::cerr << "A::run_device: CUDA error: "
                  << cudaGetErrorString(status) << std::endl;
    }

    cudaFree(d_sum);  // no-op when d_sum is still NULL
}
// Entry point: runs the device computation for 128 elements and prints the
// results on one line, separated by spaces.
// The result array is zero-initialised so that a failed device run prints
// zeros rather than indeterminate stack contents.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const size_t kCount = 128;  // number of results; also the kernel's block size limit

    A h_insA;
    int h_sum[kCount] = { 0 };
    h_insA.run_device(h_sum, kCount);

    for (size_t i = 0; i < kCount; ++i) {
        std::cout << h_sum[i] << " ";
    }
    std::cout << std::endl;
    return 0;
}