source code, VecAdd.cu
VecAdd.cu
#include
<
iostream
>
#define MAXN 128
__global__ void VecAdd( float * A, float * B, float * C ) {
int i = threadIdx.x;
for ( ; i < MAXN; i += 128 ) {
C[i] = A[i] + B[i];
}
}
int main( int argc, char ** argv ) {
float h_A[MAXN], h_B[MAXN], h_C[MAXN];
float * d_A, * d_B, * d_C;
int i;
for ( i = 0 ; i < MAXN; i ++ ) {
h_A[i] = i;
h_B[i] = 1 ;
h_C[i] = 0 ;
}
int size = MAXN * sizeof ( float );
cudaMalloc( ( void ** ) & d_A, size );
cudaMalloc( ( void ** ) & d_B, size );
cudaMalloc( ( void ** ) & d_C, size );
cudaMemcpy( d_A, h_A, size, cudaMemcpyHostToDevice );
cudaMemcpy( d_B, h_B, size, cudaMemcpyHostToDevice );
VecAdd <<< 1 , 128 >>> ( d_A, d_B, d_C );
cudaMemcpy( h_C, d_C, size, cudaMemcpyDeviceToHost );
for ( i = 0 ; i < MAXN; i ++ ) {
std::cout << h_C[i] << " " ;
}
std::cout << std::endl;
return 0 ;
}
#define MAXN 128
__global__ void VecAdd( float * A, float * B, float * C ) {
int i = threadIdx.x;
for ( ; i < MAXN; i += 128 ) {
C[i] = A[i] + B[i];
}
}
int main( int argc, char ** argv ) {
float h_A[MAXN], h_B[MAXN], h_C[MAXN];
float * d_A, * d_B, * d_C;
int i;
for ( i = 0 ; i < MAXN; i ++ ) {
h_A[i] = i;
h_B[i] = 1 ;
h_C[i] = 0 ;
}
int size = MAXN * sizeof ( float );
cudaMalloc( ( void ** ) & d_A, size );
cudaMalloc( ( void ** ) & d_B, size );
cudaMalloc( ( void ** ) & d_C, size );
cudaMemcpy( d_A, h_A, size, cudaMemcpyHostToDevice );
cudaMemcpy( d_B, h_B, size, cudaMemcpyHostToDevice );
VecAdd <<< 1 , 128 >>> ( d_A, d_B, d_C );
cudaMemcpy( h_C, d_C, size, cudaMemcpyDeviceToHost );
for ( i = 0 ; i < MAXN; i ++ ) {
std::cout << h_C[i] << " " ;
}
std::cout << std::endl;
return 0 ;
}
Using the command to get all the intermediate files.
nvcc -keep VecAdd.cu
PTX code, VecAdd.ptx
VecAdd.ptx
.version
1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib // be
// nvopencc 3.2 built on 2010-11-03
// -----------------------------------------------------------
// Compiling /tmp/tmpxft_00007c2a_00000000-9_VecAdd.cpp3.i (/tmp/ccBI#.VBTmkR)
// -----------------------------------------------------------
// -----------------------------------------------------------
// Options:
// -----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)-
// -m2 (Report advisories)
// -----------------------------------------------------------
.file 1 " <command-line> "
.file 2 " /tmp/tmpxft_00007c2a_00000000-8_VecAdd.cudafe2.gpu "
.file 3 " /usr/lib/gcc/i486-linux-gnu/4.2.4/include/stddef.h "
.file 4 " /usr/local/cuda/bin/../include/crt/device_runtime.h "
.file 5 " /usr/local/cuda/bin/../include/host_defines.h "
.file 6 " /usr/local/cuda/bin/../include/builtin_types.h "
.file 7 " /usr/local/cuda/bin/../include/device_types.h "
.file 8 " /usr/local/cuda/bin/../include/driver_types.h "
.file 9 " /usr/local/cuda/bin/../include/surface_types.h "
.file 10 " /usr/local/cuda/bin/../include/texture_types.h "
.file 11 " /usr/local/cuda/bin/../include/vector_types.h "
.file 12 " /usr/local/cuda/bin/../include/device_launch_parameters.h "
.file 13 " /usr/local/cuda/bin/../include/crt/storage_class.h "
.file 14 " /usr/include/bits/types.h "
.file 15 " /usr/include/time.h "
.file 16 " /usr/local/cuda/bin/../include/texture_fetch_functions.h "
.file 17 " /usr/local/cuda/bin/../include/common_functions.h "
.file 18 " /usr/local/cuda/bin/../include/math_functions.h "
.file 19 " /usr/local/cuda/bin/../include/math_constants.h "
.file 20 " /usr/local/cuda/bin/../include/device_functions.h "
.file 21 " /usr/local/cuda/bin/../include/sm_11_atomic_functions.h "
.file 22 " /usr/local/cuda/bin/../include/sm_12_atomic_functions.h "
.file 23 " /usr/local/cuda/bin/../include/sm_13_double_functions.h "
.file 24 " /usr/local/cuda/bin/../include/sm_20_atomic_functions.h "
.file 25 " /usr/local/cuda/bin/../include/sm_20_intrinsics.h "
.file 26 " /usr/local/cuda/bin/../include/surface_functions.h "
.file 27 " /usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h "
.file 28 " VecAdd.cu "
.entry _Z6VecAddPfS_S_ (
.param .u32 __cudaparm__Z6VecAddPfS_S__A,
.param .u32 __cudaparm__Z6VecAddPfS_S__B,
.param .u32 __cudaparm__Z6VecAddPfS_S__C)
{
.reg .u32 % r < 20 > ;
.reg .f32 % f < 5 > ;
.reg .pred % p < 4 > ;
.loc 28 4 0
$LDWbegin__Z6VecAddPfS_S_:
.loc 28 5 0
cvt.s32.u16 % r1, % tid.x; // r1 = i , covert u16 to s32
mov.u32 % r2, 127 ; // r2 = 127
setp.gt.s32 % p1, % r1, % r2; // p1 = i > 127
@ % p1 bra $Lt_0_1282; // jmp p1
mov.s32 % r3, 255 ; // r3 = 255
sub.s32 % r4, % r3, % r1; // r4 = 255 - i
shr.s32 % r5, % r4, 31 ; // r5 = r4 >>> 31
mov.s32 % r6, 127 ; // r6 = 127
and.b32 % r7, % r5, % r6; // r7 = ((255 - i) >>> 31) & 127
add.s32 % r8, % r7, % r4; // r8 = 255 - i + (((255 - i) >>> 31) & 127)
shr.s32 % r9, % r8, 7 ; // r9 = r8 >>> 7
mul24.lo.u32 % r10, % r1, 4 ; // offset for &A[i] because sizeof(float)=4
ld.param.u32 % r11, [__cudaparm__Z6VecAddPfS_S__A]; // A[0]
add.u32 % r12, % r10, % r11; // addr for &A[i] because sizeof(float)=4
add.u32 % r13, % r11, 508 ; // 512-4, addr for last byte of 128*4 bytes, shows the MAXN here
ld.param.u32 % r14, [__cudaparm__Z6VecAddPfS_S__B]; // B[0]
add.u32 % r15, % r14, % r10; // addr for &B[i] because sizeof(float)=4
ld.param.u32 % r16, [__cudaparm__Z6VecAddPfS_S__C]; // C[0]
add.u32 % r17, % r16, % r10; // addr for &C[i] because sizeof(float)=4
mov.s32 % r18, % r9; // r18 = r9
$Lt_0_1794:
// <loop> Loop body line 5, nesting depth: 1, estimated iterations: unknown
.loc 28 7 0
ld. global .f32 % f1, [ % r12 + 0 ];
ld. global .f32 % f2, [ % r15 + 0 ];
add.f32 % f3, % f1, % f2; // C[i] = A[i] +B[i]
st. global .f32 [ % r17 + 0 ], % f3;
add.u32 % r17, % r17, 512 ;
add.u32 % r15, % r15, 512 ;
add.u32 % r12, % r12, 512 ;
setp.le.u32 % p2, % r12, % r13;
@ % p2 bra $Lt_0_1794;
$Lt_0_1282:
.loc 28 9 0
exit;
$LDWend__Z6VecAddPfS_S_:
} // _Z6VecAddPfS_S_
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib // be
// nvopencc 3.2 built on 2010-11-03
// -----------------------------------------------------------
// Compiling /tmp/tmpxft_00007c2a_00000000-9_VecAdd.cpp3.i (/tmp/ccBI#.VBTmkR)
// -----------------------------------------------------------
// -----------------------------------------------------------
// Options:
// -----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)-
// -m2 (Report advisories)
// -----------------------------------------------------------
.file 1 " <command-line> "
.file 2 " /tmp/tmpxft_00007c2a_00000000-8_VecAdd.cudafe2.gpu "
.file 3 " /usr/lib/gcc/i486-linux-gnu/4.2.4/include/stddef.h "
.file 4 " /usr/local/cuda/bin/../include/crt/device_runtime.h "
.file 5 " /usr/local/cuda/bin/../include/host_defines.h "
.file 6 " /usr/local/cuda/bin/../include/builtin_types.h "
.file 7 " /usr/local/cuda/bin/../include/device_types.h "
.file 8 " /usr/local/cuda/bin/../include/driver_types.h "
.file 9 " /usr/local/cuda/bin/../include/surface_types.h "
.file 10 " /usr/local/cuda/bin/../include/texture_types.h "
.file 11 " /usr/local/cuda/bin/../include/vector_types.h "
.file 12 " /usr/local/cuda/bin/../include/device_launch_parameters.h "
.file 13 " /usr/local/cuda/bin/../include/crt/storage_class.h "
.file 14 " /usr/include/bits/types.h "
.file 15 " /usr/include/time.h "
.file 16 " /usr/local/cuda/bin/../include/texture_fetch_functions.h "
.file 17 " /usr/local/cuda/bin/../include/common_functions.h "
.file 18 " /usr/local/cuda/bin/../include/math_functions.h "
.file 19 " /usr/local/cuda/bin/../include/math_constants.h "
.file 20 " /usr/local/cuda/bin/../include/device_functions.h "
.file 21 " /usr/local/cuda/bin/../include/sm_11_atomic_functions.h "
.file 22 " /usr/local/cuda/bin/../include/sm_12_atomic_functions.h "
.file 23 " /usr/local/cuda/bin/../include/sm_13_double_functions.h "
.file 24 " /usr/local/cuda/bin/../include/sm_20_atomic_functions.h "
.file 25 " /usr/local/cuda/bin/../include/sm_20_intrinsics.h "
.file 26 " /usr/local/cuda/bin/../include/surface_functions.h "
.file 27 " /usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h "
.file 28 " VecAdd.cu "
.entry _Z6VecAddPfS_S_ (
.param .u32 __cudaparm__Z6VecAddPfS_S__A,
.param .u32 __cudaparm__Z6VecAddPfS_S__B,
.param .u32 __cudaparm__Z6VecAddPfS_S__C)
{
.reg .u32 % r < 20 > ;
.reg .f32 % f < 5 > ;
.reg .pred % p < 4 > ;
.loc 28 4 0
$LDWbegin__Z6VecAddPfS_S_:
.loc 28 5 0
cvt.s32.u16 % r1, % tid.x; // r1 = i , covert u16 to s32
mov.u32 % r2, 127 ; // r2 = 127
setp.gt.s32 % p1, % r1, % r2; // p1 = i > 127
@ % p1 bra $Lt_0_1282; // jmp p1
mov.s32 % r3, 255 ; // r3 = 255
sub.s32 % r4, % r3, % r1; // r4 = 255 - i
shr.s32 % r5, % r4, 31 ; // r5 = r4 >>> 31
mov.s32 % r6, 127 ; // r6 = 127
and.b32 % r7, % r5, % r6; // r7 = ((255 - i) >>> 31) & 127
add.s32 % r8, % r7, % r4; // r8 = 255 - i + (((255 - i) >>> 31) & 127)
shr.s32 % r9, % r8, 7 ; // r9 = r8 >>> 7
mul24.lo.u32 % r10, % r1, 4 ; // offset for &A[i] because sizeof(float)=4
ld.param.u32 % r11, [__cudaparm__Z6VecAddPfS_S__A]; // A[0]
add.u32 % r12, % r10, % r11; // addr for &A[i] because sizeof(float)=4
add.u32 % r13, % r11, 508 ; // 512-4, addr for last byte of 128*4 bytes, shows the MAXN here
ld.param.u32 % r14, [__cudaparm__Z6VecAddPfS_S__B]; // B[0]
add.u32 % r15, % r14, % r10; // addr for &B[i] because sizeof(float)=4
ld.param.u32 % r16, [__cudaparm__Z6VecAddPfS_S__C]; // C[0]
add.u32 % r17, % r16, % r10; // addr for &C[i] because sizeof(float)=4
mov.s32 % r18, % r9; // r18 = r9
$Lt_0_1794:
// <loop> Loop body line 5, nesting depth: 1, estimated iterations: unknown
.loc 28 7 0
ld. global .f32 % f1, [ % r12 + 0 ];
ld. global .f32 % f2, [ % r15 + 0 ];
add.f32 % f3, % f1, % f2; // C[i] = A[i] +B[i]
st. global .f32 [ % r17 + 0 ], % f3;
add.u32 % r17, % r17, 512 ;
add.u32 % r15, % r15, 512 ;
add.u32 % r12, % r12, 512 ;
setp.le.u32 % p2, % r12, % r13;
@ % p2 bra $Lt_0_1794;
$Lt_0_1282:
.loc 28 9 0
exit;
$LDWend__Z6VecAddPfS_S_:
} // _Z6VecAddPfS_S_