Analyse VecAdd.cu

source code, VecAdd.cu

ContractedBlock.gif ExpandedBlockStart.gif VecAdd.cu
 
   
#include < iostream >
#define MAXN 128

__global__
void VecAdd( float * A, float * B, float * C ) {
int i = threadIdx.x;
for ( ; i < MAXN; i += 128 ) {
C[i]
= A[i] + B[i];
}
}

int main( int argc, char ** argv ) {
float h_A[MAXN], h_B[MAXN], h_C[MAXN];
float * d_A, * d_B, * d_C;
int i;
for ( i = 0 ; i < MAXN; i ++ ) {
h_A[i]
= i;
h_B[i]
= 1 ;
h_C[i]
= 0 ;
}

int size = MAXN * sizeof ( float );
cudaMalloc( (
void ** ) & d_A, size );
cudaMalloc( (
void ** ) & d_B, size );
cudaMalloc( (
void ** ) & d_C, size );

cudaMemcpy( d_A, h_A, size, cudaMemcpyHostToDevice );
cudaMemcpy( d_B, h_B, size, cudaMemcpyHostToDevice );

VecAdd
<<< 1 , 128 >>> ( d_A, d_B, d_C );

cudaMemcpy( h_C, d_C, size, cudaMemcpyDeviceToHost );

for ( i = 0 ; i < MAXN; i ++ ) {
std::cout
<< h_C[i] << " " ;
}

std::cout
<< std::endl;

return 0 ;
}

Using the command to get all the intermediate files.

nvcc -keep VecAdd.cu


PTX code, VecAdd.ptx

ContractedBlock.gif ExpandedBlockStart.gif VecAdd.ptx
 
   
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib // be
// nvopencc 3.2 built on 2010-11-03

// -----------------------------------------------------------
// Compiling /tmp/tmpxft_00007c2a_00000000-9_VecAdd.cpp3.i (/tmp/ccBI#.VBTmkR)
// -----------------------------------------------------------

// -----------------------------------------------------------
// Options:
// -----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)-
// -m2 (Report advisories)
// -----------------------------------------------------------

.file
1 " <command-line> "
.file
2 " /tmp/tmpxft_00007c2a_00000000-8_VecAdd.cudafe2.gpu "
.file
3 " /usr/lib/gcc/i486-linux-gnu/4.2.4/include/stddef.h "
.file
4 " /usr/local/cuda/bin/../include/crt/device_runtime.h "
.file
5 " /usr/local/cuda/bin/../include/host_defines.h "
.file
6 " /usr/local/cuda/bin/../include/builtin_types.h "
.file
7 " /usr/local/cuda/bin/../include/device_types.h "
.file
8 " /usr/local/cuda/bin/../include/driver_types.h "
.file
9 " /usr/local/cuda/bin/../include/surface_types.h "
.file
10 " /usr/local/cuda/bin/../include/texture_types.h "
.file
11 " /usr/local/cuda/bin/../include/vector_types.h "
.file
12 " /usr/local/cuda/bin/../include/device_launch_parameters.h "
.file
13 " /usr/local/cuda/bin/../include/crt/storage_class.h "
.file
14 " /usr/include/bits/types.h "
.file
15 " /usr/include/time.h "
.file
16 " /usr/local/cuda/bin/../include/texture_fetch_functions.h "
.file
17 " /usr/local/cuda/bin/../include/common_functions.h "
.file
18 " /usr/local/cuda/bin/../include/math_functions.h "
.file
19 " /usr/local/cuda/bin/../include/math_constants.h "
.file
20 " /usr/local/cuda/bin/../include/device_functions.h "
.file
21 " /usr/local/cuda/bin/../include/sm_11_atomic_functions.h "
.file
22 " /usr/local/cuda/bin/../include/sm_12_atomic_functions.h "
.file
23 " /usr/local/cuda/bin/../include/sm_13_double_functions.h "
.file
24 " /usr/local/cuda/bin/../include/sm_20_atomic_functions.h "
.file
25 " /usr/local/cuda/bin/../include/sm_20_intrinsics.h "
.file
26 " /usr/local/cuda/bin/../include/surface_functions.h "
.file
27 " /usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h "
.file
28 " VecAdd.cu "


.entry _Z6VecAddPfS_S_ (
.param .u32 __cudaparm__Z6VecAddPfS_S__A,
.param .u32 __cudaparm__Z6VecAddPfS_S__B,
.param .u32 __cudaparm__Z6VecAddPfS_S__C)
{
.reg .u32
% r < 20 > ;
.reg .f32
% f < 5 > ;
.reg .pred
% p < 4 > ;
.loc
28 4 0
$LDWbegin__Z6VecAddPfS_S_:
.loc
28 5 0
cvt.s32.u16
% r1, % tid.x; // r1 = i , covert u16 to s32
mov.u32 % r2, 127 ; // r2 = 127
setp.gt.s32 % p1, % r1, % r2; // p1 = i > 127
@ % p1 bra $Lt_0_1282; // jmp p1
mov.s32 % r3, 255 ; // r3 = 255
sub.s32 % r4, % r3, % r1; // r4 = 255 - i
shr.s32 % r5, % r4, 31 ; // r5 = r4 >>> 31
mov.s32 % r6, 127 ; // r6 = 127
and.b32 % r7, % r5, % r6; // r7 = ((255 - i) >>> 31) & 127
add.s32 % r8, % r7, % r4; // r8 = 255 - i + (((255 - i) >>> 31) & 127)
shr.s32 % r9, % r8, 7 ; // r9 = r8 >>> 7
mul24.lo.u32 % r10, % r1, 4 ; // offset for &A[i] because sizeof(float)=4
ld.param.u32 % r11, [__cudaparm__Z6VecAddPfS_S__A]; // A[0]
add.u32 % r12, % r10, % r11; // addr for &A[i] because sizeof(float)=4
add.u32 % r13, % r11, 508 ; // 512-4, addr for last byte of 128*4 bytes, shows the MAXN here
ld.param.u32 % r14, [__cudaparm__Z6VecAddPfS_S__B]; // B[0]
add.u32 % r15, % r14, % r10; // addr for &B[i] because sizeof(float)=4
ld.param.u32 % r16, [__cudaparm__Z6VecAddPfS_S__C]; // C[0]
add.u32 % r17, % r16, % r10; // addr for &C[i] because sizeof(float)=4
mov.s32 % r18, % r9; // r18 = r9
$Lt_0_1794:
// <loop> Loop body line 5, nesting depth: 1, estimated iterations: unknown
.loc 28 7 0
ld.
global .f32 % f1, [ % r12 + 0 ];
ld.
global .f32 % f2, [ % r15 + 0 ];
add.f32
% f3, % f1, % f2; // C[i] = A[i] +B[i]
st. global .f32 [ % r17 + 0 ], % f3;
add.u32
% r17, % r17, 512 ;
add.u32
% r15, % r15, 512 ;
add.u32
% r12, % r12, 512 ;
setp.le.u32
% p2, % r12, % r13;
@
% p2 bra $Lt_0_1794;
$Lt_0_1282:
.loc
28 9 0
exit;
$LDWend__Z6VecAddPfS_S_:
}
// _Z6VecAddPfS_S_

转载于:https://www.cnblogs.com/soulnearby/archive/2011/04/07/2008229.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值