//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21124049
// Cuda compilation tools, release 8.0, V8.0.44
// Based on LLVM 3.4svn
//
.version 5.0
.target sm_20
.address_size 64
//
// Breadth-first-search expansion step: each thread handles one graph node.
//
// Demangled signature (from the Itanium-mangled entry name):
//     Kernel(Node*, int*, bool*, bool*, int*, bool*, int)
//
// Parameter roles as used by the code below (names are descriptive only;
// the original source names are not visible here):
//     param_0  nodes    8-byte records, two 32-bit fields: +0 first edge index,
//                       +4 edge count
//     param_1  edges    32-bit neighbour node ids
//     param_2  mask     one byte per node; non-zero = process this node now
//     param_3  visited  one byte per node; non-zero = already reached
//     param_4  cost     one 32-bit value per node
//     param_5  flag     single byte, set to 1 when any neighbour was updated
//     param_6  n        node count (threads with index >= n exit immediately)
//
// Equivalent pseudo-C:
//     tid = ctaid.x * 256 + tid.x;
//     if (tid >= n || !mask[tid]) return;
//     mask[tid] = 0; visited[tid] = 1;
//     count = nodes[tid].count;
//     if (count < 1) return;
//     for (i = nodes[tid].first; i < nodes[tid].first + count; i++) {
//         id = edges[i];
//         if (!visited[id]) { cost[id] = cost[tid] + 1; mask[id] = 1; *flag = 1; }
//     }
//
// NOTE(review): the thread index uses a fixed shift by 8, i.e. it presumes
// 256 threads per block - confirm against the launch configuration.
//
// .globl _Z6KernelP4NodePiPbS2_S1_S2_i
.visible .entry _Z6KernelP4NodePiPbS2_S1_S2_i(
    .param .u64 _Z6KernelP4NodePiPbS2_S1_S2_i_param_0,
    .param .u64 _Z6KernelP4NodePiPbS2_S1_S2_i_param_1,
    .param .u64 _Z6KernelP4NodePiPbS2_S1_S2_i_param_2,
    .param .u64 _Z6KernelP4NodePiPbS2_S1_S2_i_param_3,
    .param .u64 _Z6KernelP4NodePiPbS2_S1_S2_i_param_4,
    .param .u64 _Z6KernelP4NodePiPbS2_S1_S2_i_param_5,
    .param .u32 _Z6KernelP4NodePiPbS2_S1_S2_i_param_6
)
{
    .reg .pred %p<6>;
    .reg .b16 %rs<6>;
    .reg .b32 %r<26>;
    .reg .b64 %rd<33>;

    // Fetch the kernel arguments.
    ld.param.u64 %rd10, [_Z6KernelP4NodePiPbS2_S1_S2_i_param_0];
    ld.param.u64 %rd11, [_Z6KernelP4NodePiPbS2_S1_S2_i_param_1];
    ld.param.u64 %rd12, [_Z6KernelP4NodePiPbS2_S1_S2_i_param_2];
    ld.param.u64 %rd15, [_Z6KernelP4NodePiPbS2_S1_S2_i_param_3];
    ld.param.u64 %rd13, [_Z6KernelP4NodePiPbS2_S1_S2_i_param_4];
    ld.param.u64 %rd14, [_Z6KernelP4NodePiPbS2_S1_S2_i_param_5];
    ld.param.u32 %r12, [_Z6KernelP4NodePiPbS2_S1_S2_i_param_6];
    cvta.to.global.u64 %rd1, %rd15;         // %rd1 = visited (global address)

    // %r1 = ctaid.x * 256 + tid.x; leave if it is past the node count.
    mov.u32 %r13, %ctaid.x;
    shl.b32 %r14, %r13, 8;
    mov.u32 %r15, %tid.x;
    add.s32 %r1, %r14, %r15;
    setp.ge.s32 %p1, %r1, %r12;
    @%p1 bra BB0_7;

    // A non-zero mask[tid] byte selects this node for processing in this
    // launch; a zero byte means there is nothing to do for this thread.
    cvt.s64.s32 %rd2, %r1;                  // %rd2 = (int64) tid
    cvta.to.global.u64 %rd16, %rd12;        // %rd16 = mask
    add.s64 %rd3, %rd16, %rd2;              // %rd3 = &mask[tid]
    ld.global.u8 %rs1, [%rd3];
    setp.eq.s16 %p2, %rs1, 0;
    @%p2 bra BB0_7;

    // mask[tid] = 0; visited[tid] = 1.
    cvta.to.global.u64 %rd17, %rd10;        // %rd17 = nodes
    mov.u16 %rs2, 0;
    st.global.u8 [%rd3], %rs2;
    add.s64 %rd18, %rd1, %rd2;
    mov.u16 %rs3, 1;                        // %rs3 = 1, reused for every "set" store
    st.global.u8 [%rd18], %rs3;

    // Node records are 8 bytes wide; %r19 = edge count (field at +4).
    shl.b64 %rd19, %rd2, 3;
    add.s64 %rd20, %rd17, %rd19;            // %rd20 = &nodes[tid]
    add.s64 %rd4, %rd20, 4;                 // %rd4  = &nodes[tid] + 4
    ld.global.u32 %r19, [%rd20+4];
    setp.lt.s32 %p3, %r19, 1;               // no edges -> nothing to expand
    @%p3 bra BB0_7;

    // Loop setup: %rd5 = &cost[tid], %r24 = first edge index (field at +0),
    // %rd32 = &edges[first], %rd31 = flag, %r23 = loop counter i.
    cvta.to.global.u64 %rd21, %rd13;        // %rd21 = cost
    shl.b64 %rd22, %rd2, 2;
    add.s64 %rd5, %rd21, %rd22;
    ld.global.u32 %r24, [%rd4+-4];
    cvta.to.global.u64 %rd23, %rd11;        // %rd23 = edges
    mul.wide.s32 %rd24, %r24, 4;
    add.s64 %rd32, %rd23, %rd24;
    cvta.to.global.u64 %rd31, %rd14;
    mov.u32 %r23, %r24;

BB0_4:
    // %r25 carries the current value of the first-edge index into BB0_6.
    mov.u32 %r21, %r24;
    mov.u32 %r25, %r21;
    ld.global.s32 %rd8, [%rd32];            // %rd8 = id = edges[i], sign-extended
    add.s64 %rd25, %rd1, %rd8;
    ld.global.u8 %rs4, [%rd25];             // visited[id]
    setp.ne.s16 %p4, %rs4, 0;
    @%p4 bra BB0_6;                         // already reached -> skip the update

    // cost[id] = cost[tid] + 1; mask[id] = 1; *flag = 1.
    ld.global.u32 %r16, [%rd5];
    add.s32 %r17, %r16, 1;
    shl.b64 %rd27, %rd8, 2;
    add.s64 %rd28, %rd21, %rd27;
    st.global.u32 [%rd28], %r17;
    add.s64 %rd30, %rd16, %rd8;
    st.global.u8 [%rd30], %rs3;
    st.global.u8 [%rd31], %rs3;
    // The first-edge index is re-read after the stores above; the edge count
    // in %r19 is not re-read.
    ld.global.u32 %r25, [%rd4+-4];

BB0_6:
    // Advance to the next edge and re-evaluate i < first + count.
    mov.u32 %r24, %r25;
    add.s64 %rd32, %rd32, 4;
    add.s32 %r18, %r24, %r19;
    add.s32 %r23, %r23, 1;
    setp.lt.s32 %p5, %r23, %r18;
    @%p5 bra BB0_4;

BB0_7:
    ret;
}
// Program flowchart: bfs.sass