GitHub - ROCm-Developer-Tools/HIP-Examples: Examples for HIP
环境
gpu: Radeon Vii vega 7nm
操作系统:ubuntu 20.04
平台:ROCm 5.4.3
编译器:hipcc
加入了一样这:
asm volatile ("s_waitcnt lgkmcnt(0)");
功能:
影响:
Makefile
HIP_PATH?= $(wildcard /opt/rocm/hip)
ifeq (,$(HIP_PATH))
HIP_PATH=/opt/rocm
endif
HIPCC=$(HIP_PATH)/bin/hipcc
TARGET=hcc
SOURCES = vectoradd_hip.cpp
OBJECTS = $(SOURCES:.cpp=.o)
EXECUTABLE=./vectoradd_hip.exe
.PHONY: test
all: $(EXECUTABLE) test hello.s
CXXFLAGS =-g
CXX=$(HIPCC)
$(EXECUTABLE): $(OBJECTS)
$(HIPCC) $(OBJECTS) -o $@
hello.s: vectoradd_hip.cpp
$(HIPCC) $^ -S -o $@
test: $(EXECUTABLE)
$(EXECUTABLE)
clean:
rm -f $(EXECUTABLE)
rm -f $(OBJECTS)
rm -f $(HIP_PATH)/src/*.o
rm -f ./hello.s
vectoradd_hip.cpp
/*
Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <assert.h>
#include <stdio.h>
#include <algorithm>
#include <stdlib.h>
#include<iostream>
#include "hip/hip_runtime.h"
#ifdef NDEBUG
#define HIP_ASSERT(x) x
#else
#define HIP_ASSERT(x) (assert((x)==hipSuccess))
#endif
#define WIDTH 1024
#define HEIGHT 1024
#define NUM (WIDTH*HEIGHT)
#define THREADS_PER_BLOCK_X 16
#define THREADS_PER_BLOCK_Y 16
#define THREADS_PER_BLOCK_Z 1
__global__ void
vectoradd_float(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
asm volatile ("s_waitcnt lgkmcnt(0)");
if ( i < (width * height)) {
a[i] = b[i] + c[i];
asm volatile ("s_waitcnt lgkmcnt(0)");
}
asm volatile ("s_waitcnt lgkmcnt(0)");
}
#if 0
__kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
int x = blockDimX * blockIdx.x + threadIdx.x;
int y = blockDimY * blockIdy.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
a[i] = b[i] + c[i];
}
}
#endif
using namespace std;
int main() {
float* hostA;
float* hostB;
float* hostC;
float* deviceA;
float* deviceB;
float* deviceC;
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
cout << " System major " << devProp.major << endl;
cout << " agent prop name " << devProp.name << endl;
cout << "hip Device prop succeeded " << endl ;
int i;
int errors;
hostA = (float*)malloc(NUM * sizeof(float));
hostB = (float*)malloc(NUM * sizeof(float));
hostC = (float*)malloc(NUM * sizeof(float));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (float)i;
hostC[i] = (float)i*100.0f;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(float)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(float)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(float), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(float), hipMemcpyHostToDevice));
hipLaunchKernelGGL(vectoradd_float,
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
deviceA ,deviceB ,deviceC ,WIDTH ,HEIGHT);
HIP_ASSERT(hipMemcpy(hostA, deviceA, NUM*sizeof(float), hipMemcpyDeviceToHost));
// verify the results
errors = 0;
for (i = 0; i < NUM; i++) {
if (hostA[i] != (hostB[i] + hostC[i])) {
errors++;
}
}
if (errors!=0) {
printf("FAILED: %d errors\n",errors);
} else {
printf ("PASSED!\n");
}
HIP_ASSERT(hipFree(deviceA));
HIP_ASSERT(hipFree(deviceB));
HIP_ASSERT(hipFree(deviceC));
free(hostA);
free(hostB);
free(hostC);
//hipResetDefaultAccelerator();
return errors;
}
==================================================================
其中的hip cuda kernel 转换成 asm为:
# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx906
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx906"
.protected _Z15vectoradd_floatPfPKfS1_ii ; -- Begin function _Z15vectoradd_floatPfPKfS1_ii
.globl _Z15vectoradd_floatPfPKfS1_ii
.p2align 8
.type _Z15vectoradd_floatPfPKfS1_ii,@function
_Z15vectoradd_floatPfPKfS1_ii: ; @_Z15vectoradd_floatPfPKfS1_ii
; %bb.0:
s_load_dword s2, s[4:5], 0x4
s_load_dwordx2 s[0:1], s[6:7], 0x18
s_waitcnt lgkmcnt(0)
s_lshr_b32 s3, s2, 16
s_mul_i32 s9, s9, s3
v_add_u32_e32 v1, s9, v1
v_mul_lo_u32 v1, v1, s0
s_and_b32 s2, s2, 0xffff
s_mul_i32 s8, s8, s2
s_mul_i32 s0, s1, s0
v_add3_u32 v0, s8, v0, v1
v_cmp_gt_i32_e32 vcc, s0, v0
s_and_saveexec_b64 s[0:1], vcc
s_cbranch_execz .LBB0_2
; %bb.1:
s_load_dwordx2 s[0:1], s[6:7], 0x10
s_load_dwordx2 s[2:3], s[6:7], 0x8
v_ashrrev_i32_e32 v1, 31, v0
v_lshlrev_b64 v[0:1], 2, v[0:1]
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v3, s1
v_add_co_u32_e32 v2, vcc, s0, v0
v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
v_mov_b32_e32 v5, s3
v_add_co_u32_e32 v4, vcc, s2, v0
v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
global_load_dword v6, v[4:5], off
global_load_dword v7, v[2:3], off
s_load_dwordx2 s[0:1], s[6:7], 0x0
s_waitcnt lgkmcnt(0)
v_mov_b32_e32 v2, s1
v_add_co_u32_e32 v0, vcc, s0, v0
v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
s_waitcnt vmcnt(0)
v_add_f32_e32 v2, v6, v7
global_store_dword v[0:1], v2, off
.LBB0_2:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z15vectoradd_floatPfPKfS1_ii
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 32
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_flat_scratch_init 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 1
.amdhsa_next_free_vgpr 8
.amdhsa_next_free_sgpr 10
.amdhsa_reserve_flat_scratch 0
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size _Z15vectoradd_floatPfPKfS1_ii, .Lfunc_end0-_Z15vectoradd_floatPfPKfS1_ii
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 192
; NumSgprs: 12
; NumVgprs: 8
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 1
; NumSGPRsForWavesPerEU: 12
; NumVGPRsForWavesPerEU: 8
; Occupancy: 10
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 1
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.4.3 23045 a29fe425c7b0e5aba97ed2f95f61fd5ecba68aed)"
.section ".note.GNU-stack"
.addrsig
.amdgpu_metadata
---
amdhsa.kernels:
- .args:
- .address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .access: read_only
.address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .access: read_only
.address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
- .offset: 24
.size: 4
.value_kind: by_value
- .offset: 28
.size: 4
.value_kind: by_value
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 32
.language: OpenCL C
.language_version:
- 2
- 0
.max_flat_workgroup_size: 1024
.name: _Z15vectoradd_floatPfPKfS1_ii
.private_segment_fixed_size: 0
.sgpr_count: 12
.sgpr_spill_count: 0
.symbol: _Z15vectoradd_floatPfPKfS1_ii.kd
.vgpr_count: 8
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx906
amdhsa.version:
- 1
- 1
...
.end_amdgpu_metadata
# __CLANG_OFFLOAD_BUNDLE____END__ hip-amdgcn-amd-amdhsa-gfx906
对应的hip c语言kernel为:
__global__ void
vectoradd_float(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
if ( i < (width * height)) {
a[i] = b[i] + c[i];
}
}
分别做一下注释:
__global__ void
vectoradd_float(float* __restrict__ a, const float* __restrict__ b, const float* __restrict__ c, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int i = y * width + x;
asm volatile ("s_waitcnt lgkmcnt(0)");
if ( i < (width * height)) {
a[i] = b[i] + c[i];
asm volatile ("s_waitcnt lgkmcnt(0)");
}
asm volatile ("s_waitcnt lgkmcnt(0)");
}