1. Using mma
1.1 wmma::fragment
- First approach: wmma::fragment from mma.h. This API works at a relatively coarse granularity (one warp computes one whole tile); for finer-grained tensor-core execution you have to drop down to PTX mma instructions (see the sketch after the listing below).
```cpp
#include <cuda_fp16.h>
#include <mma.h>
#include <stdio.h>
#include "cuda_runtime.h"

namespace {
constexpr int kWarpSize = 32;
// Shape of the tile that a single warp-level wmma operation computes.
constexpr int kM = 16;
constexpr int kN = 16;
constexpr int kK = 16;
// Overall problem size:
// C[kMTotal x kNTotal] = A[kMTotal x kKTotal] * B[kKTotal x kNTotal].
constexpr int kMTotal = 16 * 64;
constexpr int kNTotal = 16 * 64;
constexpr int kKTotal = 16 * 32;
}  // namespace

using namespace nvcuda;

// One warp computes one kM x kN tile of C.
// Layout assumptions, matching the fragment declarations below:
// A is row-major, B is column-major, C is row-major.
__global__ void WmmaF16TensorCore(half* A, half* B, float* C) {
  // index_x: warp index along M; index_y: thread index along N.
  int index_x = (blockIdx.x * blockDim.x + threadIdx.x) / kWarpSize;
  int index_y = (blockIdx.y * blockDim.y + threadIdx.y);

  wmma::fragment<wmma::matrix_a, kM, kN, kK, half, wmma::row_major> a_frag;
  wmma::fragment<wmma::matrix_b, kM, kN, kK, half, wmma::col_major> b_frag;
  wmma::fragment<wmma::accumulator, kM, kN, kK, float> ab_frag;
  wmma::fill_fragment(ab_frag, 0.0f);

  int a_col, a_row, b_col, b_row;
  a_row = index_x * kM;  // row offset of this warp's A / C tile
  b_row = index_y * kN;  // column offset of this warp's B / C tile
  for (int k = 0; k < kKTotal; k += kK) {
    a_col = b_col = k;
    if (a_row < kMTotal && b_row < kNTotal) {
      // Load one kM x kK tile of A and one kK x kN tile of B, then
      // accumulate ab_frag += a_frag * b_frag on the tensor cores.
      wmma::load_matrix_sync(a_frag, A + a_row * kKTotal + a_col, kKTotal);
      wmma::load_matrix_sync(b_frag, B + b_row * kKTotal + b_col, kKTotal);
      wmma::mma_sync(ab_frag, a_frag, b_frag, ab_frag);
    }
  }
  // Write the accumulated kM x kN tile back to C.
  if (a_row < kMTotal && b_row < kNTotal) {
    wmma::store_matrix_sync(C + a_row * kNTotal + b_row, ab_frag, kNTotal,
                            wmma::mem_row_major);
  }
}
```
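The kernel maps one warp per 16x16 tile of C through index_x and index_y, so the launch must provide kMTotal/kM = 64 warps along x and kNTotal/kN = 64 threads along y in total. A minimal host-side launch sketch follows; the buffer names d_a/d_b/d_c and the initialization step are illustrative, not from the original:

```cpp
int main() {
  half *d_a, *d_b;
  float *d_c;
  cudaMalloc(&d_a, kMTotal * kKTotal * sizeof(half));
  cudaMalloc(&d_b, kKTotal * kNTotal * sizeof(half));
  cudaMalloc(&d_c, kMTotal * kNTotal * sizeof(float));
  // ... fill d_a / d_b here, e.g. via cudaMemcpy from host buffers ...

  dim3 block(128, 4);  // 4 warps along x, 4 C-tiles along y per block
  dim3 grid((kMTotal / kM) / (block.x / kWarpSize),  // 64 / 4 = 16
            (kNTotal / kN) / block.y);               // 64 / 4 = 16
  WmmaF16TensorCore<<<grid, block>>>(d_a, d_b, d_c);
  cudaDeviceSynchronize();

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}
```

Build with a tensor-core-capable architecture, e.g. nvcc -arch=sm_70 or newer, since FP16 wmma requires Volta at minimum.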
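For the finer-grained PTX route mentioned at the top of this section, the warp-level mma.sync instruction exposes smaller shapes such as m16n8k16 and leaves fragment loading and storing entirely to the programmer. A minimal inline-asm sketch, assuming sm_80+ and that the caller has already placed the A/B fragments and the FP32 accumulator into per-thread registers in the layout the PTX ISA prescribes; the helper name and its parameters are illustrative:

```cpp
// mma.sync m16n8k16: one warp computes D(16x8,f32) += A(16x16,f16) * B(16x8,f16).
// Per thread: a holds 8 halves in 4 b32 registers, b holds 4 halves in
// 2 b32 registers, d holds 4 floats (accumulator input and output).
__device__ void MmaM16N8K16F16F32(const unsigned (&a)[4],
                                  const unsigned (&b)[2],
                                  float (&d)[4]) {
  asm volatile(
      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%0,%1,%2,%3};\n"
      : "+f"(d[0]), "+f"(d[1]), "+f"(d[2]), "+f"(d[3])
      : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]),
        "r"(b[0]), "r"(b[1]));
}
```

Unlike wmma, nothing here loads the operands for you; in practice the fragments are usually staged through shared memory (for example with ldmatrix) before issuing mma.sync.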