说明
矩阵 A 的大小为 M × N，矩阵 B 的大小为 N × M。
代码
#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
// Lightweight descriptor for a matrix stored in a flat float buffer.
// The accessors in this file address element (row, col) as
// elements[row * stride + col].
typedef struct {
// presumably the number of columns -- not used in the visible code; confirm
int width;
// presumably the number of rows -- not used in the visible code; confirm
int height;
// distance, in floats, between the starts of two consecutive rows
int stride;
// pointer to the first element of the flat buffer
float* elements;
} Matrix;
// Compile-time problem configuration.
// BLOCK_SIZE: presumably the edge length of the tile / thread block used by
// the kernel -- its use is outside the visible part of the file; confirm.
#define BLOCK_SIZE 2
// N and M: matrix dimensions. The note at the top of this file describes
// matrix A as M x N and matrix B as N x M.
#define N 4
#define M 8
// Reads a single element of a matrix.
//   A   - matrix descriptor (passed by value)
//   row - row index of the element
//   col - column index of the element
// Returns the float stored at flat offset row * A.stride + col in A.elements.
// No bounds checking is performed.
__device__ float GetElement(const Matrix A, int row, int col) {
    const int offset = row * A.stride + col;
    const float* cell = A.elements + offset;
    return *cell;
}
// Writes a single element of a matrix.
//   A     - matrix descriptor; it is passed by value, but the write goes
//           through A.elements, so it lands in the buffer the caller's
//           descriptor points to
//   row   - row index of the element
//   col   - column index of the element
//   value - float to store at flat offset row * A.stride + col
// No bounds checking is performed.
__device__ void SetElement(Matrix A, int row, int col, float value) {
    const int offset = row * A.stride + col;
    float* cell = A.elements + offset;
    *cell = value;
}
__device__ Matrix GetSubMatrix(Matrix A,