__global__ void matrixMul( float* A, float* B, float* C, int hA,int wA, int wB)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
float Csub = 0.0f;
for (int j=0;j<wA;j+=BLOCK_DIM)
{
__shared__ float AS[BLOCK_DIM][BLOCK_DIM];
__shared__ float BS[BLOCK_DIM][BLOCK_DIM];
if(((by*BLOCK_DIM+ty)<hA)&&((tx+j)<wA))
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
float Csub = 0.0f;
for (int j=0;j<wA;j+=BLOCK_DIM)
{
__shared__ float AS[BLOCK_DIM][BLOCK_DIM];
__shared__ float BS[BLOCK_DIM][BLOCK_DIM];
if(((by*BLOCK_DIM+ty)<hA)&&((tx+j)<wA))