#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "time.h"
#include <iostream>
#include "cublas_v2.h"
#include "time.h"
#include <iostream>
using namespace std;
// Minimum of two values. Every macro argument is parenthesized so operands
// that contain lower-precedence operators (e.g. `x & y`, `c ? p : q`) still
// expand to the intended comparison. NOTE: each argument may be evaluated
// twice, so do not pass expressions with side effects.
#define imin(a,b) (((a) < (b)) ? (a) : (b))
// Element count: the dot kernel processes indices below N.
const int N = 31 * 1024;
// Length of the kernel's per-block shared `cache` array; presumably also the
// block size used at launch -- confirm against the launch site.
const int threadsPerBlock = 256;
// ceil(N / threadsPerBlock), capped at 32; presumably the grid size used at
// launch -- confirm against the launch site.
const int blocksperGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
__global__ void dot(float *a, float *b, float *c)
{
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float temp = 0;
while (tid<N)
{
temp += a[tid] * b[tid];