问题描述:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <iostream>
using namespace std;
// Euclidean norm (L2 length) of the vector R.
//
// a: pointer to at least `n` floats.
// n: number of elements to include; defaults to 8192, the vector size used by
//    this program, so existing single-argument calls behave as before.
// Returns sqrt(a[0]^2 + ... + a[n-1]^2), or 0 when n <= 0.
float length(float *a, int n = 8192)
{
    float sumOfSquares = 0.0f;
    for (int i = 0; i < n; i++) {
        sumOfSquares += a[i] * a[i];
    }
    // sqrtf keeps the whole computation in single precision.
    return sqrtf(sumOfSquares);
}
// Largest element of a row-pointer matrix.
//
// b:    array of `rows` pointers, each pointing to at least `cols` floats.
// rows: number of rows to scan (defaults to 8192, the size used by this program).
// cols: number of columns to scan per row (defaults to 8192).
// Returns the maximum element, or 0 when the matrix is empty (rows <= 0 or cols <= 0).
float qmax(float **b, int rows = 8192, int cols = 8192) {
    if (rows <= 0 || cols <= 0) {
        return 0.0f;
    }
    // Seed with a real element rather than 0 so that matrices containing only
    // negative values still report their true maximum.
    float best = b[0][0];
    for (int i = 0; i < rows; i++) {
        const float *row = b[i];
        for (int j = 0; j < cols; j++) {
            if (row[j] > best) {
                best = row[j];
            }
        }
    }
    return best;
}
// Smallest element of a row-pointer matrix.
//
// b:    array of `rows` pointers, each pointing to at least `cols` floats.
// rows: number of rows to scan (defaults to 8192, the size used by this program).
// cols: number of columns to scan per row (defaults to 8192).
// Returns the minimum element, or 0 when the matrix is empty (rows <= 0 or cols <= 0).
float qmin(float **b, int rows = 8192, int cols = 8192) {
    if (rows <= 0 || cols <= 0) {
        return 0.0f;
    }
    // Seed with a real element rather than 0; starting from 0 would report 0
    // for any matrix whose elements are all positive.
    float best = b[0][0];
    for (int i = 0; i < rows; i++) {
        const float *row = b[i];
        for (int j = 0; j < cols; j++) {
            if (row[j] < best) {
                best = row[j];
            }
        }
    }
    return best;
}
// CPU reference implementation.
//
// Fills a vector R of 8192 pseudo-random values in [0, 40], prints ||R||,
// builds the 8192 x 8192 matrix b[i][j] = R[i] * R[j] / ||R||, and prints the
// matrix maximum, minimum and the elapsed CPU time in seconds.
// Returns 0 on success and 1 when a host allocation fails.
int main() {
    const int N = 8192;
    clock_t start = clock();

    float *a = (float*)malloc(N * sizeof(float));
    float **b = (float**)malloc(N * sizeof(float*));

    // Allocate the rows one by one and remember how many succeeded so that a
    // partial allocation can be released correctly.
    int allocatedRows = 0;
    if (a != NULL && b != NULL) {
        while (allocatedRows < N) {
            b[allocatedRows] = (float *)malloc(sizeof(float) * N);
            if (b[allocatedRows] == NULL) {
                break;
            }
            ++allocatedRows;
        }
    }

    int exitCode = 1;
    if (allocatedRows == N) {
        for (int i = 0; i < N; i++) {
            a[i] = (float)(rand() % 41);
        }

        float len = length(a);
        printf("||R||=%f\n", len);

        // Outer product of R with itself, scaled by 1 / ||R||.
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                b[i][j] = a[i] * a[j] / len;
            }
        }

        // Maximum and minimum of the matrix.
        float max = qmax(b);
        float min = qmin(b);
        printf("----------------------------------------------\n");
        printf("最终的最大值=%f\n", max);
        printf("最终的最小值=%f\n", min);

        // CLOCKS_PER_SEC is the standard divisor for clock(); CLK_TCK is an
        // obsolete alias that not every toolchain provides.
        clock_t stop = clock();
        double duration = ((double)(stop - start)) / CLOCKS_PER_SEC;
        printf("cpu程序运行的时间=%lf", duration);
        exitCode = 0;
    } else {
        fprintf(stderr, "malloc failed!\n");
    }

    // Release everything that was allocated (free(NULL) is a no-op).
    for (int i = 0; i < allocatedRows; i++) {
        free(b[i]);
    }
    free(b);
    free(a);
    return exitCode;
}
二、GPU实现
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Fills the row-major 8192 x 8192 matrix CC with CC[row][col] = AA[row] * AA[col] / len.
//
// Launch layout: 2D grid of 2D blocks; the x dimension indexes columns and the
// y dimension indexes rows, one thread per output element. Threads that fall
// outside the 8192 x 8192 matrix do nothing.
//
// AA:  device pointer to the input vector (8192 floats for a full launch).
// CC:  device pointer to the output matrix (8192 * 8192 floats for a full launch).
// len: divisor applied to every product. It is a float so that a fractional
//      value (e.g. a vector norm) is used as-is; an integer parameter here
//      would silently truncate it.
__global__ void calcSum(float* AA, float* CC, float len)
{
    const int Width = 8192;
    const int Row = blockIdx.y * blockDim.y + threadIdx.y;
    const int Col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against launch configurations that overshoot the matrix.
    if (Row < Width && Col < Width) {
        // Consecutive threadIdx.x values write consecutive addresses of CC.
        CC[Row * Width + Col] = AA[Row] * AA[Col] / len;
    }
}
// Atomically raises *address to `value` if `value` is larger.
// Implemented with atomicCAS on the float's bit pattern; returns the value
// observed at *address before any update by this call.
static __device__ float atomicMaxFloat(float* address, float value)
{
    int* slot = reinterpret_cast<int*>(address);
    int observed = *slot;
    while (__int_as_float(observed) < value) {
        const int previous = atomicCAS(slot, observed, __float_as_int(value));
        if (previous == observed) {
            break;              // our value was stored
        }
        observed = previous;    // another thread changed the slot; re-evaluate
    }
    return __int_as_float(observed);
}

// Maximum reduction: on completion CC[0] holds the largest element covered by
// the launch; every other element of CC is left untouched.
//
// Launch layout: 1D grid of 1D blocks, one thread per element, at most 1024
// threads per block (e.g. <<<65536, 1024>>> for the full 8192 x 8192 matrix).
// Uses 4 KB of static shared memory per block. Threads whose global index is
// beyond 8192 * 8192 contribute nothing.
//
// Each block reduces its own elements in shared memory, where __syncthreads()
// is a valid barrier, and then merges its partial result into CC[0] with an
// atomic max. __syncthreads() does not synchronise different blocks, so a
// reduction over global memory that spans blocks would be a data race.
__global__ void maxMin(float* CC)
{
    const unsigned int kBlockCapacity = 1024;
    const size_t total = (size_t)8192 * 8192;
    __shared__ float partial[kBlockCapacity];

    const unsigned int tid = threadIdx.x;
    const size_t idx = (size_t)blockIdx.x * blockDim.x + tid;

    // Element 0 is the accumulator and is only ever accessed atomically; its
    // original value still takes part because atomic max keeps the larger of
    // the stored and the offered value. Out-of-range threads and the thread
    // for element 0 contribute -infinity, the neutral element for max.
    const float negInf = __int_as_float(0xff800000);
    partial[tid] = (idx > 0 && idx < total) ? CC[idx] : negInf;
    __syncthreads();

    // Tree reduction within the block; the second condition keeps it correct
    // for block sizes that are not a power of two.
    for (unsigned int stride = kBlockCapacity / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride && tid + stride < blockDim.x)
            partial[tid] = fmaxf(partial[tid], partial[tid + stride]);
        __syncthreads();
    }

    if (tid == 0)
        atomicMaxFloat(&CC[0], partial[0]);
}
// Atomically lowers *address to `value` if `value` is smaller.
// Implemented with atomicCAS on the float's bit pattern; returns the value
// observed at *address before any update by this call.
static __device__ float atomicMinFloat(float* address, float value)
{
    int* slot = reinterpret_cast<int*>(address);
    int observed = *slot;
    while (__int_as_float(observed) > value) {
        const int previous = atomicCAS(slot, observed, __float_as_int(value));
        if (previous == observed) {
            break;              // our value was stored
        }
        observed = previous;    // another thread changed the slot; re-evaluate
    }
    return __int_as_float(observed);
}

// Minimum reduction: on completion CC[0] holds the smallest element covered by
// the launch; every other element of CC is left untouched.
//
// Launch layout: 1D grid of 1D blocks, one thread per element, at most 1024
// threads per block (e.g. <<<65536, 1024>>> for the full 8192 x 8192 matrix).
// Uses 4 KB of static shared memory per block. Threads whose global index is
// beyond 8192 * 8192 contribute nothing.
//
// Each block reduces its own elements in shared memory, where __syncthreads()
// is a valid barrier, and then merges its partial result into CC[0] with an
// atomic min. __syncthreads() does not synchronise different blocks, so a
// reduction over global memory that spans blocks would be a data race.
__global__ void maxMin1(float* CC)
{
    const unsigned int kBlockCapacity = 1024;
    const size_t total = (size_t)8192 * 8192;
    __shared__ float partial[kBlockCapacity];

    const unsigned int tid = threadIdx.x;
    const size_t idx = (size_t)blockIdx.x * blockDim.x + tid;

    // Element 0 is the accumulator and is only ever accessed atomically; its
    // original value still takes part because atomic min keeps the smaller of
    // the stored and the offered value. Out-of-range threads and the thread
    // for element 0 contribute +infinity, the neutral element for min.
    const float posInf = __int_as_float(0x7f800000);
    partial[tid] = (idx > 0 && idx < total) ? CC[idx] : posInf;
    __syncthreads();

    // Tree reduction within the block; the second condition keeps it correct
    // for block sizes that are not a power of two.
    for (unsigned int stride = kBlockCapacity / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride && tid + stride < blockDim.x)
            partial[tid] = fminf(partial[tid], partial[tid + stride]);
        __syncthreads();
    }

    if (tid == 0)
        atomicMinFloat(&CC[0], partial[0]);
}
// Euclidean norm (L2 length) of a host vector.
//
// a: host pointer to at least `n` floats.
// n: number of elements to include; defaults to 8192, the vector size used by
//    this program, so existing single-argument calls behave as before.
// Returns sqrt(a[0]^2 + ... + a[n-1]^2), or 0 when n <= 0.
float length(float *a, int n = 8192)
{
    float sumOfSquares = 0.0f;
    for (int i = 0; i < n; i++) {
        sumOfSquares += a[i] * a[i];
    }
    // sqrtf keeps the whole computation in single precision.
    return sqrtf(sumOfSquares);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed! (%s)\n", what, cudaGetErrorString(status));
    return false;
}

// Runs the device part of the program on already-allocated buffers: uploads A,
// builds the matrix with calcSum, copies it to C, then reduces it to its
// maximum and minimum and prints both.
// Returns 0 on success, 1 as soon as any CUDA call or kernel fails.
static int runOnDevice(float* A, float* C, float* AA, float* CC, int szA, int szC)
{
    const size_t bytesA = (size_t)szA * sizeof(float);
    const size_t bytesC = (size_t)szC * sizeof(float);

    // Copy the input vector from host memory to the GPU buffer.
    if (!cudaSucceeded(cudaMemcpy(AA, A, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy1")) {
        return 1;
    }

    float len = length(A);
    printf("len值:%f", len);

    // One thread per matrix element, 32 x 32 threads per block.
    dim3 dimBlock(32, 32, 1);
    dim3 dimGrid((szA + 31) / 32, (szA + 31) / 32, 1);
    calcSum<<<dimGrid, dimBlock>>>(AA, CC, len);
    // A kernel launch returns no status; launch errors are reported by
    // cudaGetLastError() and execution errors by the following synchronisation.
    if (!cudaSucceeded(cudaGetLastError(), "calcSum")) {
        return 1;
    }
    if (!cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize after calcSum")) {
        return 1;
    }
    if (!cudaSucceeded(cudaMemcpy(C, CC, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy")) {
        return 1;
    }

    printf("\nArray C的结果:\n");
    for (int i = 0; i < 8; i++) {
        printf("%f\t", C[i]);
        printf("\n");
    }

    // One thread per matrix element, 1024 threads per block.
    dim3 reduceBlock(1024, 1, 1);
    dim3 reduceGrid((szC + 1023) / 1024, 1, 1);

    // Maximum: the kernel leaves its result in CC[0]; only that value is read
    // back so the host copy C keeps the unmodified matrix.
    float maxValue = 0.0f;
    maxMin<<<reduceGrid, reduceBlock>>>(CC);
    if (!cudaSucceeded(cudaGetLastError(), "maxMin")) {
        return 1;
    }
    if (!cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize after maxMin")) {
        return 1;
    }
    if (!cudaSucceeded(cudaMemcpy(&maxValue, CC, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy")) {
        return 1;
    }
    printf("最大值\n%f\n", maxValue);

    // Minimum: the max reduction works in place on CC, so restore the original
    // matrix from C first; otherwise the minimum would be taken over data the
    // previous reduction has already overwritten.
    if (!cudaSucceeded(cudaMemcpy(CC, C, bytesC, cudaMemcpyHostToDevice), "cudaMemcpy1")) {
        return 1;
    }
    float minValue = 0.0f;
    maxMin1<<<reduceGrid, reduceBlock>>>(CC);
    if (!cudaSucceeded(cudaGetLastError(), "maxMin1")) {
        return 1;
    }
    if (!cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize after maxMin1")) {
        return 1;
    }
    if (!cudaSucceeded(cudaMemcpy(&minValue, CC, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy")) {
        return 1;
    }
    printf("最小值\n%f\n", minValue);
    return 0;
}

// GPU implementation.
//
// Fills a vector A of 8192 pseudo-random values in [0, 40], builds the
// 8192 x 8192 matrix A[i] * A[j] / ||A|| on the device, prints the first
// entries, the matrix maximum and minimum, and the elapsed time in seconds.
// Returns 0 on success and 1 when an allocation, copy or kernel fails.
int main()
{
    const int szA = 8192;
    const int szC = 8192 * 8192;
    float* A = NULL;    // host input vector
    float* C = NULL;    // host copy of the result matrix
    float* AA = NULL;   // device input vector
    float* CC = NULL;   // device result matrix
    int exitCode = 1;

    clock_t start = clock();

    A = (float*)malloc((size_t)szA * sizeof(float));
    C = (float*)malloc((size_t)szC * sizeof(float));
    if (A == NULL || C == NULL) {
        fprintf(stderr, "host malloc failed!\n");
    } else {
        for (int i = 0; i < szA; i++) {
            A[i] = (float)(rand() % 41);
        }
        printf("\nArray A的结果:\n");
        for (int i = 0; i < 8; i++) {
            printf("%f\t", A[i]);
            printf("\n");
        }

        // Only touch the device buffers when both allocations succeeded.
        if (cudaSucceeded(cudaMalloc((void**)&AA, (size_t)szA * sizeof(float)), "cudaMalloc1") &&
            cudaSucceeded(cudaMalloc((void**)&CC, (size_t)szC * sizeof(float)), "cudaMalloc3")) {
            exitCode = runOnDevice(A, C, AA, CC, szA, szC);
        }
    }

    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
    cudaFree(AA);
    cudaFree(CC);
    free(A);
    free(C);

    if (exitCode == 0) {
        clock_t stop = clock();
        // clock() counts ticks; divide by CLOCKS_PER_SEC to report seconds.
        double duration = ((double)(stop - start)) / CLOCKS_PER_SEC;
        printf("gpu程序运行的时间=%lf", duration);
    }
    return exitCode;
}
试验结果: