大数据与算法——累加求和、求最大值、排序加速
大数据与算法作业加速代码实现及数据整理对比
1.加速代码实现
1.1 多线程
1.1.1初始准备
// Thread bookkeeping: one slot per worker thread.
int ThreadID[MAX_THREADS]; // per-thread index, passed to each thread function
float floatResults[MAX_THREADS]; // per-thread partial result
// Zero the partial-result slots before any thread starts.
// NOTE(review): this loop writes `threadresult` but the array declared above
// is `floatResults` — presumably two names for the same storage; confirm.
for (size_t i = 0; i < MAX_THREADS; i++)
{
threadresult[i] = 0.0f;
}
1.1.2创建线程并唤醒
// Worker-thread entry point (passed to CreateThread): computes the partial
// sum of log(sqrt(x / 4)) over this thread's slice of rawFloatData and
// stores it into floatResults[tid]. lpParameter points at the thread's index.
DWORD WINAPI ThreadProc1(LPVOID lpParameter)
{
    // Recover this thread's index (0 .. MAX_THREADS-1) from the argument.
    int tid = *(int*)lpParameter;
    // Each thread owns a contiguous slice of SUBDATANUM elements.
    int first = tid * SUBDATANUM;
    int last = first + SUBDATANUM;
    // Accumulate in double to limit rounding error over a long sum.
    double partial = 0.0;
    for (int idx = first; idx < last; idx++)
        partial += log(sqrt(rawFloatData[idx] / 4.0));
    floatResults[tid] = partial; // narrowing to float, as before
    return 0;
}
HANDLE hThreads[MAX_THREADS];
// Create every worker suspended so they can all be released together below;
// this keeps thread start-up cost out of the timed region.
for (int i = 0; i < MAX_THREADS; i++)
{
ThreadID[i] = i;
floatResults[i] = 0.0f;
hThreads[i] = CreateThread(
NULL,// default security attributes
0,// use default stack size
ThreadProc1,// thread function
&ThreadID[i],// argument to thread function
CREATE_SUSPENDED, // created suspended; passing 0 would start the thread immediately
NULL);
}
// Release all workers at (nearly) the same moment.
for (int i = 0; i < MAX_THREADS; i++)
{
ResumeThread(hThreads[i]);
}
1.1.3等待全部线程结束,并处理数据
float result = 0.0f;
// Sum the first `len` floats of `data` with an OpenMP reduction, accumulating
// in double for accuracy. The total is written to `result`; returns 0.
int sumSpeedUp_mul(const float data[], const int len, float &result)
{
    double acc = 0.0;
    #pragma omp parallel for reduction(+:acc)
    for (int k = 0; k < len; k++)
        acc += data[k];
    result = acc; // narrowing to float, as before
    return 0;
}
// Block until every worker thread has finished its slice.
WaitForMultipleObjects(MAX_THREADS, hThreads, TRUE, INFINITE);
float floatSum = 0.0f;
// Fold the per-thread partial results into the final total.
sumSpeedUp_mul(floatResults, MAX_THREADS, floatSum);
QueryPerformanceCounter(&end);// stop the timer (paired with the earlier start)
// Close handles only after timing so cleanup cost is not measured.
for (int i = 0; i < MAX_THREADS; i++)
{
CloseHandle(hThreads[i]); // Close all thread handles upon completion.
}
注:
1.在创建线程时可通过修改参数,在创建后直接打开,此项目中选择在创建不立即打开,而在后面统一唤醒,便于时间计算;
2.int who = *(int*)lpParameter;此句代码的含义是将void指针强制转换为int指针后再解引用,从而取得本线程的编号;
3.多线程加速本质上就是将数据分为64部分,对每一部分进行操作,最后将数据统一;
4.等待全部线程结束用waitfor函数;
5.关闭线程应放在时间计算之后;
上述代码是针对于累加求和算法,接下来分别阐述另两种算法的区别
求最大值
在原有基础上修改即可,
1.在线程函数threadproc中修改:
// Per-thread maximum: scan this thread's slice, keeping the largest
// transformed value log(sqrt(x / 4)).
// NOTE(review): `threadresult` vs `floatResults` — two names for the
// per-thread result storage; confirm they refer to the same array. Also the
// running max starts from the zero-initialized slot, which is wrong if every
// transformed value is negative (x < 4).
for (int i = startIndex; i < endIndex; i++)
{
if (threadresult[who] <= log(sqrt(rawFloatData[i] / 4.0)))
threadresult[who] = log(sqrt(rawFloatData[i] / 4.0));
}
floatResults[who] = threadresult[who];
2.将sumSpeedUp_mul函数修改为maxSpeedUp_mul函数,并调用
// Find the maximum of the first `len` floats of `data`; the maximum is
// written to `result`. Returns 0 on success, -1 when len <= 0 (result is
// left unchanged).
//
// Fix: the original seeded the running maximum with 0.0, which returns a
// wrong answer when every element is negative — and the per-thread values
// fed in here are log(sqrt(x/4)), negative for any x < 4. Seed from data[0].
int maxSpeedUp_mul(const float data[], const int len, float &result)
{
    if (len <= 0)
        return -1; // nothing to scan
    double localresult = data[0];
    for (int i = 1; i < len; i++)
    {
        if (localresult <= data[i])
            localresult = data[i];
    }
    result = localresult;
    return 0;
}
排序
1.修改线程函数
在循环中调用快速排序(或其他排序)函数
MyquickSort(startIndex, endIndex - 1, rawFloatData2);
2.由于排序函数会修改数组的顺序,如果想要保证顺序不变,则需要将原有数组copy出来,且在最终处理数据时需要部分有序的数组归并为一个完整数组,代码如下
//将多线程排序结果合并
// Merge the 64 sorted runs produced by the worker threads (each run holds
// SUBDATANUM elements of data[]) into one fully sorted array, bottom-up:
// at level j there are 2^j pair-merges, each combining two sorted halves of
// a segment of SUBDATANUM * 2^(6-j) elements, so after j = 5,4,...,0 a
// single sorted sequence of 64*SUBDATANUM remains.
// Uses the file-scope scratch buffer `reg` (same length as data).
// NOTE(review): parameter `len` is never used — the bounds come from
// SUBDATANUM and the hard-coded 64 (= 2^6) split; confirm len == 64*SUBDATANUM.
void makeup(float data[], int len)
{
for (int j = 5; j >= 0; j--)
{
int m = pow(2, j); // number of pair-merges at this level
#pragma omp parallel for
for (int i = 0; i < m; i++)
{
// [start,end] is the i-th merged segment at this level; it splits into
// two already-sorted halves [start1,end1] and [start2,end2].
int start = i * SUBDATANUM * pow(2, 6 - j); int end = (i + 1) * SUBDATANUM * pow(2, 6 - j) - 1;
int start1 = i * SUBDATANUM * pow(2, 6 - j); int end1 = (start + end + 1) / 2 - 1;
int start2 = end1 + 1; int end2 = end;
int k = start;
// Standard two-way merge: repeatedly move the smaller head element of the
// two halves into reg.
while (start1 <= end1 && start2 <= end2)
reg[k++] = data[start1] < data[start2] ? data[start1++] : data[start2++];
// One half is exhausted — append whatever remains of the other half.
while (start1 <= end1)
reg[k++] = data[start1++];
while (start2 <= end2)
reg[k++] = data[start2++];
// Copy the merged segment back from reg into data.
// NOTE(review): this inner `omp parallel for` is nested inside an already
// parallel loop and reuses the outer `k`; with nested parallelism off
// (the default) it runs serially — confirm intent.
#pragma omp parallel for
for (k = start; k <= end; k++)
data[k] = reg[k];
}
}
}
1.2 OpenMP加速
注:
1.由于openmp为并行加速,加速排序后会出现乱序,故对排序无法实现加速;
2.openmp是针对于循环的加速,可在满足要求的循环前加上即可,代码主体仍为原不加速代码;
3.对于累加算法,可直接对循环中的运算符“+”实现并行加速,详情可见下代码
举例如下
求和
// OpenMP-accelerated sum of log(sqrt(data[i] / 4)) over the first `len`
// elements, accumulated in double. The total is written to `result`;
// returns 0. The log/sqrt chain deliberately adds per-element work.
int sumSpeedUp_omp(const float data[], const int len, float& result)
{
    double total = 0.0;
    #pragma omp parallel for reduction(+:total)
    for (int idx = 0; idx < len; idx++)
        total += log(sqrt(data[idx] / 4.0));
    result = total; // narrowing to float, as before
    return 0;
}
求最值
// OpenMP-accelerated maximum of log(sqrt(data[i] / 4)) over the first `len`
// elements. The maximum is written to `result`; returns 0 on success, -1
// when len <= 0 (result left unchanged).
//
// Fixes vs. the original:
// 1. The parallel loop read-modified-wrote a shared `localresult` with no
//    reduction clause — a data race that can silently drop updates. Use
//    reduction(max:...) (OpenMP 3.1+) so each thread keeps a private max.
// 2. The running maximum started at 0.0, which is wrong when every
//    transformed value is negative (log(sqrt(x/4)) < 0 for x < 4).
//    Seed it from the first element instead.
int maxSpeedUp_omp(const float data[], const int len, float& result)
{
    if (len <= 0)
        return -1; // nothing to scan
    double localresult = log(sqrt(data[0] / 4.0));
    #pragma omp parallel for reduction(max:localresult)
    for (int i = 1; i < len; i++)
    {
        double v = log(sqrt(data[i] / 4.0)); // compute the transform once per element
        if (localresult < v)
            localresult = v;
    }
    result = localresult;
    return 0;
}
1.3 SSE指令集加速
注:
1.SSE指令集本质也为并行处理多个数据,不同的数据并行加速不同;
2.与上述原理相同,故SSE指令集也难以对排序算法加速(可能是我理解不够);
3.关键指令有load、log、sqrt、div、add、store、cmple,可自行学习;
4.指令后缀pd代表double类型操作,ps代表float操作;
求和
// SSE2 sum of log(sqrt(data[i] / 4)) over `len` doubles, two lanes at a
// time. The total is written to `result`; returns 0.
// NOTE(review): _mm_log_pd is not a plain SSE2 intrinsic — it comes from
// Intel SVML (ICC / recent MSVC); confirm the toolchain provides it.
// NOTE(review): `middleresult` is a file-scope aligned double[2] defined
// elsewhere, and when `len` is odd the last element is silently dropped
// (nb_iters = len / 2).
int sumSpeedUp_sse(double data[], const int len, float &result)
{
double localresult = 0.0f;
// 16-byte-aligned constants: ci = packed divisor {4,4}, di = sum seed {0,0}.
__declspec(align(16)) double ci[2]; ci[0] = 4.0f; ci[1] = 4.0f;
__declspec(align(16)) double di[2]; di[0] = 0.0f; di[1] = 0.0f;
__m128d c = _mm_load_pd(ci); // packed divisor
__m128d d = _mm_load_pd(di); // packed running sum
int nb_iters = len / 2; // two doubles per 128-bit vector
__m128d* ptr = (__m128d*)data;
for (int i = 0; i < nb_iters; i++, ++ptr, data += 2)
{
// f = log(sqrt(x / 4)) for both lanes at once.
__m128d f = _mm_log_pd(_mm_sqrt_pd(_mm_div_pd(*ptr, c)));
d = _mm_add_pd(f, d);
}
// Fold the two lanes of the packed sum into a scalar.
_mm_store_pd(middleresult, d);
localresult = middleresult[0] + middleresult[1];
result = localresult;
return 0;
}
求最值
// SSE2 maximum of log(sqrt(data[i] / 4)) over `len` doubles, two lanes at a
// time. The maximum is written to `result`; returns 0.
// NOTE(review): _mm_log_pd is an Intel SVML intrinsic, not plain SSE2.
// NOTE(review): the running maxima start at {0,0}, so if every transformed
// value is negative (x < 4 gives log(sqrt(x/4)) < 0) the result is wrongly 0.
// NOTE(review): odd `len` drops the last element (nb_iters = len / 2), and
// _mm_max_pd would replace the whole store/compare dance below.
int maxSpeedUp_sse(double data[], const int len, float &result)
{
double localresult = 0.0f;
__declspec(align(16)) double ci[2]; ci[0] = 4.0f; ci[1] = 4.0f; // packed divisor {4,4}
__declspec(align(16)) double di[2]; di[0] = 0.0f; di[1] = 0.0f; // running-max seed
__declspec(align(16)) double ei[2]; ei[0] = 0.0f; ei[1] = 0.0f; // cmple mask spill
__declspec(align(16)) double fi[2]; fi[0] = 0.0f; fi[1] = 0.0f; // per-lane winners
__m128d c = _mm_load_pd(ci); // packed divisor
__m128d d = _mm_load_pd(di); // packed running maxima
__m128d e = _mm_load_pd(ei);
int nb_iters = len / 2; // two doubles per 128-bit vector
__m128d* ptr = (__m128d*)data;
for (int i = 0; i < nb_iters; i++, ++ptr, data += 2)
{
// f = log(sqrt(x / 4)) for both lanes.
__m128d f = _mm_log_pd(_mm_sqrt_pd(_mm_div_pd(*ptr, c)));
// Per-lane mask of d <= f (all-ones where f wins, 0.0 where d wins).
_mm_store_pd(ei, _mm_cmple_pd(d, f));
// High lane: keep d or take f — this writes BOTH lanes of fi ...
if (ei[1] == 0)
_mm_store_pd(fi, d);
else
_mm_store_pd(fi, f);
// ... then the low lane (fi[0]) is overwritten with its own winner.
if (ei[0] == 0)
_mm_storel_pd(fi, d);
else
_mm_storel_pd(fi, f);
d = _mm_load_pd(fi);
}
// Reduce the two surviving lane maxima to a scalar.
if(fi[0] < fi[1])
localresult = fi[1];
else
localresult = fi[0];
result = localresult;
return 0;
}
1.4 Cuda加速
初始化
#define Row 8000
#define Col 8000 // the 8000x8000 grid holds half of the total data set
// CUDA host-side setup: host row-pointer tables A/C, the host result buffer
// dataC, and the matching device allocations d_A/d_C/d_dataA/d_dataC.
float** A = (float**)malloc(sizeof(float*) * Row);
float** C = (float**)malloc(sizeof(float*) * Row);
float* dataC = (float*)malloc(sizeof(float) * Row * Col);
//float* dataA = (float*)malloc(sizeof(float) * Row * Col);
float** d_A;
float** d_C;
float* d_dataA;
float* d_dataC;
// Device allocations: the two row-pointer tables plus the flat data blocks.
cudaMalloc((void**)&d_A, sizeof(float**) * Row);
cudaMalloc((void**)&d_C, sizeof(float**) * Row);
cudaMalloc((void**)&d_dataA, sizeof(float) * Row * Col);
cudaMalloc((void**)&d_dataC, sizeof(float) * Row * Col);
// Fill the HOST tables A/C with DEVICE addresses (row i starts at
// d_data* + Col*i). Copying these tables into d_A/d_C later lets the kernel
// index the flat device buffers as 2-D arrays.
#pragma omp parallel for
for (int i = 0; i < Row; i++) {
A[i] = d_dataA + Col * i;
C[i] = d_dataC + Col * i;
}
累加操作
// Host driver: run the log(sqrt(x/4)) kernel over the whole Row x Col grid
// on the GPU, copy the transformed values back, and reduce them to a single
// sum on the CPU with an OpenMP reduction. The sum is written to `Sum`.
// NOTE(review): the input is uploaded from the file-scope `rawFloatData`,
// not from a parameter — the A/d_dataA parameters only carry the row tables.
void sumCuda(float** A, float** C, float* dataC, float** d_A, float** d_C, float* d_dataA, float* d_dataC, float& Sum)
{
// Upload the row-pointer tables and the raw input block to the device.
cudaMemcpy(d_A, A, sizeof(float*) * Row, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, sizeof(float*) * Row, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataA, rawFloatData, sizeof(float) * Row * Col, cudaMemcpyHostToDevice);
// 32x32 threads per block; enough blocks to cover the Row x Col grid.
dim3 threadPerBlock(32, 32);
dim3 blockNumber((Col + threadPerBlock.x - 1) / threadPerBlock.x, (Row + threadPerBlock.y - 1) / threadPerBlock.y);
printf("Block(%d,%d) Grid(%d,%d).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);
// Launch the element-wise transform kernel.
cudacalculate << <blockNumber, threadPerBlock >> > (d_C, d_A);
// Copy results back (cudaMemcpy implicitly synchronizes with the kernel).
cudaMemcpy(dataC, d_dataC, sizeof(float) * Row * Col, cudaMemcpyDeviceToHost);
// Final CPU-side reduction, accumulated in double for accuracy.
double sum = 0.0f;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < Row * Col; i++)
{
sum += dataC[i];
}
float fsum = (float)sum;
Sum = fsum;
}
求最值操作
// Host driver: run the log(sqrt(x/4)) kernel over the whole Row x Col grid
// on the GPU, copy the transformed values back, and reduce them to their
// maximum on the CPU. The maximum is written to `Max`.
// NOTE(review): like sumCuda, the input is uploaded from the file-scope
// `rawFloatData`, not from the parameters.
//
// Fixes vs. the original:
// 1. The parallel loop updated a shared running maximum with no reduction
//    clause — a data race. Use reduction(max:...) so each thread keeps a
//    private maximum that OpenMP combines at the end.
// 2. The maximum was seeded with 0.0f, which is wrong when every
//    transformed value is negative (log(sqrt(x/4)) < 0 for x < 4).
//    Seed from dataC[0] instead.
void maxCuda(float** A, float** C, float* dataC, float** d_A, float** d_C, float* d_dataA, float* d_dataC, float& Max)
{
// Upload the row-pointer tables and the raw input block to the device.
cudaMemcpy(d_A, A, sizeof(float*) * Row, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, sizeof(float*) * Row, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataA, rawFloatData, sizeof(float) * Row * Col, cudaMemcpyHostToDevice);
// 32x32 threads per block; enough blocks to cover the Row x Col grid.
dim3 threadPerBlock(32, 32);
dim3 blockNumber((Col + threadPerBlock.x - 1) / threadPerBlock.x, (Row + threadPerBlock.y - 1) / threadPerBlock.y);
printf("Block(%d,%d) Grid(%d,%d).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);
// Launch the element-wise transform kernel.
cudacalculate << <blockNumber, threadPerBlock >> > (d_C, d_A);
// Copy results back (cudaMemcpy implicitly synchronizes with the kernel).
cudaMemcpy(dataC, d_dataC, sizeof(float) * Row * Col, cudaMemcpyDeviceToHost);
// Race-free CPU-side max reduction, seeded from the first element.
float best = dataC[0];
#pragma omp parallel for reduction(max:best)
for (int i = 1; i < Row * Col; i++)
if (best < dataC[i])
best = dataC[i];
Max = best;
}
其中的cudacalculate函数如下
// CUDA kernel: element-wise C[r][c] = log(sqrt(A[r][c] / 4)) over the
// Row x Col grid; threads that fall outside the grid do nothing.
__global__ void cudacalculate(float** C, float** A)
{
    // Global 2-D coordinates of this thread within the launch grid.
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    if (col >= Col || row >= Row)
        return; // guard for partially filled edge blocks
    C[row][col] = log(sqrt(A[row][col] / 4));
}
2.数据交互整理
本作业的要求是通过两台计算机实现64×2000000的累加、最值、排序算法加速,故每台计算机为64×1000000数据,在数据初始化时修改即可
2.1 通信逻辑
1.首先两台计算机分别进行数据初始化(数组);
2.数据初始化完成后,进行socket连接初始化;
3.三次握手实现后,由server端向client端发送“您是辅机”信息,保证连接完成,收到后主机打印出“服务器连接到客户机”信息;
4.连接完成后,由主机读取测量数据的模式(存储在mode中),并将模式发送到辅机,在辅机接收到后开始加速。即从此开始时间的计算
5.两台计算机根据测量模式,对数据实现算法操作,操作完成后由辅机将数据传回,主机将传回数据整理后,最终输出。注:在数据整理完成后就应该结束时间计算,防止误差
2.2 部分通信代码补充
socket连接初始化可见上一篇博客,这里从通信逻辑的第四条开始
2.2.1 测量数据模式
server 端
// Ask the operator which measurement to run: 1 = sum, 2 = max, 3 = sort.
cout << "请选择测量数据(求和1,最大值2,排序3)" << endl;
cin >> mode;
服务器与客户端传送信息
// Send the chosen mode to the client as a decimal string.
ZeroMemory(buf, BUF_SIZE);
_itoa_s(mode, buf, 10);
retVal = send(sClient, buf, BUF_SIZE, 0);
client端
// Receive the measurement mode chosen on the server.
float result = 0.0f;
ZeroMemory(buf, BUF_SIZE);
retVal = recv(sHost, buf, sizeof(buf), 0);
mode = atoi(buf); // atoi converts the received string to an integer
// NOTE(review): recv's return value is unchecked; a dropped connection
// leaves buf empty and mode = 0.
2.2.2 结果传输
累加或最值
server 端
接收结果
// Receive a scalar result (sum or max) from the client and print it.
// `event` selects the label: 1 = sum, 2 = max. Uses the file-scope
// `buf`, `BUF_SIZE` and `retVal`.
// NOTE(review): the name shadows the Berkeley-sockets recvfrom(); consider
// renaming. For event == 2 the raw string `buf` is printed rather than the
// parsed value, and `a` truncates the float to int — confirm both intended.
void recvfrom(SOCKET sClient, float& result, int& a, int event)
{
ZeroMemory(buf, BUF_SIZE); // clear the receive buffer
retVal = recv(sClient, buf, BUF_SIZE, 0); // read the client's reply into buf
//printf("received from client: %s\n", buf);
sscanf_s(buf, " %f ", &result); // parse the textual float result
a = int(result);
if (event == 1)
cout << "辅机Cuda加速求和结果 " << a << endl;
else if (event == 2)
cout << "辅机Cuda加速求最大值结果 " << buf << endl;
}
client端
//发送已有数据
// Format the float `result` as text and send it to the server.
// NOTE(review): the name shadows the Berkeley-sockets sendto(); `retVal` is
// taken by value, so the assignment below is invisible to the caller.
void sendto(float result, int retVal, SOCKET sHost)
{
ZeroMemory(buf, BUF_SIZE);
sprintf_s(buf, "%f", result);
retVal = send(sHost, buf, strlen(buf), 0);
//printf("send ok\n");
}
排序
注:
1.排序算法结果由于需要将数组每一个数据都要传输,故需要大量循环;
2.根据电脑选择不同缓冲区(通过自行实验);
3.由于每次数据都需要整型和字符型的转换,故传输时间较长,占据绝大部分时间
4.传输大量数据时,由于需要保证数据传输不粘连、不缺失,需要在传输成功首先回报信息后再进行下一次数据传输
5.在排序结果传输成功后,应该再次通过归并(与之前的归并不同,需要重新写),将数据整理
server端
//接收排序结果
// Receive the client's sorted array in BUF-value chunks. Each chunk packs
// BUF values as fixed 4-byte text fields inside `buf`; after every chunk an
// "ok" acknowledgement is sent back so the peer does not flood the socket.
// Decoded values are stored into the file-scope rawFloatData2[].
//
// Fix: the original decoded each 4-byte field through `char temp[4]` and
// then constructed a std::string from it — temp was never NUL-terminated,
// which is undefined behavior (reads past the array). temp is now one byte
// larger and always terminated, and the field is parsed directly with atof.
// NOTE(review): recv/send return values are still unchecked, and a short
// recv() would decode stale bytes — confirm the peer always sends full
// BUF_SIZE chunks.
void recvsort(SOCKET sClient)
{
int startindex, endindex;
int k = 0;
for (int i = 0; i < (64*1000000)/BUF; i++)
{
retVal = recv(sClient, buf, BUF_SIZE, 0);
startindex = i * BUF;
endindex = startindex + BUF;
k = 0;
char temp[5];
temp[4] = '\0'; // keep the 4-char field NUL-terminated at all times
for (int j = startindex; j < endindex; j++)
{
for (int n = 0; n < 4; n++)
{
temp[n] = buf[n + 4*k];
}
rawFloatData2[j] = atof(temp); // parse the fixed-width text field
k++;
}
// Acknowledge this chunk before the client sends the next one.
ZeroMemory(buf, BUF_SIZE);
const char buf1[] = "ok";
strcpy_s(buf, buf1);
retVal = send(sClient, buf, BUF_SIZE, 0);
}
}
client端
//发送排序结果
// Send the local sorted array to the server in BUF-value chunks, waiting
// for the server's acknowledgement after every chunk.
// NOTE(review): each value is written at a fixed 4-byte offset (&buf[k*4]),
// but to_string(float) yields strings like "123456.789000" (13+ chars), so
// this strcpy overruns the 4-byte slot, clobbers later slots, and can write
// past the end of buf — the receiver's fixed 4-byte fields cannot
// round-trip this data. The wire format needs rework on BOTH peers.
// NOTE(review): parameters `retVal` (by value) and `sortresult` are
// effectively unused; uses file-scope buf/BUF/BUF_SIZE/rawFloatData.
void sendsort(SOCKET sHost, int retVal, int sortresult)
{
//ZeroMemory(buf, BUF_SIZE);
int startindex, endindex, k = 0;
for (int i = 0; i < (64*1000000)/BUF; i++)
{
ZeroMemory(buf, BUF_SIZE);
startindex = i * BUF;
endindex = startindex + BUF;
k = 0;
for (int j = startindex; j < endindex; j++)
{
string str = to_string(rawFloatData[j]);
strcpy(&buf[k * 4], str.c_str());
k++;
}
retVal = send(sHost, buf, BUF_SIZE, 0);
// Wait for the server's "ok" before sending the next chunk.
retVal = recv(sHost, buf, BUF_SIZE, 0);
}
}
双机排序结果归并
// Merge the two sorted halves of data[0..DATANUM-1] (the local half and the
// half received from the other machine) into one sorted sequence, using the
// file-scope scratch buffer `reg`.
// NOTE(review): parameters `len` and `sign` are never used — the bounds come
// from the DATANUM constant; confirm len == DATANUM.
void makeup(float data[], int len, int sign)
{
int start = 0;
int end = DATANUM - 1;
// [start1,end1] and [start2,end2] delimit the two sorted halves.
int start1 = start; int end1 = (start + end + 1) / 2 - 1;
int start2 = end1 + 1; int end2 = end;
int k = start;
// Standard two-way merge: repeatedly move the smaller head element into reg.
while (start1 <= end1 && start2 <= end2)
reg[k++] = data[start1] < data[start2] ? data[start1++] : data[start2++];
// One half is exhausted — append the remainder of the other half.
while (start1 <= end1)
reg[k++] = data[start1++];
while (start2 <= end2)
reg[k++] = data[start2++];
// Copy the merged result back into data.
#pragma omp parallel for
for (k = start; k <= end; k++)
data[k] = reg[k];
}
3.加速结果展示
累加加速比:29.14
最值加速比:29.30
排序加速比(这里贴出的是单机加速):23.3