转载请注明出处:http://blog.csdn.net/c602273091/article/details/55004823
介绍
之前的硬件篇: http://blog.csdn.net/c602273091/article/details/54728511 介绍了代码加速的平台的基础知识。这一篇主要是以kmeans算法为例子介绍软件层面的代码加速的知识。在软件层面,包括数据结构、算法、软件架构三个重要部分。
在软件方面,需要明确的一个重要的东西就是找到并发部分(concurrency),找到之后把它映射到并行平台。
并发(concurrency) VS 并行(parallelism)
首先要明白并发和并行的区别。并发针对的是应用程序、并行指的是平台并行。并发说的是一个程序是分成几部分并行执行的。并行平台说的就是可以同时执行多个任务的平台。
解决问题的步骤
- 理解当前状态
- 观察内部表示
- 查找可替代的方法
从中选择
以K均值算法为例。k均值就是找到k个中心,数据点距离它最近的中心是同一个中心就是同一类。k个中心的值会不断更新,直到满足条件。
K均值的算法流程如下:
- 初始化K个中心
- 根据数据点到每个中心的距离远近,把数据点分成K类
- 在每个类中计算中值作为新的中点
是否满足迭代条件,不满足的话回到第二步
k均值的代码实现如下:
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/* File: seq_kmeans.c (sequential version) */
/* Description: Implementation of simple k-means clustering algorithm */
/* This program takes an array of N data objects, each with */
/* M coordinates and performs a k-means clustering given a */
/* user-provided value of the number of clusters (K). The */
/* clustering results are saved in 2 arrays: */
/* 1. a returned array of size [K][N] indicating the center */
/* coordinates of K clusters */
/* 2. membership[N] stores the cluster center ids, each */
/* corresponding to the cluster a data object is assigned */
/* */
/* Author: Wei-keng Liao */
/* ECE Department, Northwestern University */
/* email: wkliao@ece.northwestern.edu */
/* Copyright, 2005, Wei-keng Liao */
/* */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
// Copyright (c) 2005 Wei-keng Liao
// Copyright (c) 2011 Serban Giuroiu
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// -----------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include "kmeans.h"
/*----< euclid_dist_2() >----------------------------------------------------*/
/* square of Euclid distance between two multi-dimensional points */
// 计算两个向量之间的距离
__inline static
float euclid_dist_2(int numdims, /* no. dimensions */
float *coord1, /* [numdims] */
float *coord2) /* [numdims] */
{
int i;
float ans=0.0;
for (i=0; i<numdims; i++)
ans += (coord1[i]-coord2[i]) * (coord1[i]-coord2[i]);
return(ans);
}
// 找到最近的中心
/*----< find_nearest_cluster() >---------------------------------------------*/
__inline static
int find_nearest_cluster(int numClusters, /* no. clusters */
int numCoords, /* no. coordinates */
float *object, /* [numCoords] */
float **clusters) /* [numClusters][numCoords] */
{
int index, i;
float dist, min_dist;
/* find the cluster id that has min distance to object */
index = 0;
min_dist = euclid_dist_2(numCoords, object, clusters[0]);
for (i=1; i<numClusters; i++) {
dist = euclid_dist_2(numCoords, object, clusters[i]);
/* no need square root */
if (dist < min_dist) { /* find the min and its array index */
min_dist = dist;
index = i;
}
}
return(index);
}
/*----< seq_kmeans() >-------------------------------------------------------*/
/* return an array of cluster centers of size [numClusters][numCoords] */
float** seq_kmeans(float **objects, /* in: [numObjs][numCoords] */
int numCoords, /* no. features */
int numObjs, /* no. objects */
int numClusters, /* no. clusters */
float threshold, /* % objects change membership */
int *membership, /* out: [numObjs] */
int *loop_iterations)
{
int i, j, index, loop=0;
int *newClusterSize; /* [numClusters]: no. objects assigned in each
new cluster */
// 计算每个中心有多少个数据
float delta; /* % of objects change their clusters */
// 计算每次迭代过程中类别改变的数据点的平均比例
float **clusters; /* out: [numClusters][numCoords] */
// 输出的K个聚类中心
float **newClusters; /* [numClusters][numCoords] */
/* allocate a 2D space for returning variable clusters[] (coordinates
of cluster centers) */
// 这种开辟二维数组的方法很特别
clusters = (float**) malloc(numClusters * sizeof(float*));
assert(clusters != NULL);
clusters[0] = (float*) malloc(numClusters * numCoords * sizeof(float));
assert(clusters[0] != NULL);
for (i=1; i<numClusters; i++)
clusters[i] = clusters[i-1] + numCoords;
/* pick first numClusters elements of objects[] as initial cluster centers*/
for (i=0; i<numClusters; i++)
for (j=0; j<numCoords; j++)
clusters[i][j] = objects[i][j];
/* initialize membership[] */
for (i=0; i<numObjs; i++) membership[i] = -1;
/* need to initialize newClusterSize and newClusters[0] to all 0 */
newClusterSize = (int*) calloc(numClusters, sizeof(int));
assert(newClusterSize != NULL);
newClusters = (float**) malloc(numClusters * sizeof(float*));
assert(newClusters != NULL);
newClusters[0] = (float*) calloc(numClusters * numCoords, sizeof(float));
assert(newClusters[0] != NULL);
for (i=1; i<numClusters; i++)
newClusters[i] = newClusters[i-1] + numCoords;
do {
delta = 0.0;
// 对数据点找到它们的类别,计算每个类别的每一维的向量之和用于后期计算新的中心
for (i=0; i<numObjs; i++) {
/* find the array index of nestest cluster center */
index = find_nearest_cluster(numClusters, numCoords, objects[i],
clusters);
/* if membership changes, increase delta by 1 */
if (membership[i] != index) delta += 1.0;
/* assign the membership to object i */
membership[i] = index;
/* update new cluster centers : sum of objects located within */
newClusterSize[index]++;
for (j=0; j<numCoords; j++)
newClusters[index][j] += objects[i][j];
}
/* average the sum and replace old cluster centers with newClusters */
for (i=0; i<numClusters; i++) {
for (j=0; j<numCoords; j++) {
if (newClusterSize[i] > 0)
clusters[i][j] = newClusters[i][j] / newClusterSize[i];
newClusters[i][j] = 0.0; /* set back to 0 */
}
newClusterSize[i] = 0; /* set back to 0 */
}
delta /= numObjs;
} while (delta > threshold && loop++ < 500);
// 终止条件就是数据的类别变化没有很多或者迭代次数超过500次
*loop_iterations = loop + 1;
free(newClusters[0]);
free(newClusters);
free(newClusterSize);
return clusters;
}
了解当前状态
平台、资源、目标(结果要正确)、满足需求(速度要快)。
平台:Linux + GCC, 86多核处理器
资源:2~8核,2~8路SIMD。
存储:32K L1缓存、256K L2缓存。
K均值中第二三步比较耗时间,这里面需要注意的变量就是特征的维度D、样本数N、中心个数K都会影响速度。
内在表示
把算法过程中的重要数据结构表示出来。
一开始是N*D个样本,有K*D个中心,要得到的是N个样本的类别N*1。
找到替代办法
替代办法就是把可并发部分对应到平行平台或者并行化某些操作。
映射到平台的方法有:SIMD、OpenMP
在K均值中比如计算距离,比如计算加减法之类的运算。
从替代办法中选择符合的
首先是结果是否正确,然后才是效率问题。
具体的标准有:
- Efficiency: Runs quickly, makes good use of computational resources
- Simplicity: Easy to understand code is easier to develop, debug, verify and modify
- Portability: Should run on widest range of parallel computers
- Scalability: Should be effective on a wide range of processing elements
— Other considerations: Practicality, Hardware, Engineering cost
其实这个步骤看起来很虚,所以需要实践。具体做法就是把上面的k均值改成OpenMP的加速代码。下面的代码也只是1x的加速,这当然还不够,实际上还可以做很多工作。
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/* File: kmeans_clustering.c (OpenMP version) */
/* Description: Implementation of simple k-means clustering algorithm */
/* This program takes an array of N data objects, each with */
/* M coordinates and performs a k-means clustering given a */
/* user-provided value of the number of clusters (K). The */
/* clustering results are saved in 2 arrays: */
/* 1. a returned array of size [K][N] indicating the center */
/* coordinates of K clusters */
/* 2. membership[N] stores the cluster center ids, each */
/* corresponding to the cluster a data object is assigned */
/* */
/* Author: Wei-keng Liao */
/* ECE Department, Northwestern University */
/* email: wkliao@ece.northwestern.edu */
/* Copyright, 2005, Wei-keng Liao */
/* */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include "kmeans.h"
/*----< euclid_dist_2() >----------------------------------------------------*/
/* square of Euclid distance between two multi-dimensional points */
__inline static
float euclid_dist_2(int numdims, /* no. dimensions */
float *coord1, /* [numdims] */
float *coord2) /* [numdims] */
{
int i;
float ans=0.0;
for (i=0; i<numdims; i++)
ans += (coord1[i]-coord2[i]) * (coord1[i]-coord2[i]);
return(ans);
}
/*----< find_nearest_cluster() >---------------------------------------------*/
__inline static
int find_nearest_cluster(int numClusters, /* no. clusters */
int numCoords, /* no. coordinates */
float *object, /* [numCoords] */
float **clusters) /* [numClusters][numCoords] */
{
int index, i;
float dist, min_dist;
/* find the cluster id that has min distance to object */
index = 0;
min_dist = euclid_dist_2(numCoords, object, clusters[0]);
for (i=1; i<numClusters; i++) {
dist = euclid_dist_2(numCoords, object, clusters[i]);
/* no need square root */
if (dist < min_dist) { /* find the min and its array index */
min_dist = dist;
index = i;
}
}
return(index);
}
/*----< kmeans_clustering() >------------------------------------------------*/
/* return an array of cluster centers of size [numClusters][numCoords] */
float** omp_kmeans(int is_perform_atomic, /* in: */
float **objects, /* in: [numObjs][numCoords] */
int numCoords, /* no. coordinates */
int numObjs, /* no. objects */
int numClusters, /* no. clusters */
float threshold, /* % objects change membership */
int *membership) /* out: [numObjs] */
{
int i, j, k, index, loop=0;
int *newClusterSize; /* [numClusters]: no. objects assigned in each
new cluster */
float delta; /* % of objects change their clusters */
float **clusters; /* out: [numClusters][numCoords] */
float **newClusters; /* [numClusters][numCoords] */
double timing;
int nthreads; /* no. threads */
int **local_newClusterSize; /* [nthreads][numClusters] */
float ***local_newClusters; /* [nthreads][numClusters][numCoords] */
nthreads = omp_get_max_threads();
/* allocate a 2D space for returning variable clusters[] (coordinates
of cluster centers) */
clusters = (float**) malloc(numClusters * sizeof(float*));
assert(clusters != NULL);
clusters[0] = (float*) malloc(numClusters * numCoords * sizeof(float));
assert(clusters[0] != NULL);
for (i=1; i<numClusters; i++)
clusters[i] = clusters[i-1] + numCoords;
/* pick first numClusters elements of objects[] as initial cluster centers*/
for (i=0; i<numClusters; i++)
for (j=0; j<numCoords; j++)
clusters[i][j] = objects[i][j];
/* initialize membership[] */
for (i=0; i<numObjs; i++) membership[i] = -1;
/* need to initialize newClusterSize and newClusters[0] to all 0 */
newClusterSize = (int*) calloc(numClusters, sizeof(int));
assert(newClusterSize != NULL);
newClusters = (float**) malloc(numClusters * sizeof(float*));
assert(newClusters != NULL);
newClusters[0] = (float*) calloc(numClusters * numCoords, sizeof(float));
assert(newClusters[0] != NULL);
for (i=1; i<numClusters; i++)
newClusters[i] = newClusters[i-1] + numCoords;
if (!is_perform_atomic) {
/* each thread calculates new centers using a private space,
then thread 0 does an array reduction on them. This approach
should be faster */
local_newClusterSize = (int**) malloc(nthreads * sizeof(int*));
assert(local_newClusterSize != NULL);
local_newClusterSize[0] = (int*) calloc(nthreads*numClusters,
sizeof(int));
assert(local_newClusterSize[0] != NULL);
for (i=1; i<nthreads; i++)
local_newClusterSize[i] = local_newClusterSize[i-1]+numClusters;
/* local_newClusters is a 3D array */
local_newClusters =(float***)malloc(nthreads * sizeof(float**));
assert(local_newClusters != NULL);
local_newClusters[0] =(float**) malloc(nthreads * numClusters *
sizeof(float*));
assert(local_newClusters[0] != NULL);
for (i=1; i<nthreads; i++)
local_newClusters[i] = local_newClusters[i-1] + numClusters;
for (i=0; i<nthreads; i++) {
for (j=0; j<numClusters; j++) {
local_newClusters[i][j] = (float*)calloc(numCoords,
sizeof(float));
assert(local_newClusters[i][j] != NULL);
}
}
}
if (_debug) timing = omp_get_wtime();
do {
delta = 0.0;
if (is_perform_atomic) {
#pragma omp parallel for \
private(i,j,index) \
firstprivate(numObjs,numClusters,numCoords) \
shared(objects,clusters,membership,newClusters,newClusterSize) \
schedule(static) \
reduction(+:delta)
for (i=0; i<numObjs; i++) {
/* find the array index of nestest cluster center */
index = find_nearest_cluster(numClusters, numCoords, objects[i],
clusters);
/* if membership changes, increase delta by 1 */
if (membership[i] != index) delta += 1.0;
/* assign the membership to object i */
membership[i] = index;
/* update new cluster centers : sum of objects located within */
#pragma omp atomic
newClusterSize[index]++;
for (j=0; j<numCoords; j++)
#pragma omp atomic
newClusters[index][j] += objects[i][j];
}
}
else {
#pragma omp parallel \
shared(objects,clusters,membership,local_newClusters,local_newClusterSize)
{
int tid = omp_get_thread_num();
#pragma omp for \
private(i,j,index) \
firstprivate(numObjs,numClusters,numCoords) \
schedule(static) \
reduction(+:delta)
for (i=0; i<numObjs; i++) {
/* find the array index of nestest cluster center */
index = find_nearest_cluster(numClusters, numCoords,
objects[i], clusters);
/* if membership changes, increase delta by 1 */
if (membership[i] != index) delta += 1.0;
/* assign the membership to object i */
membership[i] = index;
/* update new cluster centers : sum of all objects located
within (average will be performed later) */
local_newClusterSize[tid][index]++;
for (j=0; j<numCoords; j++)
local_newClusters[tid][index][j] += objects[i][j];
}
} /* end of #pragma omp parallel */
/* let the main thread perform the array reduction */
for (i=0; i<numClusters; i++) {
for (j=0; j<nthreads; j++) {
newClusterSize[i] += local_newClusterSize[j][i];
local_newClusterSize[j][i] = 0.0;
for (k=0; k<numCoords; k++) {
newClusters[i][k] += local_newClusters[j][i][k];
local_newClusters[j][i][k] = 0.0;
}
}
}
}
/* average the sum and replace old cluster centers with newClusters */
for (i=0; i<numClusters; i++) {
for (j=0; j<numCoords; j++) {
if (newClusterSize[i] > 1)
clusters[i][j] = newClusters[i][j] / newClusterSize[i];
newClusters[i][j] = 0.0; /* set back to 0 */
}
newClusterSize[i] = 0; /* set back to 0 */
}
delta /= numObjs;
} while (delta > threshold && loop++ < 500);
if (_debug) {
timing = omp_get_wtime() - timing;
printf("nloops = %2d (T = %7.4f)",loop,timing);
}
if (!is_perform_atomic) {
free(local_newClusterSize[0]);
free(local_newClusterSize);
for (i=0; i<nthreads; i++)
for (j=0; j<numClusters; j++)
free(local_newClusters[i][j]);
free(local_newClusters[0]);
free(local_newClusters);
}
free(newClusters[0]);
free(newClusters);
free(newClusterSize);
return clusters;
}