为了改善快排并行度的问题,我设计了如下改进算法:
用第一个元素做pivot,其余元素平均分配给可用线程,并行进行局部的快排(都与pivot做比较)。在做完比较之后,使用额外空间将所有数据归并到一起(使用memcpy)。进行下一次迭代。
然后就出现了一系列的问题,其一就是for循环中声明的变量,在一次迭代结束后其所占用的内存会被释放(貌似是个栈),如果传递指向这块内存的指针,会出错。而且for循环基本在内存同一位置放相同类型的变量,调试起来会很尴尬。
// newQsort.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <pthread.h>
#include <cmath>
#include <iostream>
using namespace std;
double *array,*buffer;
pthread_mutex_t outlock=PTHREAD_MUTEX_INITIALIZER;
typedef struct para
{
long as;
long ae;
double pivot;
long *bs;
long *be;
pthread_mutex_t *lock;
}para;
/*
traditional quick sort
*/
void qsort(long start,long end)
{
//cout<<"ite"<<endl;
if(start>end-2)
{
return ;
}
double pivot=array[start];
long i=start-1;
for(long j=start;j<end;++j)
{
if(array[j]<pivot)
{
++i;
double tmp=array[i];
array[i]=array[j];
array[j]=tmp;
}
}
array[i+1]=pivot;
qsort(start,i+1);
qsort(i+2,end);
}
/*
sort the [@start,@end) part of the array(g)
use the @pivot as pivot
return last index of lt pivot item
*/
long nqsort(long start,long end,double pivot)
{
if(start>end-2)
{
//cout<<"e"<<endl;
if(array[start]>pivot)
{
return start-1;
}
else
{
return start;
}
}
//from <introduction to algorthms>
long i=start-1;
double tmp;
for(long j=start;j<end;++j)
{
if(array[j]<pivot)
{
++i;
tmp=array[i];
array[i]=array[j];
array[j]=tmp;
}
}
return i;
}
/*
merge several middle result into one
put lt pivot items after @*start @i+1 in total
put gt pivot items before @*end @ae-@as-@i-1 in total
change @*start @*end to correct position
use buffer(g) as temp space
*/
void mergeInto(long *start,long *end,long i,long as,long ae)
{
//cout<<*start<<" "<<*end<<" "<<i<<" "<<as<<" "<<ae<<endl;
long length=i-as+1;
memcpy(buffer+*start,array+as,length*sizeof(double));
*start+=length;
length=ae-i-1;
memcpy(buffer+*end-length,array+i+1,length*sizeof(double));
*end-=length;
}
void *parallelQsort(void *p)
{
pthread_t mid=pthread_self();
cout<<*((unsigned long long*)&mid)<<endl;
para *pa=(para*)p;
long m=nqsort(pa->as,pa->ae,pa->pivot);
pthread_mutex_lock(pa->lock);
mergeInto(pa->bs,pa->be,m,pa->as,pa->ae);
pthread_mutex_unlock(pa->lock);
return NULL;
}
/*
iterative controller
given @num threads available
given the job of sort [@start,@end) of array(g)
divide array(g)[)into @num pieces and each thread sorts one piece using nqsort()
after all sorting merge each piece into buffer(g) using mergeInto()
memcpy all back to array(g)
into next iteration with front and back half of array(g)[)
*/
void sortIte(long start,long end,long num)
{
if(num==1)//only one thread available
{
qsort(start,end);
}
else
{
if(start>end-1)//not many items to sort
{
return ;
}
//parallel
pthread_mutex_t *lock;
lock=new pthread_mutex_t;
pthread_mutex_init(lock,NULL);
pthread_t *threads=new pthread_t[num];
para *ps=new para[num];
long piece=(end-start-1)/num;//task size of one thread
long as,ae,bs=start,be=end;//array buffer range
double pivot=array[start];
as=start+1;
ae=as+piece;
//each thread
for(long i=0;i<num;++i)
{
if(i+1==num)//last thread must finish all task
{
ae=end;
}
/*
//serial
int m=nqsort(as,ae,pivot);
//cout<<i<<" "<<start<<" "<<end<<endl;
mergeInto(&bs,&be,m,as,ae);
//update task
*/
//parallel
//cout<<as<<" c "<<ae<<endl;
//sleep(1);
ps[i].as=as;
ps[i].ae=ae;
ps[i].bs=&bs;
ps[i].be=&be;
ps[i].lock=lock;
ps[i].pivot=pivot;
pthread_create(&threads[i],NULL,parallelQsort,&ps[i]);
as=ae;
ae+=piece;
}
//parallel
for(long i=0;i<num;++i)
{
pthread_join(threads[i],NULL);
}
//set pivot
buffer[bs]=array[start];
memcpy(array+start,buffer+start,(end-start)*sizeof(double));//copy back
//parallel
delete[] threads;
delete[] ps;
pthread_mutex_destroy(lock);
delete lock;
//next iteration
sortIte(start,bs,num/2);
sortIte(bs+1,end,num/2);
}
}
void showArray(double *array,long length)
{
for(long i=0;i<length;++i)
{
cout<<array[i]<<" ";
}
cout<<endl;
}
long testArray(double *array,long length)
{
for(long i=1;i<length;++i)
{
if(array[i-1]>array[i])
{
return i-1;
}
}
return -1;
}
int _tmain(int argc, _TCHAR* argv[])
{
long size=2000*10000;
cout<<size<<endl;
array=new double[size];
for(long i=0;i<size;++i)
{
array[i]=rand();
}
buffer=new double[size];
//test section
cout<<"start"<<endl;
clock_t st,et;
st=clock();
long start=0,end=size;
sortIte(start,end,8);
et=clock();
cout<<"end using "<<(double)(et-st)/CLOCKS_PER_SEC<<endl;
cout<<testArray(array,size);
cin>>array[0];
//end test
delete[] buffer;
delete[] array;
return 0;
}
测试结果很蛋疼,加速比基本都是1,但是要比原版的稳定,而且快。这个版本改进余地也很大。
1.由于每次迭代的并行度相同,可以考虑只建一次线程,复用。这个会是一个生产者消费者问题,代码略复杂。
2.memcpy可以放到主线程来做,而不是在子线程,这样少一个锁,可能会快一点。
3.可以改变一下线程的分配方式,按任务量分个数。把区间的任务抽象出来,做task,然后改成fork/join的模式。(这个太高端。。。)