为了改善快排并行度的问题,我设计了如下改进算法:
用第一个元素做pivot,其余元素平均分配给可用线程,并行进行局部的快排(都与pivot做比较)。在做完比较之后,使用额外空间将所有数据归并到一起(使用memcpy)。进行下一次迭代。
然后就出现了一系列的问题,其一就是for循环中声明的变量,在一次迭代结束后其所占用的内存会被释放(貌似是个栈),如果传递指向这块内存的指针,会出错。而且for循环基本在内存同一位置放相同类型的变量,调试起来会很尴尬。
// newQsort.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <pthread.h>
#include <cmath>
#include <iostream>
using namespace std;
double *array,*buffer;
pthread_mutex_t outlock=PTHREAD_MUTEX_INITIALIZER;
typedef struct para
{
long as;
long ae;
double pivot;
long *bs;
long *be;
pthread_mutex_t *lock;
}para;
/*
traditional quick sort
*/
void qsort(long start,long end)
{
//cout<<"ite"<<endl;
if(start>end-2)
{
return ;
}
double pivot=array[start];
long i=start-1;
for(long j=start;j<end;++j)
{
if(array[j]<pivot)
{
++i;
double tmp=array[i];
array[i]=array[j];
array[j]=tmp;
}
}
array[i+1]=pivot;
qsort(start,i+1);
qsort(i+2,end);
}
/*
sort the [@start,@end) part of the array(g)
use the @pivot as pivot
return last index of lt pivot item
*/
long nqsort(long start,long end,double pivot)
{
if(start>end-2)
{
//cout<<"e"<<endl;
if(array[start]>pivot)
{
return start-1;
}
else
{
return start;
}
}
//from <introduction to algorthms>
long i=start-1;
double tmp;
for(long j=start;j<end;++j)
{
if(array[j]<pivot)
{
++i;
tmp=array[i];
array[i]=array[j];
array[j]=tmp;
}
}
return i;
}
/*
merge several middle result into one
put lt pivot items after @*start @i+1 in total
put gt pivot items before @*end @ae-@as-@i-1 in total
change @*start @*end to correct position
use buffer(g) as temp space
*/
void mergeInto(long *start,long *end,long i,long as,long ae)
{
//cout<<*start<<" "<<*end<<" "<<i<<" "<<as<<" "<<ae<<endl;
long length=i-as+1;
memcpy(buffer+*start,array+as,length*sizeof(double));
*start+=length;
length=ae-i-1;
memcpy(buffer+*end-length,array+i+1,length*sizeof(double));
*end-=length;
}
void *parallelQsort(void *p)
{
pthread_t mid=pthread_self();
cout<<*((unsigned long long*)&mid)<<endl;
para *pa=(para*)p;
long m=nqsort(pa->as,pa->ae,pa->pivot);
pthread_mutex_lock(pa->lock);
mergeInto(pa->bs,pa->be,m,pa->as,pa->ae);
pthread_mutex_unlock(pa->lock);
return NULL;
}
/*
iterative controller
given @num threads available
given the job of sort [@start,@end) of array(g)
divide array(g)[)into @num pieces and each thread sorts one piece using nqsort()
after all sorting merge each piece into buffer(g) using mergeInto()
memcpy all back to array(g)
into next iteration with front and back half of array(g)[)
*/
void sortIte(long start,long end,long num)
{
if(num==1)//only one thread available
{
qsort(start,end);
}
else
{
if(start>end-1)//not many items to sort
{
return ;
}
//parallel
pthread_mutex_t *lock;
lock=new pthread_mutex_t;
pthread_mutex_init(lock,NULL);
pthread_t *threads=new pthread_t[num];
para *ps=new para[num];
long piece=(end-start-1)/num;//task size of one thread
long as,ae,bs=start,be=end;//array buffer range
double pivot=array[start];
as=start+1;
ae=as+piece;
//each thread
for(long i=0;i<num;++i)
{
if(i+1==num)//last thread must finish all task
{
ae=end;
}
/*
//serial
int m=nqsort(as,ae,pivot);
//cout<<i<<" "<<start<<" "<<end<<endl;
mergeInto(&bs,&be,m,as,ae);
//update task
*/
//parallel
//cout<<as<<" c "<<ae<<endl;
//sleep(1);
ps[i].as=as;
ps[i].ae=ae;
ps[i].bs=&bs;
ps[i].be=&be;
ps[i].lock=lock;
ps[i].pivot=pivot;
pthread_create(&threads[i],NULL,parallelQsort,&ps[i]);
as=ae;
ae+=piece;
}
//parallel
for(long i=0;i<num;++i)
{
pthread_join(threads[i],NULL);
}
//set pivot
buffer[bs]=array[start];
memcpy(array+start,buffer+start,(end-start)*sizeof(double));//copy back
//parallel
delete[] threads;
delete[] ps;
pthread_mutex_destroy(lock);
delete lock;
//next iteration
sortIte(start,bs,num/2);
sortIte(bs+1,end,num/2);
}
}
void showArray(double *array,long length)
{
for(long i=0;i<length;++i)
{
cout<<array[i]<<" ";
}
cout<<endl;
}
long testArray(double *array,long length)
{
for(long i=1;i<length;++i)
{
if(array[i-1]>array[i])
{
return i-1;
}
}
return -1;
}
int _tmain(int argc, _TCHAR* argv[])
{
long size=2000*10000;
cout<<size<<endl;
array=new double[size];
for(long i=0;i<size;++i)
{
array[i]=rand();
}
buffer=new double[size];
//test section
cout<<"start"<<endl;
clock_t st,et;
st=clock();
long start=0,end=size;
sortIte(start,end,8);
et=clock();
cout<<"end using "<<(double)(et-st)/CLOCKS_PER_SEC<<endl;
cout<<testArray(array,size);
cin>>array[0];
//end test
delete[] buffer;
delete[] array;
return 0;
}
测试结果很蛋疼,加速比基本都是1,但是要比原版的稳定,而且快。这个版本改进余地也很大。
1.由于每次迭代的并行度相同,可以考虑只建一次线程,复用。这个会是一个生产者消费者问题,代码略复杂。
2.memcpy可以放到主线程来做,而不是在子线程,这样少一个锁,可能会快一点。
3.可以改变一下线程的分配方式,按任务量分个数。把区间的任务抽象出来,做task,然后改成fork/join的模式。(这个太高端。。。)