代码说明
1. radixSort 是纯粹的基数排序接口。常见的基数排序按十进制位分桶,这里改为按二进制位分桶:interVal 是每一趟排序使用的二进制位数(即每个"数位"的宽度),maxBit 是待排序数值的最大二进制位数。
2. radixSortEx 先把数组切分成若干块,用多线程并发地对每一块做基数排序,再把各个有序块两两归并成最终结果。
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <execution>
#include <iostream>
#include <limits>
#include <random>
#include <thread>
#include <vector>
// Number of significant bits in the benchmark keys.
#define MAX_BIT 32
// Element count of the smallest benchmark run.
#define LEN 1000000
// Current reading of a monotonic clock, in whole milliseconds.
// Only differences between two readings are meaningful: steady_clock never
// jumps backwards (unlike system_clock, which follows wall-clock adjustments),
// so it is the right clock for measuring elapsed time.
#define GetTime (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count())
// "Find last set": fls(x) = floor(log2(x)) + 1, i.e. the 1-based position of
// the highest set bit of x viewed as a 32-bit pattern. fls(0) is 0, and any
// value with bit 31 set (negative ints) yields 32.
static inline int fls(int x) {
    if (x == 0) return 0;

    unsigned int bits = static_cast<unsigned int>(x);
    int position = 32;
    // Binary search for the top bit: whenever the upper `step` bits are all
    // clear, shift them out and lower the answer by `step`.
    for (int step = 16; step >= 1; step >>= 1) {
        const unsigned int upperBits = ~0u << (32 - step);
        if ((bits & upperBits) == 0) {
            bits <<= step;
            position -= step;
        }
    }
    return position;
}
// Sorts `data` ascending with a least-significant-digit radix sort over
// binary digits.
//
// interVal  Width in bits of one radix digit. Values outside [1, 16] are
//           clamped: 0 would make no progress and wider digits only enlarge
//           the bucket table. The sorted result does not depend on this
//           value, only the number of passes does.
// maxBit    Number of low-order bits that can be set in any element; clamped
//           to [1, bits in unsigned int]. Elements must fit in that many bits
//           (rounded up to a whole number of digits) for the result to be
//           fully sorted.
//
// Each pass is a stable counting sort on one digit that scatters the keys
// from one buffer into the other, so memory use is one extra buffer of
// data.size() elements plus the bucket table.
void radixSort(std::vector<unsigned int>& data, unsigned int interVal = 16, unsigned int maxBit = 32)
{
    constexpr unsigned int kWordBits = std::numeric_limits<unsigned int>::digits;
    constexpr unsigned int kMaxDigitBits = 16;

    if (data.size() < 2) return;

    const unsigned int digitBits = std::clamp(interVal, 1u, kMaxDigitBits);
    const unsigned int keyBits = std::clamp(maxBit, 1u, kWordBits);
    // The largest shift used is (passes - 1) * digitBits <= keyBits - 1, so
    // every shift below stays strictly under the word width.
    const unsigned int passes = (keyBits - 1) / digitBits + 1;
    const std::size_t bucketCount = std::size_t{1} << digitBits;
    const unsigned int digitMask = static_cast<unsigned int>(bucketCount - 1);

    std::vector<unsigned int> scratch(data.size());
    std::vector<std::size_t> offsets(bucketCount);

    for (unsigned int pass = 0; pass < passes; ++pass)
    {
        const unsigned int shift = pass * digitBits;

        // Histogram of the current digit.
        std::fill(offsets.begin(), offsets.end(), std::size_t{0});
        for (const unsigned int value : data) ++offsets[(value >> shift) & digitMask];

        // Exclusive prefix sum: offsets[d] becomes the first output slot for digit d.
        std::size_t next = 0;
        for (std::size_t& slot : offsets)
        {
            const std::size_t count = slot;
            slot = next;
            next += count;
        }

        // Forward scatter keeps equal digits in their current order (stable),
        // which is what lets later passes build on earlier ones.
        for (const unsigned int value : data) scratch[offsets[(value >> shift) & digitMask]++] = value;

        data.swap(scratch);
    }
}
void radixSortEx(std::vector<unsigned int>&data, unsigned int interVal = 16, unsigned int maxBit = 32,unsigned int maxPar = 4)
{
maxPar = data.size() < 100000 || maxPar == 0 ? 1 : std::min(unsigned int (1<<(fls(maxPar)-1)), std::thread::hardware_concurrency());
int blockSize = data.size() / maxPar;
std::vector<unsigned int> pars(maxPar, 0);
std::vector<std::vector<unsigned int>>datas(maxPar);
auto start = data.begin();
auto end = start + blockSize;
for (int i = 0; i < pars.size(); i++) pars[i] = i;
for (int i = 0; i < maxPar; i++)
{
if (i == maxPar - 1)
datas[i] = std::vector<unsigned int>(start, data.end());
else
datas[i] = std::vector<unsigned int>(start, end);
start += blockSize;
end += blockSize;
}
std::for_each(std::execution::par, pars.begin(), pars.end(), [&datas,interVal,maxBit](int id)
{
radixSort(datas[id], interVal, maxBit);
});
while (true)
{
if (!(maxPar >>= 1))
{
data = std::move(datas[0]);
break;
}
std::vector<std::vector<unsigned int>>tmp(maxPar);
std::for_each(std::execution::par, pars.begin(), pars.begin()+maxPar, [&datas,&tmp,maxPar](int id)
{
const auto& array1 = datas[id << 1];
const auto& array2 = datas[(id << 1) + 1];
int i = 0, j = 0;
while (i < array1.size() || j < array2.size())
{
if (i == array1.size())
tmp[id].push_back(array2[j++]);
else if (j == array2.size())
tmp[id].push_back(array1[i++]);
else
{
if (array1[i] < array2[j])
tmp[id].push_back(array1[i++]);
else
tmp[id].push_back(array2[j++]);
}
}
});
datas = std::move(tmp);
}
return;
}
// Benchmark driver: for eight input sizes (LEN, 2*LEN, ... 128*LEN) generates
// pseudo-random keys, sorts them with std::sort as the reference, then times
// radixSort with digit widths 4/8/16 and radixSortEx with 2/4/8 blocks,
// printing each duration and whether the output matches the reference.
int main()
{
    // Fixed seed so that every run benchmarks the same data. mt19937 yields
    // full 32-bit values on every platform, unlike combining two rand() calls
    // whose range depends on RAND_MAX.
    std::mt19937 rng(1111);

    // Keep only the low MAX_BIT bits of each key (all bits when MAX_BIT >= 32).
    unsigned int mask = 0;
    for (unsigned int bit = 0; bit < MAX_BIT && bit < 32; ++bit) mask |= 1u << bit;

    for (int j = 0; j < 8; j++)
    {
        const std::size_t count = static_cast<std::size_t>(LEN) << j;

        std::vector<unsigned int> data;
        data.reserve(count);
        for (std::size_t i = 0; i < count; ++i)
            data.push_back(static_cast<unsigned int>(rng()) & mask);
        std::printf("sort %zu numbers\n\n", data.size());

        std::vector<unsigned int> check = data;
        long long t = GetTime;
        std::sort(check.begin(), check.end());
        t = GetTime - t;
        std::printf("std::sort cost:%lldms\n", t);

        std::vector<unsigned int> ans;

        for (int interval : {4, 8, 16})
        {
            ans = data;
            t = GetTime;
            radixSort(ans, interval, MAX_BIT);
            t = GetTime - t;  // stop the clock before the verification compare
            std::printf("radixSort interval=%d isSorted:%d cost:%lldms\n", interval, check == ans, t);
        }

        for (unsigned int par : {2u, 4u, 8u})
        {
            ans = data;
            t = GetTime;
            radixSortEx(ans, 16, MAX_BIT, par);
            t = GetTime - t;
            std::printf("radixSortEx interval=16 par=%u isSorted:%d cost:%lldms\n", par, check == ans, t);
        }

        std::printf("\n\n");
    }
}
测试结果
1. 理论上,数据量越大,基数排序相对快排的优势应当越明显;但实测中基数排序的优势反而逐渐缩小,在 1.28 亿个元素、interval=16 时甚至慢于 std::sort。
原因在于 CPU 的多级缓存机制:基数排序的访存是分散的随机访问,缓存命中率低,大部分时间并不花在算法逻辑本身,而是花在等待内存读写上;快排的访存局部性更好,因此在硬件层面占优。
2. 基于 std::execution 并行算法的并发确实有优化效果,但在整体耗时较短时不明显。测试机为 4 核 8 线程,因此 par=8 时要依赖超线程,效果未必比 par=4 更好。
sort 1000000 numbers
std::sort cost:72ms
radixSort interval=4 isSorted:1 cost:53ms
radixSort interval=8 isSorted:1 cost:25ms
radixSort interval=16 isSorted:1 cost:19ms
radixSortEx interval=16 par=2 isSorted:1 cost:19ms
radixSortEx interval=16 par=4 isSorted:1 cost:26ms
radixSortEx interval=16 par=8 isSorted:1 cost:29ms
sort 2000000 numbers
std::sort cost:149ms
radixSort interval=4 isSorted:1 cost:123ms
radixSort interval=8 isSorted:1 cost:60ms
radixSort interval=16 isSorted:1 cost:45ms
radixSortEx interval=16 par=2 isSorted:1 cost:48ms
radixSortEx interval=16 par=4 isSorted:1 cost:49ms
radixSortEx interval=16 par=8 isSorted:1 cost:59ms
sort 4000000 numbers
std::sort cost:315ms
radixSort interval=4 isSorted:1 cost:318ms
radixSort interval=8 isSorted:1 cost:159ms
radixSort interval=16 isSorted:1 cost:120ms
radixSortEx interval=16 par=2 isSorted:1 cost:113ms
radixSortEx interval=16 par=4 isSorted:1 cost:116ms
radixSortEx interval=16 par=8 isSorted:1 cost:151ms
sort 8000000 numbers
std::sort cost:640ms
radixSort interval=4 isSorted:1 cost:717ms
radixSort interval=8 isSorted:1 cost:364ms
radixSort interval=16 isSorted:1 cost:274ms
radixSortEx interval=16 par=2 isSorted:1 cost:225ms
radixSortEx interval=16 par=4 isSorted:1 cost:236ms
radixSortEx interval=16 par=8 isSorted:1 cost:390ms
sort 16000000 numbers
std::sort cost:1281ms
radixSort interval=4 isSorted:1 cost:1500ms
radixSort interval=8 isSorted:1 cost:809ms
radixSort interval=16 isSorted:1 cost:612ms
radixSortEx interval=16 par=2 isSorted:1 cost:504ms
radixSortEx interval=16 par=4 isSorted:1 cost:505ms
radixSortEx interval=16 par=8 isSorted:1 cost:608ms
sort 32000000 numbers
std::sort cost:2686ms
radixSort interval=4 isSorted:1 cost:3183ms
radixSort interval=8 isSorted:1 cost:1739ms
radixSort interval=16 isSorted:1 cost:1455ms
radixSortEx interval=16 par=2 isSorted:1 cost:1107ms
radixSortEx interval=16 par=4 isSorted:1 cost:969ms
radixSortEx interval=16 par=8 isSorted:1 cost:1206ms
sort 64000000 numbers
std::sort cost:5588ms
radixSort interval=4 isSorted:1 cost:6695ms
radixSort interval=8 isSorted:1 cost:3698ms
radixSort interval=16 isSorted:1 cost:3329ms
radixSortEx interval=16 par=2 isSorted:1 cost:2546ms
radixSortEx interval=16 par=4 isSorted:1 cost:2296ms
radixSortEx interval=16 par=8 isSorted:1 cost:2506ms
sort 128000000 numbers
std::sort cost:11653ms
radixSort interval=4 isSorted:1 cost:14531ms
radixSort interval=8 isSorted:1 cost:10019ms
radixSort interval=16 isSorted:1 cost:19132ms
radixSortEx interval=16 par=2 isSorted:1 cost:11088ms
radixSortEx interval=16 par=4 isSorted:1 cost:8212ms
radixSortEx interval=16 par=8 isSorted:1 cost:7932ms