1亿个整数求top 10000

    参考资料:

http://bbs.csdn.net/topics/250038051



【1】遍历数组,求出最大值,与arr[0]元素交换,再在arr[1]到arr[100000000-1]之间求最大值,与arr[1]交换。。类似选择排序。

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <stdlib.h>
using namespace std;

const int N=100000000; // total number of elements: 100 million
const int M=10000;     // how many of the largest values to find: top 10 thousand
const int x=0x40000000-1; // bit mask 0x3FFFFFFF; main() ANDs rand() with it, keeping the low 30 bits
int arr[N]; // global working array holding all N values
// Benchmark 1: bring the M largest values to the front of arr with a
// selection sort that stops after M passes, then print the elapsed time
// and a few samples of the result.
int main()
{
    // Fill the array with pseudo-random values masked by x.
    int i=0;
    for(i=0; i<N;i++)  
        arr[i]=rand() & x;
    
    struct timeval starttime,endtime;
    gettimeofday(&starttime,0);
    
    // Selection sort limited to the first M slots: pass i finds the largest
    // value in arr[i..N) and swaps it into arr[i].
    for(i=0;i<M; i++)
    {
        int j;
        int m=i;   // index of the largest value seen in this pass
        for(j=i+1; j<N; j++)   
        {
            if(arr[j]>arr[m])
                m=j;
        }
        if(i!=m)
        {
            int t=arr[i];
            arr[i]=arr[m];
            arr[m]=t;
        }
        
    }
    
    gettimeofday(&endtime,0);
    // Compute in double: the integer product 1000000*seconds overflows for
    // long runs when tv_sec is only 32 bits wide, giving a negative time.
    double timeuse = 1000000.0*(endtime.tv_sec - starttime.tv_sec)
                   + (endtime.tv_usec - starttime.tv_usec);
    timeuse /=1000;   // microseconds -> milliseconds
    cout<<timeuse<<" ms"<<endl;
    
    // Note: only the first 5 values are printed under the "0-10" label.
    cout<<"0-10"<<endl;
    copy(arr,arr+5, ostream_iterator<int>(cout, " ") );
    cout<<"9990-10000"<<endl;
    copy(arr+9990,arr+10000, ostream_iterator<int>(cout, " ") );
    
    return 0;
    
}

运行结果:
chen@chen-book1:~$ time ./count_0
-634229 ms
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 9990-10000
1073634978 1073634973 1073634956 1073634953 1073634935 1073634926 1073634918 1073634907 1073634906 1073634904 
real	61m3.238s
user	60m57.797s
sys	0m0.580s
chen@chen-book1:~$ 
一个小时。。等不住了吃完饭回来才跑完。

【2】先对前1万个数排序,然后遍历剩下的数字,如果某个数比这一万个数字中的最小值大,就将其插入到合适位置。

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <string.h>
#include <algorithm>
using namespace std;

// Total number of elements.
const int N=100000000;
// How many of the largest values to find.
const int M=10000;
// Bit mask 0x3FFFFFFF; main() ANDs rand() with it, keeping the low 30 bits.
const int x=0x40000000-1;
// Global working array holding all N values.
int arr[N];
// Exchange the values of two ints in place.
inline void swap(int &a, int &b)
{
    const int old_b = b;
    b = a;
    a = old_b;
}
int main()
{
    int i=0;
    for(i=0; i<N;i++)  
        arr[i]=rand() & x;
              
    struct timeval starttime,endtime;
    gettimeofday(&starttime,0);
    //
    sort(arr, arr+M);
    for(i=M; i<N; i++)
    {
        if(arr[0] < arr[i])
            swap(arr[0], arr[i]);
        else
            continue;
            
        // dirty
        // 1优化
        if(arr[0] > arr[M-1] )   //优化
        {
            int t=arr[0];
            memmove(arr, arr+1, sizeof(int)*(M-1) );
            arr[M-1] = t;
        }else
        {
            int j;
            for(j=1; arr[0]>arr[j]; j++);
            if(j==1) continue;
            int t=arr[0];
            memmove(arr, arr+1, sizeof(int)* (j-1) );
            arr[j-1]=t;
        }
        /* 2 去掉优化
        {
            int j;
            for(j=1; j<M && arr[0]>arr[j]; j++);
            if(j==1) continue;
            int t=arr[0];
            memmove(arr, arr+1, sizeof(int)* (j-1) );
            arr[j-1]=t;
            
        }*/ // 
            /* 3 二分查找
            int *p=lower_bound(arr+1, arr+M, arr[0]);
            if(p!=arr)
            {
                int t=arr[0];
                memcpy(arr,arr+1,sizeof(int) * (p-arr-1) );
                *(p-1)=t;
            }*/


        
    }
    //
    gettimeofday(&endtime,0);
    if(endtime.tv_usec<starttime.tv_usec)
    {
        endtime.tv_sec--;
        endtime.tv_usec+=1000000;
    }
    cout<<(endtime.tv_sec-starttime.tv_sec)<<" s,  "<<(endtime.tv_usec-starttime.tv_usec)<<" us"<<endl;
    
    cout<<"0-10"<<endl;
    copy(arr,arr+10, ostream_iterator<int>(cout, " ") );
    cout<<endl;
    cout<<"90-100"<<endl;
    copy(arr+90,arr+100, ostream_iterator<int>(cout, " ") );
    
    return 0;
    
}


运行

chen@chen-book1:~$ time ./count_2
2 s,  106089 us
0-10
1073634904 1073634906 1073634907 1073634918 1073634926 1073634935 1073634953 1073634956 1073634973 1073634978 
90-100
1073635879 1073635888 1073635895 1073635899 1073635902 1073635906 1073635909 1073635918 1073635937 1073635945 
real    0m4.605s
user    0m4.212s
sys    0m0.352s


运算时间居然已经到了2.1s!总时间4.6s,主要花在了生成随机数上。这里有个优化但是没有发挥作用,去掉优化:
chen@chen-book1:~$ time ./count_2
2 s,  40210 us

二分:

chen@chen-book1:~$ time ./count_2
0 s,  578971 us
0-10
1073634904 1073634906 1073634907 1073634918 1073634926 1073634935 1073634953 1073634956 1073634973 1073634978
90-100
1073635879 1073635888 1073635895 1073635899 1073635902 1073635906 1073635909 1073635918 1073635937 1073635945
real    0m3.285s
user    0m2.864s
sys    0m0.388s

优化:2.4s

去掉优化:2.1s

二分查找:579ms

记得当M=100时,二分的效果还比前两者要差,M=10000的时候就已经是前者的1/4了!半秒钟有木有!!

PS:N=4亿时:

优化: 3.5s

去掉优化:3.3s

二分查找:1.85s


【3】堆排序。

建立一个小根堆,然后后面的依次跟堆顶比,如果比堆顶大,那么就跟堆顶换,然后调整堆。

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <string.h>
#include <algorithm>
using namespace std;

// Total number of elements.
const int N=100000000;
// How many of the largest values to find.
const int M=10000;
// Bit mask 0x3FFFFFFF; main() ANDs rand() with it, keeping the low 30 bits.
const int x=0x40000000-1;
// Global working array holding all N values.
int arr[N];

// Swap two ints through a temporary copy of the second one.
inline void swap(int &a, int &b)
{
    int keep(b);
    b = a;
    a = keep;
}
// "Greater than" comparator: used with the heap algorithms it yields a
// min-heap (smallest element at the front).
// The comparison must be strict (a > b, not a >= b): the standard
// algorithms require a strict weak ordering, so comp(a, a) has to be false.
struct cmp
{
   bool operator()(const int &a, const int &b) const {return a>b;}
};
// Benchmark 3: keep arr[0..M) as a min-heap of the largest values seen so
// far; every element of arr[M..N) that beats the heap's minimum replaces it.
// Prints the elapsed time and two samples of the heap (in heap order, not
// sorted).
int main()
{

    int i=0;
    for(i=0; i<N;i++)  
        arr[i]=rand() & x;
              
    struct timeval starttime,endtime;
    gettimeofday(&starttime,0);

    // cmp orders "larger first", so the heap's front arr[0] is its minimum.
    make_heap(arr, arr+M, cmp());
    for(i=M; i<N; i++)
    {
        // Re-heapify only when the minimum is actually replaced; doing it
        // for every element costs O(log M) per element for nothing.
        if(arr[i]>arr[0])
        {
            pop_heap(arr, arr+M, cmp());   // moves the minimum to arr[M-1]
            swap(arr[i], arr[M-1]);        // candidate enters, old minimum leaves
            push_heap(arr, arr+M, cmp());  // sift the candidate into place
        }
    }
    gettimeofday(&endtime,0);
    // Borrow one second when the microsecond difference would be negative.
    if(endtime.tv_usec<starttime.tv_usec)
    {
        endtime.tv_sec--;
        endtime.tv_usec+=1000000;
    }
    cout<<(endtime.tv_sec-starttime.tv_sec)<<" s,  "<<(endtime.tv_usec-starttime.tv_usec)<<" us"<<endl;
    
    cout<<"0-10"<<endl;
    copy(arr,arr+10, ostream_iterator<int>(cout, " ") );
    cout<<endl;
    cout<<"90-100"<<endl;
    copy(arr+90,arr+100, ostream_iterator<int>(cout, " ") );


/*
Small experiment with the same loop: 10 values, heap of the 5 largest.
int a[10]={3,36,12,13,6,78,34,2,5,7};
copy(a,a+10, ostream_iterator<int>(cout, " ") );
cout<<endl;
make_heap(a,a+5, cmp());
copy(a,a+10, ostream_iterator<int>(cout, " ") );
cout<<endl;
for(i=5;i<10;i++)
{
    if(a[i]>a[0])
    {
        pop_heap(a,a+5, cmp());
        swap(a[i],a[4]);
        push_heap(a,a+5, cmp());
    }
}
copy(a,a+10, ostream_iterator<int>(cout, " ") );
cout<<endl;
*/
    return 0;
    
}

运行:
chen@chen-book1:~$ time ./count_2
34 s,  -60310 us


34s!我最爱的堆排序,怎么会这么慢。。。

PS:程序貌似有问题

【4】当然,还有直接对整个数组排序,然后取前1万个。。。排序用快排

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <string.h>
#include <algorithm>
using namespace std;

// Total number of elements.
const int N=100000000;
// How many of the largest values to find.
const int M=10000;
// Bit mask 0x3FFFFFFF; main() ANDs rand() with it, keeping the low 30 bits.
const int x=0x40000000-1;
// Global working array holding all N values.
int arr[N];

// Exchange the contents of a and b.
inline void swap(int &a, int &b)
{
    const int first = a;
    const int second = b;
    a = second;
    b = first;
}
// "Greater than" comparator: sorts ints in descending order.
// The comparison must be strict (a > b, not a >= b): std::sort requires a
// strict weak ordering, so comp(a, a) has to be false.
struct cmp
{
   bool operator()(const int &a, const int &b) const {return a>b;}
};
// Benchmark 4: sort the whole array in descending order, so the M largest
// values end up in arr[0..M). Prints the elapsed time and two samples.
int main()
{

    int i=0;
    for(i=0; i<N;i++)  
        arr[i]=rand() & x;
              
    struct timeval starttime,endtime;
    gettimeofday(&starttime,0);

    // Full descending sort of all N values.
    sort(arr, arr+N, cmp() );

    gettimeofday(&endtime,0);
    // Borrow one second when the microsecond difference would be negative;
    // without this the report can read e.g. "67 s,  -486231 us".
    if(endtime.tv_usec<starttime.tv_usec)
    {
        endtime.tv_sec--;
        endtime.tv_usec+=1000000;
    }
    cout<<(endtime.tv_sec-starttime.tv_sec)<<" s,  "<<(endtime.tv_usec-starttime.tv_usec)<<" us"<<endl;
    
    cout<<"0-10"<<endl;
    copy(arr,arr+10, ostream_iterator<int>(cout, " ") );
    cout<<endl;
    cout<<"90-100"<<endl;
    copy(arr+90,arr+100, ostream_iterator<int>(cout, " ") );

    return 0;
    
}
运行:
chen@chen-book1:~$ time ./count_2
44 s,  747807 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754 
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789 
real	0m47.247s
user	0m46.807s
sys	0m0.340s

先快排再取前10000,用时44s。要是换成堆排序呢?把sort改成sort_heap,运行:

chen@chen-book1:~$ time ./count_2
67 s,  -486231 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789
real    1m9.013s
user    1m8.540s
sys    0m0.340s

用了67s。快排的确是N logN排序算法里最快的。

【5】部分排序。STL里有,基于堆排序的,看看效果如何!

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <string.h>
#include <algorithm>
using namespace std;

// Total number of elements.
const int N=100000000;
// How many of the largest values to find.
const int M=10000;
// Bit mask 0x3FFFFFFF; main() ANDs rand() with it, keeping the low 30 bits.
const int x=0x40000000-1;
// Global working array holding all N values.
int arr[N];

// Exchange the values of two ints in place.
inline void swap(int &a, int &b)
{
    const int held = b;
    b = a;
    a = held;
}
// "Greater than" comparator: orders ints from largest to smallest.
// The comparison must be strict (a > b, not a >= b): std::partial_sort
// requires a strict weak ordering, so comp(a, a) has to be false.
struct cmp
{
   bool operator()(const int &a, const int &b) const {return a>b;}
};
// Benchmark 5: let std::partial_sort place the M largest values, in
// descending order, at the front of arr. Prints the elapsed time and two
// samples of the result.
int main()
{
    for(int n=0; n<N; ++n)
        arr[n]=rand() & x;

    struct timeval starttime,endtime;
    gettimeofday(&starttime,0);

    partial_sort(arr, arr+M, arr+N, cmp() );

    gettimeofday(&endtime,0);
    // Split the elapsed time into whole seconds and microseconds, borrowing
    // one second when the microsecond difference is negative.
    long secs=endtime.tv_sec-starttime.tv_sec;
    long usecs=endtime.tv_usec-starttime.tv_usec;
    if(usecs<0)
    {
        --secs;
        usecs+=1000000;
    }
    cout<<secs<<" s,  "<<usecs<<" us"<<endl;

    cout<<"0-10"<<endl;
    for(int k=0; k<10; ++k)
        cout<<arr[k]<<" ";
    cout<<endl;
    cout<<"90-100"<<endl;
    for(int k=90; k<100; ++k)
        cout<<arr[k]<<" ";

    return 0;
}


运行:

chen@chen-book1:~$ time ./count_2
0 s,  729513 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789
real    0m3.282s
user    0m2.896s
sys    0m0.312s


用了730ms,很不错了!可是为什么我自己用堆排序来模拟,就要那么久呢!!

PS:当N为4亿时,时间为2.65s。

【6】类似计数排序。按比特位,先申请长度为32的数组,第0个元素表示最高比特位为第0比特的元素个数,第31个元素表示最高比特位为第31比特的元素个数。

遍历数组,得到了直方图。然后从后向前算,就可以知道,最高比特位为多少的整数,是肯定在top10000里的,例如,算得最高比特位为20的整数,肯定在top 10000里,

而最高比特位为19的整数,则有一部分是,有一部分不是。于是,再遍历一遍,将最高比特位为20的放在整个数组开头,再将最高比特位为19的紧随其后,再对最高比特位为19的这部分做部分排序即可!

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <string.h>
#include <algorithm>
using namespace std;

// Total number of elements.
const int N=100000000;
// How many of the largest values to find.
const int M=10000;
// Bit mask 0x3FFFFFFF; main() ANDs rand() with it, keeping the low 30 bits.
const int x=0x40000000-1;
// Global working array holding all N values.
int arr[N];

// Swap two ints through a temporary copy of the second one.
inline void swap(int &a, int &b)
{
    int spare(b);
    b = a;
    a = spare;
}
// "Greater than" comparator: orders ints from largest to smallest.
// The comparison must be strict (a > b, not a >= b): std::partial_sort
// requires a strict weak ordering, so comp(a, a) has to be false.
struct cmp
{
   bool operator()(const int &a, const int &b) const {return a>b;}
};
// Return the index of the highest set bit of x; both 0 and 1 give 0.
// The shift is done on the unsigned bit pattern: an arithmetic right shift
// of a negative int never reaches 0 and would loop forever. A negative x
// therefore reports bit 31.
int hbit(int x)
{
    unsigned int rest=static_cast<unsigned int>(x);
    int index=0;
    while(rest>>=1) ++index;
    return index;
}
// Moves every element of arr[begin..N) that is >= threshold in front of
// those that are not (relative order is not preserved) and returns the
// index one past the last moved element.
static int gather_at_least(int begin, int threshold)
{
    int i=begin;
    // Skip the prefix that is already in place; the bound keeps the scan
    // inside arr even when every remaining element qualifies.
    while(i<N && arr[i]>=threshold) ++i;
    for(int j=i+1; j<N; j++)
    {
        if(arr[j]>=threshold)
        {
            swap(arr[i], arr[j]);
            ++i;
        }
    }
    return i;
}

// Benchmark 6: bucket the values by their highest set bit, find the bucket
// in which the M-th largest value falls, gather all values from that bucket
// upwards at the front of arr, and partially sort only that boundary bucket.
// Assumes the values are non-negative and below 2^30, as produced by the
// mask x; larger values would overflow the int threshold.
int main()
{

    int i=0;
    for(i=0; i<N;i++)  
        arr[i]=rand() & x;
    
    int bits[32]={0};   // bits[k] = number of values whose highest set bit is k
    struct timeval starttime,endtime;
    gettimeofday(&starttime,0);

    // Build the histogram.
    for(i=0; i<N; i++)
    {
        bits[ hbit(arr[i]) ]++;
    }
    // Walk the buckets from the highest bit down until they cover more than
    // M values; i is then the boundary bucket.
    int s=0;
    for(i=32-1; i>=0; i--)
    {
        s+=bits[i];
        if(s>M)break;
    }
    // i < 0 means the walk never covered more than M values, i.e. every
    // value is part of the answer and there is nothing to rearrange.
    if(i>=0)
    {
        int s0=s;      // values in bucket i or above
        s-=bits[i];    // values strictly above bucket i: all of them are in the top M
        int threshold= (1<<(i+1) );

        // Pass 1: bring the values above the boundary bucket to the front.
        i=gather_at_least(0, threshold);
        threshold>>=1;
        if(s<M)
        {
            // Pass 2: bring the boundary bucket right behind them, then pick
            // the remaining M-s largest values from that bucket.
            gather_at_least(i, threshold);
            partial_sort(arr+s, arr+M, arr+s0, cmp());
        }
    }

    gettimeofday(&endtime,0);
    // Borrow one second when the microsecond difference would be negative.
    if(endtime.tv_usec<starttime.tv_usec)
    {
        endtime.tv_sec--;
        endtime.tv_usec+=1000000;
    }
    cout<<(endtime.tv_sec-starttime.tv_sec)<<" s,  "<<(endtime.tv_usec-starttime.tv_usec)<<" us"<<endl;
    
    cout<<"0-10"<<endl;
    copy(arr,arr+10, ostream_iterator<int>(cout, " ") );
    cout<<endl;
    cout<<"90-100"<<endl;
    copy(arr+90,arr+100, ostream_iterator<int>(cout, " ") );

    return 0;
    
}

运行:

chen@chen-book1:~$ time ./count_2
13 s,  863604 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789
real    0m16.352s
user    0m15.865s
sys    0m0.420s

13秒。怎么会这么久。。可能是移动的地方太多了。。

PS:N=4亿时,时间为55s。


【7】nth_element(arr,arr+M,arr+N, greater<int>() );

耗时:2.9s。


综上,第一名:对1万个小数组维持排序;

            第二名:partial_sort based on heap


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值