大数据量的处理一亿个数排序

最新推荐文章于 2024-04-08 17:47:25 发布

lazyboy_yan

最新推荐文章于 2024-04-08 17:47:25 发布

阅读量1.3k

点赞数

分类专栏：算法与数据结构文章标签： float ios null string delete 算法

算法与数据结构专栏收录该内容

1 篇文章 0 订阅

订阅专栏

有1亿个浮点数，请找出其中最小的10000个。提示：假设每个浮点数占4个字节，1亿个浮点数就要占用相当大的空间，因此不能一次将全部读入内存进行排序。
问题分析：
1) 1亿个浮点数，其数据大小为 400 M。如此规模的排序，首先想到分批处理。每次读取 1 000 000 个数据并进行快速排序。需要的内存空间为 1 000 000 * 4 = 4M。需要100 次这样的排序。

2）完全没有规律的数据，考虑使用快速排序。快速排序的平均复杂度是 O(N log N)。我们可以直接使用 stl 提供的全局函数 sort() ，它使用了快速排序算法（实际是三平均分区法　median-of-three　）。
3) 最后只要最小的 10000 个。则每个批次只需要保留排序结果的前 10000 个数据。这段数据已经是分段有序的。数据量为 10000 * 100。
解法：
1) 数据结构定义：
定义数据规模：

[cpp] view plain copy print ?

// Problem-size constants shared by the data generator and the sorter.
enum
{
    batchCapacity = 1000000,  // number of floats read and sorted per batch
    batchCount = 100,         // number of batches (batchCapacity * batchCount floats in total)
    resultCount = 10000       // number of smallest values kept per batch and printed at the end
};

２）生成数据样本：

[html] view plain copy print ?

/**
 * Generates the sample data set: batchCapacity * batchCount random floats
 * written as raw binary to `filName`, one batch at a time.
 *
 * @param filName path of the file to create or overwrite.
 * @throws const char*             when the batch buffer cannot be allocated.
 * @throws std::ios_base::failure  when the file cannot be opened or written
 *                                 (every stream exception bit is enabled, so
 *                                 no separate "is it open" check is needed).
 */
void dataPrepare( const char* filName )
{
    float* pbuf = static_cast<float*>( malloc( batchCapacity * sizeof( float ) ) );
    if ( pbuf == NULL )
    {
        throw( "failed to malloc" );
    }

    try
    {
        std::ofstream fout;
        fout.exceptions( std::ios::badbit | std::ios::failbit | std::ios::eofbit );
        fout.open( filName, std::ios::binary );

        for ( size_t batch = 0; batch < batchCount; batch++ )
        {
            // Fill one batch in memory, then flush it to disk in a single write.
            for ( size_t index = 0; index < batchCapacity; index++ )
            {
                pbuf[index] = RandomFloat( 0, 65537 );
            }
            fout.write( reinterpret_cast<const char*>( pbuf ), batchCapacity * sizeof( float ) );
        }
        fout.close();
    }
    catch ( ... )
    {
        // The buffer came from malloc, so it must be released with free()
        // (not delete), including when a stream operation throws.
        free( pbuf );
        throw;
    }

    free( pbuf );
}

以上，用 RandomFloat() 生成随机数。其定义如下：

[cpp] view plain copy print ?

/********************************************
* Rand::rand - pseudo-random numbers from a linear congruential generator.
* The sequence eventually repeats; there is room for improvement.
*
*********************************************/
#include <cstdlib>
#include <ctime>
class Rand
{
public:
    // Generator parameters: r' = ( r * kMultiplier + kIncrement ) % kModulus.
    // (Original note on choosing them: keywords "Euclidean algorithm",
    // "quadratic congruence".)
    static const long long kMultiplier = 1010557;
    static const long long kIncrement = 79390691;
    static const long long kModulus = 100663363;

    // Current generator state; assign to it to reseed.
    static long long r;

    // Advances the generator and returns the new state, a value in
    // [0, kModulus) as long as the state is non-negative.
    static int rand()
    {
        r = ( r * kMultiplier + kIncrement ) % kModulus;
        return static_cast<int>( r );
    }
};
long long Rand::r = 43215;

/**
 * Returns a pseudo-random float in [low, high].
 *
 * The raw value is scaled by the generator's own modulus. Scaling by
 * RAND_MAX would be wrong because Rand::rand() does not produce values in
 * [0, RAND_MAX]. The arithmetic is done in double so the large integer is
 * not rounded before the division.
 *
 * @param low  lower bound of the range.
 * @param high upper bound of the range.
 */
float RandomFloat( float low, float high )
{
    const double fraction =
        static_cast<double>( Rand::rand() ) / static_cast<double>( Rand::kModulus );
    return static_cast<float>( low + fraction * ( static_cast<double>( high ) - low ) );
}

3 ) 排序

[cpp] view plain copy print ?

void dataOrder( constchar* filName )
{
float* pbuf = ( float *)malloc( batchCapacity * sizeof(float ) );
if ( pbuf == NULL )
{
throw( "failed to malloc " );
}
ifstream fin;
ofstream fout;
fin.exceptions(std::ios::badbit | std::ios::failbit | std::ios::eofbit );
fout.exceptions(std::ios::badbit | std::ios::failbit | std::ios::eofbit );
fin.open( filName, ios::binary );
fout.open( string( string(filName).append(".order") ).c_str(), ios::binary );
for( size_t index = 0;index < batchCount;index++ )
{
//　分批读入，排序
fin.read( (char*)pbuf, batchCapacity *sizeof( float ) );
std::sort( pbuf, pbuf + batchCapacity );
fout.write( (char*)pbuf, resultCount *sizeof( float ) );
cout << "writed bytes:" << resultCount * index + 1 << endl;
}
fin.close();
fout.close();
delete pbuf;
// 将分组的数据综合排序
pbuf = ( float *)malloc( resultCount * batchCount *sizeof( float ) );
if ( pbuf == NULL )
{
throw( "failed to malloc " );
}
fin.open( string( string(filName).append(".order") ).c_str(), ios::binary );
fin.read( (char*)pbuf, resultCount * batchCount *sizeof( float ) );
std::sort( pbuf, pbuf + resultCount * batchCount );
//merge_sort<float>( pbuf,0,( resultCount * batchCount ) - 1 );
// 输出
for( size_t index = 0; index < resultCount; index++ )
{
printf( "%d\t%f\n", index, pbuf[index ] );
}
fin.close();
delete pbuf;
pbuf = NULL;
}

/**
 * Finds the resultCount smallest floats stored in the binary file `filName`
 * and prints them in ascending order.
 *
 * Pass 1 reads the input in batchCount batches of batchCapacity floats,
 * orders each batch just far enough to know its resultCount smallest values,
 * and appends those values to "<filName>.order".
 * Pass 2 loads that intermediate file (resultCount * batchCount floats),
 * selects the overall resultCount smallest values and prints one
 * "<rank>\t<value>" line per value.
 *
 * @param filName path of the input file; it must hold at least
 *                batchCapacity * batchCount raw floats.
 * @throws const char*             when a buffer cannot be allocated.
 * @throws std::ios_base::failure  on any open/read/write/close failure
 *                                 (all stream exception bits are enabled).
 */
void dataOrder( const char* filName  )
{
    const std::string orderedName = std::string( filName ) + ".order";

    float* pbuf = static_cast<float*>( malloc( batchCapacity * sizeof( float ) ) );
    if ( pbuf == NULL )
    {
        throw( "failed to malloc " );
    }

    std::ifstream fin;
    std::ofstream fout;

    try
    {
        fin.exceptions( std::ios::badbit | std::ios::failbit | std::ios::eofbit );
        fout.exceptions( std::ios::badbit | std::ios::failbit | std::ios::eofbit );
        fin.open( filName, std::ios::binary );
        fout.open( orderedName.c_str(), std::ios::binary );

        for ( size_t index = 0; index < batchCount; index++ )
        {
            // Read one batch and keep only its resultCount smallest values.
            fin.read( reinterpret_cast<char*>( pbuf ), batchCapacity * sizeof( float ) );
            // Only the first resultCount elements are written out, so a
            // partial sort is enough; the tail order does not matter.
            std::partial_sort( pbuf, pbuf + resultCount, pbuf + batchCapacity );
            fout.write( reinterpret_cast<const char*>( pbuf ), resultCount * sizeof( float ) );
            // Cumulative number of bytes written to the intermediate file.
            std::cout << "writed bytes:" << resultCount * ( index + 1 ) * sizeof( float ) << std::endl;
        }
        fin.close();
        fout.close();
    }
    catch ( ... )
    {
        // malloc'ed memory must be released with free(), also on failure.
        free( pbuf );
        throw;
    }
    free( pbuf );

    // Merge step: order the per-batch candidates together.
    const size_t mergedCount = static_cast<size_t>( resultCount ) * batchCount;
    pbuf = static_cast<float*>( malloc( mergedCount * sizeof( float ) ) );
    if ( pbuf == NULL )
    {
        throw( "failed to malloc " );
    }

    try
    {
        fin.open( orderedName.c_str(), std::ios::binary );
        fin.read( reinterpret_cast<char*>( pbuf ), mergedCount * sizeof( float ) );
        std::partial_sort( pbuf, pbuf + resultCount, pbuf + mergedCount );

        // Output. size_t is cast explicitly so the printf format matches
        // the argument type.
        for ( size_t index = 0; index < resultCount; index++ )
        {
            printf( "%lu\t%f\n", static_cast<unsigned long>( index ), pbuf[index] );
        }
        fin.close();
    }
    catch ( ... )
    {
        free( pbuf );
        throw;
    }
    free( pbuf );
}

性能测试结果： p4 的 cpu，每秒大约处理 30万个记录。
整个程序：

[cpp] view plain copy print ?

// Default data file path, used when no path is given on the command line.
const char* filName = "c:\\float.df";

// Forward declarations of the program's operations.
void dataPrepare( const char* filName );
void helpInfo();
void dataOrder( const char* filName );
/**
 * Entry point. The first argument has the form "<c><op>[path]", where <c> is
 * any single prefix character, <op> selects the operation and the optional
 * remainder is the data file path:
 *   g - generate the sample data file (dataPrepare)
 *   o - find and print the smallest values (dataOrder)
 * Any other or missing operation prints the help text and exits.
 *
 * @return always 0; errors are reported on standard output.
 */
int main( int argc, char* argv[] )
{
    try
    {
        // Both argv[1][1] and argv[1] + 2 are read below, so the argument
        // must be at least two characters long.
        if ( argc < 2 || strlen( argv[1] ) < 2 )
        {
            helpInfo();
            return 0;
        }

        // Everything after the two-character option is the file path;
        // keep the default when it is empty.
        const char* filename = argv[1] + 2;
        if ( strlen( filename ) > 0 )
        {
            filName = filename;
        }

        switch ( argv[1][1] )
        {
        case 'g':
            dataPrepare( filName );
            break;
        case 'o':
            dataOrder( filName );
            break;
        default:
            helpInfo();
            return 0;
        }
    }
    catch ( const char* e )
    {
        // Errors raised by this program itself are thrown as C strings.
        std::cout << e << std::endl;
    }
    catch ( ... )
    {
        // Anything else, e.g. stream failures.
        std::cout << "unknown error" << std::endl;
    }
    system( "pause" );
    return 0;
}

Reference:
快速排序
The C++ Programming Language, by Bjarne Stroustrup, Chapter 18: Algorithms and Function Objects
线性同余法生成随机数
Introduction to Algorithms, Second Edition, by Thomas H. Cormen, Charles E. Leiserson, et al., 11.3 Hash functions 介绍了线性同余法的原理和用法。

三平均分区法：
Introduction to Algorithms, Second Edition, by Thomas H. Cormen, Charles E. Leiserson, et al.　Problems 7-5: Median-of-3 partition