提取某日访问次数最多的那个IP

最新推荐文章于 2023-02-21 15:53:24 发布
MeConverse
最新推荐文章于 2023-02-21 15:53:24 发布
阅读量717
点赞数
分类专栏：算法文章标签： buffer file ios mapreduce 测试 ini
算法专栏收录该内容
7 篇文章 0 订阅
订阅专栏
海量数据日志中，提取出某日访问次数最多的那个IP。
         思路：对于海量数据的处理，主要采取的策略就是分而治之，即缩减问题的规模，将一个大的问题划分成若干等价的小问题。然后解决这些小问题，最后将获得的小问题解综合起来，得出原问题的解。用到比较多的技术主要有散列、位图、堆、trie树、mapreduce、K路归并（败者树）等。其中散列用的尤为多。
          对于本问题，假定某日访问的IP地址已经从数据日志中提取出来，存放在一个大的二进制文件中。下面的工作主要是找目标IP——文件中出现次数最多的那个IP。这个文件很大，内存无法完全放下，内排序的方法行不通。可以采取如下措施：
         （1）利用散列函数，将大文件中的IP地址散列到若干个文件中。相同的IP地址肯定在同一个文件中。
         （2）处理每个小文件，找到该文件中出现次数最多的那个IP，记录下IP地址和出现次数。可以用hash_map，IP地址为键值、出现次数为数值。
         （3）将第（2）步中找到的IP地址及出现次数综合起来，找到这些IP地址中出现次数最多的那个IP。
          简单实现：接下来给出一种简单的实现，效率比较低。测试中，从一个含4亿个IP地址的文件中提取目标IP，一共用了52分钟。其中大量的时间用于文件的读写，约为30分钟。另外有7分钟用于产生含4亿个随机数的文件。真正用于计算的时间为15分钟。由于C++标准STL中没有hash_map，因此改用map实现第（2）步；如果换成hash_map，应该能减少部分计算的时间。
         另外，如果设置读写缓冲区，经过测试，缓冲区为128字节时，读写文件的时间从原来的30分钟减为25分钟左右。进一步增大缓冲区大小，速度的提升非常小，原因有待进一步分析。这里所说的设置缓冲区，并不是指下面这种方式：
 char buffer[1024]; 
 streambuf * ptrbuf = outFile.rdbuf(); 
  ptrbuf-> pubsetbuf(buffer,1024); 
 而是定义一个整型数组，每次读写时，读写一块数据而不是一个整数。
 单个读写   outFile.write((char*)&x,sizeof(unsigned));
 
 块读写       outFile.write((char *)buffer,BUFFER_SIZE*sizeof(unsigned));
 
 VC6.0下编译运行通过
 
 
    view plain 
   
 #pragma warning(disable:4786) //VC6.0中 忽略警告  
 #include <fstream>  
 #include <iostream>  
 #include <map>  
 #include <string>  
 #include <ctime>  
 using namespace std;  
   
 const unsigned N=400000000;      // number of random IP addresses to generate
 const unsigned FILE_NUM=16;      // number of small files produced
 const unsigned HASH_SHIFT=28;    // right-shift amount applied to the hash product
   
 inline unsigned HashInt(unsigned value); // hashes an integer to a value between 0 and FILE_NUM
 bool ProduceIP(string fileName);                // generates random IP addresses, each treated as a 32-bit unsigned number
 bool DecomposeFile(string fileName);     // divide and conquer: splits the big file into several small files
 bool FindTargetIP(unsigned result[2]);      // finds the IP that occurs most often
   
 int main()  
 {  
     unsigned start,end;      //记录总的运行时间  
     unsigned start1,end1;  //产生大文件的时间  
     unsigned start2,end2;  //分解大文件的时间  
     unsigned start3,end3;  //找出现IP次数最多的时间  
   
     string name="IP.bin";       //大文件  
     unsigned result[2]={0,0};   //保存结果  
   
     start=clock();  
     start1=clock();  
     //随机产生大量IP  
     if(ProduceIP(name)==false)  
         return 1;  
     end1=clock();  
   
     start2=clock();  
     //分而治之  
     if(DecomposeFile(name)==false)  
         return 1;  
     end2=clock();  
   
     start3=clock();  
     //找到出现次数最多的IP  
     if(FindTargetIP(result)==false)  
         return 1;  
     end3=clock();  
     end=clock();  
   
     //打印结果  
     cout<<"total run time : "<<(end-start)/1000.0<<endl;  
     cout<<"ProduceIP() run time : "<<(end1-start1)/1000.0<<endl;  
     cout<<"DecomposeFile() run time : "<<(end2-start2)/1000.0<<endl;  
     cout<<"FindTargetIP() run time : "<<(end3-start3)/1000.0<<endl;  
     cout<<"IP : "<<(result[0]>>24)<<'.'<<((result[0]&0x00ff0000)>>16)<<'.';  
     cout<<((result[0]&0x0000ff00)>>8)<<'.'<<((result[0]&0x000000ff))<<endl;  
     cout<<"appear time : "<<result[1]<<endl;  
     return 0;  
 }  
 //将整数散列到0到FILE_NUM之间  
 inline unsigned HashInt(unsigned value)  
 {  
     //斐波那契(Fibonacci)散列法 hash_key=(value * M) >> S;  
     //value是16位整数，M = 40503   
     //value是32位整数，M = 2654435769   
     //value是64位整数，M = 11400714819323198485  
     //S与桶的个数有数，如果桶的个数为16，那么S为28  
     //对于32位整数，S=32-log2(桶的个数)  
     return (value*2654435769)>>HASH_SHIFT;   
 }  
 //随机产生IP地址 看成是32位无符号数  
 bool ProduceIP(string fileName)  
 {  
     ofstream outFile(fileName.c_str(),ios::binary);  
     if(!outFile)  
     {  
         cerr<<"error: unable to open output file : "<<fileName<<endl;  
         return false;  
     }  
     srand(time(0));  
     for(unsigned i=0;i<N;i++)  
     {  
         //产生一个大整数用来模拟IP地址  
         unsigned x=((rand()%256)<<24)|((rand()%256)<<16)|((rand()%256)<<8)|(rand()%256);  
         outFile.write((char*)&x,sizeof(unsigned));  
     }  
     return true;  
 }  
 //分而治之 将大文件分为若干个小文件  
 bool DecomposeFile(string fileName)  
 {  
     ofstream outFiles[FILE_NUM];  
     int i;  
     for(i=0;i<FILE_NUM;i++)  
     {  
         //小文件的名称  
         char buffer[10];  
         string name="tmp";  
         itoa(i,buffer,10);  
         name=name+buffer+".bin";  
         //打开小文件  
         outFiles[i].open(name.c_str(),ios::binary);  
         if(!outFiles[i])  
         {  
             cerr<<"error: unable to open output file :"<<name<<endl;  
             return false;  
         }  
     }  
     ifstream inFile(fileName.c_str(),ios::binary);  
     while(inFile.good())   
     {  
         //散列到各个小文件中  
         unsigned int value=0;  
         if(inFile.read((char*)&value,sizeof(unsigned)))  
         {  
             outFiles[HashInt(value)].write((char*)&value,sizeof(unsigned));  
         }  
     }  
     //关闭文件  
     inFile.close();  
     for(i=0;i<FILE_NUM;i++)  
         outFiles[i].close();  
     return true;  
 }  
 //找到出现次数最多的IP  
 bool FindTargetIP(unsigned result[2])  
 {  
     result[0]=0;  
     result[1]=0;  
     for(int i=0;i<FILE_NUM;i++)  
     {  
         char buffer[10];  
         string name="tmp";  
         itoa(i,buffer,10);  
         name=name+buffer+".bin";  
         //处理每个小文件  
         ifstream inFile;  
         inFile.open(name.c_str(),ios::binary);  
         if(!inFile)  
         {  
             cerr<<"error: unable to open input file :"<<name<<endl;  
             return false;  
         }  
         //核心代码，由于STL中没有hash_map，用map来代替  
         map<unsigned,unsigned> ip_count;  
         while(inFile.good())  
         {  
             unsigned key=0;  
             if(inFile.read((char*)&key,sizeof(unsigned)))  
             {  
                 ip_count[key]++;  
             }  
         }  
         map<unsigned,unsigned>::iterator it=ip_count.begin();  
         for(;it!=ip_count.end();it++)  
         {  
             if(it->second>result[1])  
             {  
                 result[0]=it->first;  
                 result[1]=it->second;  
             }  
         }  
         inFile.close();  
     }  
     return true;  
 }  
 
 设置缓冲区后的代码。问题描述见 “ 解题笔记（9）——提取某日访问次数最多的那个IP ”
 
 
    view plain 
   
 #pragma warning(disable:4786) //VC6.0中 忽略警告  
 #include <fstream>  
 #include <iostream>  
 #include <map>  
 #include <string>  
 #include <ctime>  
 using namespace std;  
   
 const unsigned N=400000000;        // number of random IP addresses to generate
 const unsigned FILE_NUM=16;       // number of small files produced
 const unsigned HASH_SHIFT=28;    // right-shift amount applied to the hash product
 // number of unsigned values held in one read/write block
 const unsigned BUFFER_SIZE=32;  
   
 inline unsigned HashInt(unsigned value); // hashes an integer to a value between 0 and FILE_NUM
 bool ProduceIP(string fileName);         // generates random IP addresses, each treated as a 32-bit unsigned number
 bool DecomposeFile(string fileName);     // divide and conquer: splits the big file into several small files
 bool FindTargetIP(unsigned result[2]);   // finds the IP that occurs most often
   
 int main()  
 {  
     unsigned start,end;    //记录总的运行时间  
     unsigned start1,end1;  //产生大文件的时间  
     unsigned start2,end2;  //分解大文件的时间  
     unsigned start3,end3;  //找出现IP次数最多的时间  
   
     string name="IP.bin";       //大文件  
     unsigned result[2]={0,0};   //保存结果  
   
     start=clock();  
     start1=clock();  
     //随机产生大量IP  
     if(ProduceIP(name)==false)  
         return 1;  
     end1=clock();  
   
     start2=clock();  
     //分而治之  
     if(DecomposeFile(name)==false)  
         return 1;  
     end2=clock();  
   
     start3=clock();  
     //找到出现次数最多的IP  
     if(FindTargetIP(result)==false)  
         return 1;  
     end3=clock();  
     end=clock();  
   
     //打印结果  
     cout<<"total run time : "<<(end-start)/1000.0<<endl;  
     cout<<"ProduceIP() run time : "<<(end1-start1)/1000.0<<endl;  
     cout<<"DecomposeFile() run time : "<<(end2-start2)/1000.0<<endl;  
     cout<<"FindTargetIP() run time : "<<(end3-start3)/1000.0<<endl;  
     cout<<"IP : "<<(result[0]>>24)<<'.'<<((result[0]&0x00ff0000)>>16)<<'.';  
     cout<<((result[0]&0x0000ff00)>>8)<<'.'<<((result[0]&0x000000ff))<<endl;  
     cout<<"appear time : "<<result[1]<<endl;  
     return 0;  
 }  
   
 //将整数散列到0到FILE_NUM之间  
 inline unsigned HashInt(unsigned value)  
 {  
     //斐波那契(Fibonacci)散列法 hash_key=(value * M) >> S;  
     //value是16位整数，M = 40503   
     //value是32位整数，M = 2654435769   
     //value是64位整数，M = 11400714819323198485  
     //S与桶的个数有数，如果桶的个数为16，那么S为28  
     //对于32位整数，S=32-log2(桶的个数)  
     return (value*2654435769)>>HASH_SHIFT;   
 }  
 //随机产生IP地址 看成是32位无符号数  
 bool ProduceIP(string fileName)  
 {  
     ofstream outFile(fileName.c_str(),ios::binary);  
     if(!outFile)  
     {  
         cerr<<"error: unable to open output file : "<<fileName<<endl;  
         return false;  
     }  
     srand(time(0));  
   
     unsigned i,j=0;  
     unsigned buffer[BUFFER_SIZE];  
     for(i=0;i<N;i++)  
     {  
         //产生一个大整数用来模拟IP地址  
         unsigned x=((rand()%256)<<24)|((rand()%256)<<16)|((rand()%256)<<8)|(rand()%256);  
         buffer[j++]=x;  
         if(BUFFER_SIZE==j)  
         {  
             outFile.write((char *)buffer,BUFFER_SIZE*sizeof(unsigned));  
             j=0;  
         }  
     }  
     outFile.write((char *)buffer,j*sizeof(unsigned));  
     return true;  
 }  
 //分而治之 将大文件分为若干个小文件  
 bool DecomposeFile(string fileName)  
 {  
     ofstream outFiles[FILE_NUM];  
     int i;  
     for(i=0;i<FILE_NUM;i++)  
     {  
         //小文件的名称  
         char str[10];  
         string name="tmp";  
         itoa(i,str,10);  
         name=name+str+".bin";  
         //打开小文件  
         outFiles[i].open(name.c_str(),ios::binary);  
         if(!outFiles[i])  
         {  
             cerr<<"error: unable to open output file :"<<name<<endl;  
             return false;  
         }  
     }  
     ifstream inFile(fileName.c_str(),ios::binary);  
   
     unsigned buffer[FILE_NUM][BUFFER_SIZE];   
     unsigned j[FILE_NUM]={0};  
     while(inFile.good())  
     {  
         unsigned value;  
         if(inFile.read((char*)&value,sizeof(unsigned)))  
         {  
             unsigned h=HashInt(value);  
             buffer[h][j[h]++]=value;  
             if(BUFFER_SIZE==j[h])  
             {  
                 outFiles[h].write((char *)buffer[h],BUFFER_SIZE*sizeof(unsigned));  
                 j[h]=0;  
             }  
         }  
     }  
     for(i=0;i<FILE_NUM;i++)  
         outFiles[i].write((char *)buffer[i],j[i]*sizeof(unsigned));  
     //关闭文件  
     inFile.close();  
     for(i=0;i<FILE_NUM;i++)  
         outFiles[i].close();  
     return true;  
 }  
 //找到出现次数最多的IP  
 bool FindTargetIP(unsigned result[2])  
 {  
     result[0]=0;  
     result[1]=0;  
     for(int i=0;i<FILE_NUM;i++)  
     {  
         char str[10];  
         string name="tmp";  
         itoa(i,str,10);  
         name=name+str+".bin";  
         //处理每个小文件  
         ifstream inFile;  
         inFile.open(name.c_str(),ios::binary);  
         if(!inFile)  
         {  
             cerr<<"error: unable to open input file :"<<name<<endl;  
             return false;  
         }  
         //核心代码，由于STL中没有hash_map，用map来代替  
         map<unsigned,unsigned> ip_count;  
         while(inFile.good())  
         {  
             unsigned buffer[BUFFER_SIZE];  
             int readNum=0;  
             inFile.read((char*)buffer,BUFFER_SIZE*sizeof(unsigned));  
             readNum=inFile.gcount()>>2;  
             for(int j=readNum;j>0;j--)  
             {  
                 ip_count[buffer[j-1]]++;  
             }  
         }  
         map<unsigned,unsigned>::iterator it=ip_count.begin();  
         for(;it!=ip_count.end();it++)  
         {  
             if(it->second>result[1])  
             {  
                 result[0]=it->first;  
                 result[1]=it->second;  
             }  
         }  
         inFile.close();  
     }  
     return true;  
 }