从海量数据中找出重复次数最多的一个

从海量数据中找出重复次数最多的一个  

2011-05-31 14:12:38|  分类: 海量数据处理|字号 订阅

/************************************************************************/
/* 从海量数据中找出重复次数最多的一个
   思路:先用哈希表统计每个数据出现的频率,并按哈希值把统计结果写入100个小文件,
        小文件中的每条记录包括两项(数值,出现次数);然后对每一个小文件求出重复
        次数最多的一个数据;最后对各个小文件中出现最多的数据项通过二分递归(分治)
        进行比较,找出频率最大的即为所求
   性能:时间复杂度:O(N)+100*O(N1)+O(n),其中N为数据总数,N1为小文件的平均记录数,n=100*/
/************************************************************************/

#include<iostream>
#include<fstream>
#include<malloc.h>
#include<stdlib.h>
const int ERROR=0;// status code constant; note that 0 is also the conventional "success" status for exit()
using namespace std;

// Node of a singly linked bucket chain used by the hash table.
struct LinkHash
{
 LinkHash *next;// next node in the same chain, or NULL at the tail
 int m_nValue;// the data value stored in this node
 int count;// number of times the value has occurred
};
// A (value, occurrence count) record, as read back from the small files.
struct _Data
{
 int Value;// the data value
 int Count;// how many times Value occurred
};
// Names of the small per-bucket files; file[i] belongs to hash bucket i.
// The array is declared with 101 slots but only 100 names are listed, so the
// last slot is a null pointer and must not be used as a file name.
char *file[101]=
{"file1.txt","file2.txt","file3.txt","file4.txt","file5.txt","file6.txt","file7.txt","file8.txt","file9.txt","file10.txt",
"file11.txt","file12.txt","file13.txt","file14.txt","file15.txt","file16.txt","file17.txt","file18.txt","file19.txt","file20.txt",
"file21.txt","file22.txt","file23.txt","file24.txt","file25.txt","file26.txt","file27.txt","file28.txt","file29.txt","file30.txt",
"file31.txt","file32.txt","file33.txt","file34.txt","file35.txt","file36.txt","file37.txt","file38.txt","file39.txt","file40.txt",
"file41.txt","file42.txt","file43.txt","file44.txt","file45.txt","file46.txt","file47.txt","file48.txt","file49.txt","file50.txt",
"file51.txt","file52.txt","file53.txt","file54.txt","file55.txt","file56.txt","file57.txt","file58.txt","file59.txt","file60.txt",
"file61.txt","file62.txt","file63.txt","file64.txt","file65.txt","file66.txt","file67.txt","file68.txt","file69.txt","file70.txt",
"file71.txt","file72.txt","file73.txt","file74.txt","file75.txt","file76.txt","file77.txt","file78.txt","file79.txt","file80.txt",
"file81.txt","file82.txt","file83.txt","file84.txt","file85.txt","file86.txt","file87.txt","file88.txt","file89.txt","file90.txt",
"file91.txt","file92.txt","file93.txt","file94.txt","file95.txt","file96.txt","file97.txt","file98.txt","file99.txt","file100.txt"};
// Hash table with separate chaining that counts occurrences of int values
// and can dump each bucket's counts to its own file.
class CHashTable
{
private:
 LinkHash *HashTable[101];// bucket head pointers; the constructor allocates a head node for indices 0..99
public:
 CHashTable();
 ~CHashTable();

 void HashCollision(int data);// record one occurrence of data
 void WriteToFile();// write each bucket's (value, count) pairs to its file
 _Data GetMaxFreq(char *filename);// record with the highest count found in filename

};
// Builds the table: every bucket 0..99 gets a dummy head node so that
// insertion never has to special-case an empty chain.
// Terminates the process with a failure status if memory runs out.
CHashTable::CHashTable()
{
 for(int i=0;i<100;i++)
 {
  HashTable[i]=(LinkHash*)malloc(sizeof(LinkHash));
  if(!HashTable[i])
   exit(EXIT_FAILURE);// exit(0) would report success to the caller
  HashTable[i]->count=0;
  HashTable[i]->next=NULL;
  HashTable[i]->m_nValue=-1;// placeholder; head nodes hold no data
 }
 // The array has 101 slots but only 100 buckets are used; give the spare
 // slot a defined value instead of leaving it indeterminate.
 HashTable[100]=NULL;
}
// Releases every node owned by the table (dummy heads and chained entries).
// NOTE: the class defines no copy operations, so a copied instance would share
// these nodes and free them twice; instances must not be copied.
CHashTable::~CHashTable()
{
 for(int i=0;i<100;i++)// only slots 0..99 ever receive an allocated head
 {
  LinkHash *node=HashTable[i];
  while(node)
  {
   LinkHash *following=node->next;// read the link before the node is freed
   free(node);
   node=following;
  }
  HashTable[i]=NULL;
 }
}
// Maps a key to a bucket index in [0, 99].
// The % operator yields a negative remainder for negative keys, which would
// index before the start of the bucket array, so such results are shifted
// back into range. Non-negative keys map exactly as key % 100.
int HashFunc(int key)
{
 int bucket=key%100;
 if(bucket<0)
  bucket+=100;
 return bucket;
}

void CHashTable::HashCollision(int data)//链地址法处理冲突
{
 LinkHash *newNode;
 LinkHash *head;
 newNode=(LinkHash*)malloc(sizeof(LinkHash));
 if(!newNode)
  exit(ERROR);
 newNode->next=NULL;
 newNode->m_nValue=data;
 newNode->count=0;

 int p;
 bool isRep=false;//重复出现
 p=HashFunc(data);
 head=HashTable[p];
 while(head->next)
 {
  head=head->next;
  if(head->m_nValue==data)
  {
   head->count++;//有重复的数据统计出现的次数
   isRep=true;
   break;
  }
  
 }
 if(isRep==false)//如果没有重复的数据,则将数据插入
 {
  head->next=newNode;
     head=newNode;
     head->count++;
 }

 
}
void CHashTable::WriteToFile()//将结果写入100个小文件中
{
 int i;
 ofstream fout;
 for(i=0;i<100;i++)
 {
  LinkHash *p;
  fout.open(file[i]);
   if(HashTable[i]->next)
   {
    p=HashTable[i]->next;
    while(p)
    {
     fout<<p->m_nValue<<" "<<p->count<<endl;
     p=p->next;
    }
   }
  fout.close();
  fout.clear();
 }
}

// Scans a file of "value count" records and returns the record with the
// largest count; the first such record wins ties. One pass over the file.
// If the file cannot be opened or holds no record with a positive count,
// the result is {Value=-1, Count=0}.
_Data CHashTable::GetMaxFreq(char *filename)
{
 _Data MaxData;
 MaxData.Value=-1;// defined result for an empty or missing file
 MaxData.Count=0;

 ifstream fin(filename);// read-only access is enough for a scan
 _Data InData;
 while(fin>>InData.Value>>InData.Count)// fails at once if the open failed
 {
  if(InData.Count>MaxData.Count)
   MaxData=InData;
 }
 return MaxData;
}
// Finds the element with the largest Count in Array[start..end] (inclusive)
// by recursively halving the range, and stores it in Max.
// An empty range (start > end) leaves Max unchanged.
// Every element is examined once, so the work is linear in the range length.
void BiSearchMax(_Data Array[],int start,int end,_Data &Max)
{
 if(start>end)// empty range: recursing further would never terminate
  return;
 if(start==end)
 {
  Max=Array[start];
  return;
 }
 if(end-start==1)
 {
  // On equal counts the later element is chosen.
  Max=(Array[start].Count>Array[end].Count)?Array[start]:Array[end];
  return;
 }

 int mid=start+(end-start)/2;
 _Data Right;// winner of the upper half; always assigned because mid+1<=end
 BiSearchMax(Array,start,mid,Max);
 BiSearchMax(Array,mid+1,end,Right);
 if(Right.Count>Max.Count)// on equal counts the lower half's winner is kept
  Max=Right;
}
int main()
{
 CHashTable HTable=CHashTable();
    fstream fin;
 ofstream fout;
 int i,data,indata;
 _Data FileData[101];
 _Data MaxFreq;
 MaxFreq.Count=0;
 
 fout.open("input.txt");
 for(i=0;i<1000000;i++)//生成100000个数据
 {
  data=1+rand()%1000;//从1到1000的随机数
  fout<<data<<" ";
 }
 fout.close();
 
 fin.open("input.txt");
 if(fin.is_open())
 {
  while(fin>>indata)//对海量数据进行遍历执行次数T(n)=N,时间复杂度O(N)
     { 
       HTable.HashCollision(indata);
     }
 }
  
 HTable.WriteToFile();
    for(i=0;i<100;i++)//分别获取100个文件中频率最大的数据,执行次数100*N1(N1为100个文件平均长度),时间复杂度100*O(N1)
 {
  FileData[i]=HTable.GetMaxFreq(file[i]);
    cout<<FileData[i].Value<<" "<<FileData[i].Count<<endl;
 }
 BiSearchMax(FileData,0,99,MaxFreq);//对100个数据进行二路归并查找最大值,时间复杂度O(nlogn)
  cout<<"出现最多的是"<<MaxFreq.Value<<" "<<MaxFreq.Count<<endl;
    
 return 1;
 

}

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值