Loglog算法
1:Initialize M[1],M[2],…M[m] to 0;
2: for each element x read from the file “stream_for_fm.txt”, do the following 3-6:
3: Let h(x) be the hash value of the element x in binary form, and let p(y) be the rank (1-based position) of the first 1-bit from the right in y (for example, if y=1100 then p(y)=3; if y=1111 then p(y)=1);
4: set j = h(x)%m + 1; //treat h(x) as an integer, then get the bucket id//
5: set w = floor(h(x)/m); //when m is a power of two, this removes the log2(m) least significant bits of h(x)//
6: set M[j] = max(M[j],p(w));
7: set s = (M[1]+M[2]+…+M[m])/m; and return $E = 0.39701 \cdot m \cdot 2^{s}$ as the estimate for the number of distinct elements in the file;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <map>
#include <math.h>
#include <time.h>
#define N (int)10000+1
using namespace std;
// Iterator over a map<int,int>; declared at file scope alongside `total`.
map<int,int>::iterator iter;
// Occurrence count for each integer read from the input file (value -> count).
map<int,int> total;
// NOTE(review): a and b are not referenced anywhere in the visible source.
int a,b;
// Returns the 1-based position, counted from the least significant bit,
// of the lowest set bit of n; returns 0 when n is 0.
int Tail_first_one(int n);
// Stores the larger of M[i] and x into M[i].
void Select_max(int *M,int x,int i);
/*
 * LogLog cardinality estimate for the integers stored in "stream_for_fm.txt".
 *
 * Reads the bucket count m from stdin and loads every integer from the file.
 * Each distinct value is used directly as its own hash h:
 *   bucket j = (h mod m) + 1, w = h / m,
 *   M[j] keeps the largest Tail_first_one(w) seen for that bucket.
 * Prints 0.39701 * m * 2^(mean of M[1..m]).
 *
 * Returns 0 on success, 1 when m is invalid or the input file cannot be opened.
 */
int main()
{
    // M is indexed 1..m, so it needs one slot more than the largest m accepted.
    const int kMaxBuckets = 10000;
    int M[kMaxBuckets + 1] = {0};

    int m = 0;
    printf("输入m大小: ");
    // Reject an unreadable m, a non-positive m (division by zero below)
    // and an oversized m (would index past the end of M).
    if (scanf("%d", &m) != 1 || m < 1 || m > kMaxBuckets)
    {
        fprintf(stderr, "m must be an integer between 1 and %d\n", kMaxBuckets);
        return 1;
    }

    FILE *fp = fopen("stream_for_fm.txt", "r");
    if (fp == NULL)
    {
        printf("open file falled!");
        return 1; // no stream to read from; going on would use a null FILE*
    }

    // Loop on the conversion result instead of feof(): feof() only turns true
    // after a read has already failed, which would record a stale or
    // uninitialised value, and it never turns true on a non-numeric token.
    int data = 0;
    while (fscanf(fp, "%d", &data) == 1)
    {
        total[data]++;
    }
    // A FILE* obtained from fopen must be released with fclose, not free.
    fclose(fp);

    // Iterating the map visits each distinct value exactly once.
    for (std::map<int,int>::const_iterator it = total.begin(); it != total.end(); ++it)
    {
        const int h = it->first;

        // % keeps the sign of h, so fold negative remainders back into 0..m-1
        // before shifting to the 1-based bucket index.
        int j = h % m;
        if (j < 0)
        {
            j += m;
        }
        j += 1;

        const int w = h / m;
        Select_max(M, Tail_first_one(w), j);
    }

    double sum = 0;
    for (int i = 1; i <= m; i++)
    {
        sum += M[i];
    }

    const double estimate = 0.39701 * m * pow(2.0, sum / m);
    printf("Loglog算法估计值:%f", estimate);
    return 0;
}
// Raise bucket M[i] to x when x is larger; otherwise leave it as it is.
//   M - bucket array; no bounds check is performed on i here
//   x - candidate value
//   i - index of the bucket to update
void Select_max(int *M,int x,int i)
{
    int &slot = M[i];
    if (slot < x)
    {
        slot = x;
    }
}
// Position of the lowest set bit of n, counted from 1 at the least
// significant bit (e.g. 12 = 1100b -> 3, 15 = 1111b -> 1).
// Returns 0 when n is 0, since no bit is set.
int Tail_first_one(int n)
{
    if (n == 0)
    {
        return 0;
    }

    int rank = 1;
    // Each even step drops one trailing zero bit and moves the rank up by one.
    while (n % 2 == 0)
    {
        n /= 2;
        ++rank;
    }
    return rank;
}