通过基数估值的方法来得到大量数据中重复的列。
算法步骤:随机生成大量数据,利用 MurmurHash 得到 32 位的 hash 值,按 2 的 10 次方(1024)个桶进行分桶,并统计落入每个桶的数据个数。
package test;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
class jishuguzhi {
public static void main(String args[])
{
engine en=new engine();
en.jisuan();
}
}
/**
 * Hashes randomly generated values with the 32-bit MurmurHash and counts how
 * many hashes fall into each of 2^10 buckets, where the bucket index is the
 * top 10 bits of the 32-bit hash.
 */
class engine
{
    /** Number of leading hash bits that select the bucket (2^10 = 1024 buckets). */
    private static final int BUCKET_BITS = 10;

    /** Width in bits of the hash produced by {@code MurmurHash.hash32}. */
    private static final int HASH_BITS = 32;

    /** Seed handed to the hash function for every sample. */
    private static final int HASH_SEED = 33333333;

    /** Number of samples drawn by the no-argument {@link #jisuan()}. */
    private static final long DEFAULT_SAMPLES = 100000000000L;

    /** Bucket index (0 .. 2^10 - 1) mapped to the number of samples seen in that bucket. */
    HashMap<Integer,Integer> map=new HashMap<Integer, Integer>();

    MurmurHash m=new MurmurHash();

    /**
     * Computes the 32-bit MurmurHash of the given bytes.
     *
     * @param data   bytes to hash
     * @param length number of bytes of {@code data} to hash
     * @param seed   hash seed
     * @return the 32-bit hash widened to {@code long}; because the underlying
     *         value is an {@code int}, a negative hash is sign-extended
     */
    public long gethash(byte [] data,int length,int seed)
    {
        return m.hash32(data, length, seed);
    }

    /**
     * Runs {@link #jisuan(long)} with the default sample count and prints the
     * resulting bucket histogram.
     */
    public void jisuan()
    {
        jisuan(DEFAULT_SAMPLES);
    }

    /**
     * Draws {@code samples} random {@code long} values, hashes the decimal text
     * of each one, adds one to the counter of the bucket the hash falls into,
     * and finally prints every non-empty bucket as {@code bucket→count}.
     * Counts accumulate in {@link #map} across calls.
     *
     * @param samples number of random values to draw; must not be negative
     * @throws IllegalArgumentException if {@code samples} is negative
     */
    public void jisuan(long samples)
    {
        if (samples < 0) {
            throw new IllegalArgumentException("samples must be non-negative: " + samples);
        }

        // A single generator is enough for the whole run; creating one per
        // sample only adds allocation and reseeding work.
        Random random = new Random();
        for (long i = 0; i < samples; i++) {
            byte[] data = String.valueOf(random.nextLong()).getBytes();
            long hash = gethash(data, data.length, HASH_SEED);

            // The same Integer key is used for lookup and store so that the
            // counter of an existing bucket is actually incremented.
            Integer bucket = Integer.valueOf(bucketOf(hash));
            Integer seen = map.get(bucket);
            map.put(bucket, seen == null ? 1 : seen + 1);
        }

        // Print the content of the histogram.
        for (Integer key : map.keySet()) {
            int value = map.get(key);
            System.out.println(key + "→" + value);
        }
    }

    /**
     * Extracts the bucket index from a hash returned by {@link #gethash}.
     * Shifting and masking works on the fixed-width bit pattern, so leading
     * zero bits and sign extension of negative hashes do not affect the result.
     *
     * @param hash hash value whose low 32 bits carry the MurmurHash result
     * @return the top {@code BUCKET_BITS} bits of the 32-bit hash, in
     *         {@code [0, 2^BUCKET_BITS)}
     */
    private static int bucketOf(long hash)
    {
        return (int) ((hash >>> (HASH_BITS - BUCKET_BITS)) & ((1 << BUCKET_BITS) - 1));
    }
}
/**
 * 32-bit and 64-bit Murmur-style hash functions over byte arrays, plus a
 * convenience hash for strings. The class has a public constructor and all
 * hashing methods are instance methods; no instance state is kept.
 */
final class MurmurHash {
    private static final long serialVersionUID = 4342869264396184799L;

    /** Creates a hasher. */
    public MurmurHash() {
    }

    /**
     * Converts a string to bytes without applying a character encoding: every
     * {@code char} contributes two bytes, low byte first.
     *
     * @param str string to convert
     * @return array of {@code 2 * str.length()} bytes
     */
    protected byte[] toBytesWithoutEncoding(String str) {
        final int chars = str.length();
        final byte[] out = new byte[chars << 1];
        for (int i = 0; i < chars; i++) {
            final char ch = str.charAt(i);
            out[2 * i] = (byte) (ch & 0xFF);
            out[2 * i + 1] = (byte) (ch >> 8);
        }
        return out;
    }

    /**
     * Hashes a string with the 32-bit hash and the default 32-bit seed, using
     * the byte layout produced by {@link #toBytesWithoutEncoding(String)}.
     *
     * @param str string to hash
     * @return 32 bit hash of the string
     */
    public int hashcode(String str) {
        final byte[] raw = toBytesWithoutEncoding(str);
        return hash32(raw, raw.length, 0x9747b28c);
    }

    /**
     * Generates a 32 bit hash from the first {@code length} bytes of the
     * array, starting from the given seed.
     *
     * @param data byte array to hash
     * @param length number of bytes to hash
     * @param seed initial seed value
     * @return 32 bit hash of the given bytes
     */
    public int hash32(final byte[] data, int length, int seed) {
        // Mixing constants of the algorithm.
        final int mul = 0x5bd1e995;
        final int shift = 24;

        int hash = seed ^ length;

        // Body: consume the input four bytes at a time, little-endian.
        final int words = length / 4;
        for (int w = 0, off = 0; w < words; w++, off += 4) {
            int k = (data[off] & 0xff)
                    | ((data[off + 1] & 0xff) << 8)
                    | ((data[off + 2] & 0xff) << 16)
                    | ((data[off + 3] & 0xff) << 24);
            k *= mul;
            k ^= k >>> shift;
            k *= mul;
            hash = (hash * mul) ^ k;
        }

        // Tail: fold in the 1-3 bytes that did not fill a whole word,
        // highest remaining byte first.
        final int tail = length % 4;
        final int base = length & ~3;
        if (tail >= 3) {
            hash ^= (data[base + 2] & 0xff) << 16;
        }
        if (tail >= 2) {
            hash ^= (data[base + 1] & 0xff) << 8;
        }
        if (tail >= 1) {
            hash ^= (data[base] & 0xff);
            hash *= mul;
        }

        // Final avalanche.
        hash ^= hash >>> 13;
        hash *= mul;
        hash ^= hash >>> 15;
        return hash;
    }

    /**
     * Generates a 32 bit hash from the byte array using the default seed
     * {@code 0x9747b28c}.
     *
     * @param data byte array to hash
     * @param length number of bytes to hash
     * @return 32 bit hash of the given bytes
     */
    public int hash32(final byte[] data, int length) {
        final int defaultSeed = 0x9747b28c;
        return hash32(data, length, defaultSeed);
    }

    /**
     * Generates a 64 bit hash from the first {@code length} bytes of the
     * array, starting from the given seed.
     *
     * @param data byte array to hash
     * @param length number of bytes to hash
     * @param seed initial seed value (treated as an unsigned 32 bit quantity)
     * @return 64 bit hash of the given bytes
     */
    public long hash64(final byte[] data, int length, int seed) {
        final long mul = 0xc6a4a7935bd1e995L;
        final int shift = 47;

        long hash = (seed & 0xffffffffL) ^ (length * mul);

        // Body: consume the input eight bytes at a time, little-endian.
        final int words = length / 8;
        for (int w = 0, off = 0; w < words; w++, off += 8) {
            long k = 0L;
            for (int b = 0; b < 8; b++) {
                k |= (data[off + b] & 0xffL) << (8 * b);
            }
            k *= mul;
            k ^= k >>> shift;
            k *= mul;
            hash = (hash ^ k) * mul;
        }

        // Tail: fold in the 1-7 leftover bytes, highest remaining byte first.
        final int tail = length % 8;
        final int base = length & ~7;
        for (int b = tail - 1; b >= 0; b--) {
            hash ^= (data[base + b] & 0xffL) << (8 * b);
        }
        if (tail > 0) {
            hash *= mul;
        }

        // Final avalanche.
        hash ^= hash >>> shift;
        hash *= mul;
        hash ^= hash >>> shift;
        return hash;
    }

    /**
     * Generates a 64 bit hash from the byte array using the default seed
     * {@code 0xe17a1465}.
     *
     * @param data byte array to hash
     * @param length number of bytes to hash
     * @return 64 bit hash of the given bytes
     */
    public long hash64(final byte[] data, int length) {
        final int defaultSeed = 0xe17a1465;
        return hash64(data, length, defaultSeed);
    }
}