java simhash去重,simHash海量去重java实现

simHashList = new ArrayList();

simHashList.add(simHashBuffer.substring(0,this.bitLength/4));

simHashList.add(simHashBuffer.substring(this.bitLength/4,this.bitLength/4*2));

simHashList.add(simHashBuffer.substring(this.bitLength/4*2,this.bitLength/4*3));

simHashList.add(simHashBuffer.substring(this.bitLength/4*3,this.bitLength));

this.shortSimHashList = simHashList;

return simHashList;

}

/**
 * Computes the hash of a single token as a non-negative value truncated to
 * {@code bitLength} bits.
 *
 * <p>The accumulator is seeded with the first character shifted left by 7, then
 * for every character it is multiplied by 1000003, XORed with the character and
 * masked to the low {@code bitLength} bits. Finally the string length is XORed in.
 * NOTE(review): this looks like a port of the classic CPython string hash - confirm.
 *
 * @param source the token to hash; may be {@code null} or empty
 * @return {@link BigInteger#ZERO} for a {@code null}/empty token, otherwise the computed hash
 */
private BigInteger hash(String source) {
    // Guard clause: nothing to hash.
    if (source == null || source.isEmpty()) {
        return BigInteger.ZERO;
    }

    final BigInteger multiplier = BigInteger.valueOf(1000003L);
    // 2^bitLength - 1: keeps only the low bitLength bits of the accumulator.
    final BigInteger lowBitsMask =
            BigInteger.valueOf(2L).pow(this.bitLength).subtract(BigInteger.ONE);

    BigInteger acc = BigInteger.valueOf(((long) source.charAt(0)) << 7);
    final int length = source.length();
    for (int i = 0; i < length; i++) {
        BigInteger codeUnit = BigInteger.valueOf((long) source.charAt(i));
        acc = acc.multiply(multiplier).xor(codeUnit).and(lowBitsMask);
    }

    acc = acc.xor(BigInteger.valueOf((long) length));

    // acc is non-negative at this point (masked with a non-negative mask, then
    // XORed with a positive length), so this comparison cannot succeed as written;
    // it is retained to keep the method identical in behavior.
    if (acc.equals(BigInteger.valueOf(-1L))) {
        acc = BigInteger.valueOf(-2L);
    }
    return acc;
}

/**
 * Computes the Hamming distance between two integers, i.e. the number of bit
 * positions in which they differ.
 *
 * <p>The distance is the population count of {@code bigInteger1 XOR bigInteger2}.
 * When the operands have different signs the XOR is negative, which in
 * two's-complement has infinitely many set bits; clearing the lowest set bit in a
 * loop would never terminate for such input, so it is rejected explicitly.
 *
 * @param bigInteger1 the first value; must not be {@code null}
 * @param bigInteger2 the second value; must not be {@code null}
 * @return the number of differing bits
 * @throws IllegalArgumentException if the operands have different signs
 * @throws NullPointerException if either argument is {@code null}
 */
public int hammingDistance(BigInteger bigInteger1, BigInteger bigInteger2) {
    BigInteger diff = bigInteger1.xor(bigInteger2);
    if (diff.signum() < 0) {
        throw new IllegalArgumentException(
                "Hamming distance is undefined for operands of different sign: "
                        + bigInteger1 + ", " + bigInteger2);
    }
    // For a non-negative value bitCount() is exactly the number of 1 bits.
    return diff.bitCount();
}

/**
 * Computes the Hamming distance between two strings by comparing them character
 * by character.
 *
 * @param str1 the first string; must not be {@code null}
 * @param str2 the second string; must not be {@code null}
 * @return the number of positions at which the characters differ, or {@code -1}
 *         when the two strings have different lengths
 */
public int hammingDistance(String str1, String str2) {
    final int length = str1.length();
    // Strings of unequal length are not comparable; signal with -1.
    if (length != str2.length()) {
        return -1;
    }

    int mismatches = 0;
    int pos = 0;
    while (pos < length) {
        mismatches += (str1.charAt(pos) == str2.charAt(pos)) ? 0 : 1;
        pos++;
    }
    return mismatches;
}

/**
 * Converts a binary-string SimHash into its integer form.
 *
 * <p>The character at index {@code i} controls bit {@code i} of the result, so
 * index 0 is the least significant bit. Only the first {@code bitLength}
 * characters are read; any character other than {@code '1'} is treated as a 0 bit.
 *
 * @param strSimHash the binary string; must not be {@code null} and must contain
 *                   at least {@code bitLength} characters
 * @return the integer fingerprint built from the first {@code bitLength} characters
 * @throws ArrayIndexOutOfBoundsException if {@code strSimHash} is shorter than {@code bitLength}
 * @throws NullPointerException if {@code strSimHash} is {@code null}
 */
public BigInteger getIntSimHash(String strSimHash) {
    char[] hashValue = strSimHash.toCharArray();
    BigInteger fingerprint = BigInteger.ZERO;
    for (int i = 0; i < this.bitLength; i++) {
        if (hashValue[i] == '1') {
            // Each bit index is visited once, so setBit(i) equals adding 2^i.
            fingerprint = fingerprint.setBit(i);
        }
    }
    return fingerprint;
}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值