simHashList = new ArrayList();
simHashList.add(simHashBuffer.substring(0,this.bitLength/4));
simHashList.add(simHashBuffer.substring(this.bitLength/4,this.bitLength/4*2));
simHashList.add(simHashBuffer.substring(this.bitLength/4*2,this.bitLength/4*3));
simHashList.add(simHashBuffer.substring(this.bitLength/4*3,this.bitLength));
this.shortSimHashList = simHashList;
return simHashList;
}
/**
 * Computes the hash of a single token.
 *
 * <p>The accumulator is seeded with the first character shifted left by 7 bits. Every
 * character (including the first) is then folded in by multiplying the accumulator by
 * 1000003, XOR-ing with the character code and masking to the low {@code bitLength} bits.
 * Finally the token length is XOR-ed in; that last step is not masked.
 *
 * @param source the token to hash; may be {@code null} or empty
 * @return zero for a {@code null} or empty token, otherwise the non-negative hash value
 */
private BigInteger hash(String source) {
    if (source == null || source.isEmpty()) {
        return BigInteger.ZERO;
    }

    final BigInteger multiplier = BigInteger.valueOf(1000003L);
    // 2^bitLength - 1: keeps only the low bitLength bits after each mixing step.
    final BigInteger mask = BigInteger.valueOf(2L).pow(this.bitLength).subtract(BigInteger.ONE);

    BigInteger acc = BigInteger.valueOf(((long) source.charAt(0)) << 7);
    for (char c : source.toCharArray()) {
        acc = acc.multiply(multiplier).xor(BigInteger.valueOf((long) c)).and(mask);
    }

    // The masked accumulator and the length are both non-negative, so the result can
    // never be -1 and no remapping of that value is required.
    return acc.xor(BigInteger.valueOf(source.length()));
}
/**
 * Computes the Hamming distance between two fingerprints, i.e. the number of bit
 * positions in which they differ.
 *
 * @param bigInteger1 the first fingerprint
 * @param bigInteger2 the second fingerprint
 * @return the number of set bits in {@code bigInteger1 XOR bigInteger2}
 * @throws IllegalArgumentException if exactly one of the arguments is negative; the
 *         XOR is then negative and has infinitely many set bits in two's complement,
 *         so no finite distance exists
 */
public int hammingDistance(BigInteger bigInteger1, BigInteger bigInteger2) {
    BigInteger diff = bigInteger1.xor(bigInteger2);
    if (diff.signum() < 0) {
        // Clearing the lowest set bit of a negative BigInteger never reaches zero,
        // so fail fast instead of looping forever.
        throw new IllegalArgumentException(
                "Hamming distance is undefined for fingerprints of opposite sign: "
                        + bigInteger1 + ", " + bigInteger2);
    }
    // For a non-negative value bitCount() is exactly the number of one bits.
    return diff.bitCount();
}
/**
 * Computes the Hamming distance between two strings by comparing them character by
 * character (intended for binary fingerprint strings).
 *
 * @param str1 the first string
 * @param str2 the second string
 * @return the number of positions at which the characters differ, or {@code -1} when
 *         the two strings do not have the same length
 */
public int hammingDistance(String str1, String str2) {
    final int length = str1.length();
    if (length != str2.length()) {
        // Strings of different length are not comparable position by position.
        return -1;
    }

    int mismatches = 0;
    int pos = 0;
    while (pos < length) {
        mismatches += (str1.charAt(pos) == str2.charAt(pos)) ? 0 : 1;
        pos++;
    }
    return mismatches;
}
/**
 * Converts a binary simHash string into its integer fingerprint.
 *
 * <p>The character at index {@code i} maps to bit {@code i} of the result, so the string
 * is read least-significant bit first. Only the first {@code bitLength} characters are
 * examined, and any character other than {@code '1'} is treated as a zero bit.
 *
 * @param strSimHash the binary simHash string, at least {@code bitLength} characters long
 * @return the non-negative fingerprint with bit {@code i} set wherever character {@code i}
 *         is {@code '1'}
 * @throws ArrayIndexOutOfBoundsException if {@code strSimHash} is shorter than
 *         {@code bitLength}
 */
public BigInteger getIntSimHash(String strSimHash) {
    char[] bits = strSimHash.toCharArray();
    BigInteger fingerprint = BigInteger.ZERO;
    for (int i = 0; i < this.bitLength; i++) {
        if (bits[i] == '1') {
            // Each index is visited once, so setting bit i equals adding 2^i.
            fingerprint = fingerprint.setBit(i);
        }
    }
    return fingerprint;
}
}