Clickhouse cityHash64 Java 实现

        cityHash是Google发布的字符串散列算法(http://www.cityhash.org.uk),包括cityHash64和cityHash128,分别计算字符串的64位/128位散列值,其性能较其它Hash函数略胜一筹。ClickHouse中提供了cityHash算法实现,典型应用是计算CheckSum。

        ClickHouse内置的是CityHash v1.0.2版本(contrib/cityhash102),其cityHash64算法与Google后续发布的cityHash64算法在具体实现上存在差异,因此两者计算出的散列值不一致。我根据ClickHouse/contrib/cityhash102/src/city.cc源代码用Java语言重新实现了ClickHouse的cityHash64算法,经过大量数据测试,其结果与ClickHouse的cityHash64()完全一致。

package com.jlicbc.downloadReport.util;

/**
 * Clickhouse cityHash64 Java实现
 * @author lvxueshi
 *
 */
public class CityHash64 {

    // Multiplicative constants, copied from the CityHash v1.0.2 reference (city.cc).
    private static final long K0 = 0xc3a5c85c97cb3127L;
    private static final long K1 = 0xb492b66fbe98f273L;
    private static final long K2 = 0x9ae16a3b2f90404fL;
    private static final long K3 = 0xc949d7c7509e6557L;

    // Multiplier used when folding a 128-bit value down to 64 bits.
    private static final long K_MUL = 0x9ddfea08eb382d69L;

    /**
     * Reads eight bytes starting at {@code pos} as a little-endian 64-bit value.
     *
     * @param s   source bytes
     * @param pos index of the least significant byte
     * @return the assembled value
     */
    private static long fetch64(byte[] s, int pos) {
        // Every byte is masked to 0..255 as a long before shifting so that
        // neither sign extension nor int overflow can leak into other lanes.
        return (s[pos] & 0xFFL)
                | ((s[pos + 1] & 0xFFL) << 8)
                | ((s[pos + 2] & 0xFFL) << 16)
                | ((s[pos + 3] & 0xFFL) << 24)
                | ((s[pos + 4] & 0xFFL) << 32)
                | ((s[pos + 5] & 0xFFL) << 40)
                | ((s[pos + 6] & 0xFFL) << 48)
                | ((s[pos + 7] & 0xFFL) << 56);
    }

    /**
     * Reads four bytes starting at {@code pos} as a little-endian unsigned
     * 32-bit value, widened to {@code long}.
     *
     * @param s   source bytes
     * @param pos index of the least significant byte
     * @return the assembled value, in the range 0 .. 2^32-1
     */
    private static long fetch32(byte[] s, int pos) {
        return (s[pos] & 0xFFL)
                | ((s[pos + 1] & 0xFFL) << 8)
                | ((s[pos + 2] & 0xFFL) << 16)
                | ((s[pos + 3] & 0xFFL) << 24);
    }

    /**
     * Rotates {@code val} right by {@code shift} bits.
     * A shift of zero returns {@code val} unchanged.
     */
    private static long rotate(long val, int shift) {
        return Long.rotateRight(val, shift);
    }

    /** Mixes the high bits of {@code val} into its low bits. */
    private static long shiftMix(long val) {
        return val ^ (val >>> 47);
    }

    /**
     * Folds the pair ({@code u}, {@code v}) into a single 64-bit hash.
     * {@code u} plays the role of the low word and {@code v} the high word,
     * so the second multiplication mixes {@code v}, not {@code u}.
     */
    private static long hashLen16(long u, long v) {
        long a = (u ^ v) * K_MUL;
        a ^= (a >>> 47);
        long b = (v ^ a) * K_MUL;
        b ^= (b >>> 47);
        return b * K_MUL;
    }

    /** Hashes {@code len} bytes at {@code pos} for {@code len <= 16}. */
    private static long hashLen0to16(byte[] s, int pos, int len) {
        if (len > 8) {
            long a = fetch64(s, pos);
            long b = fetch64(s, pos + len - 8);
            // len is 9..16 here, so the rotation amount is never zero.
            return hashLen16(a, rotate(b + len, len)) ^ b;
        }
        if (len >= 4) {
            long a = fetch32(s, pos);
            return hashLen16(len + (a << 3), fetch32(s, pos + len - 4));
        }
        if (len > 0) {
            // First, middle and last byte, each treated as unsigned.
            int first = s[pos] & 0xFF;
            int middle = s[pos + (len >>> 1)] & 0xFF;
            int last = s[pos + len - 1] & 0xFF;
            int y = first + (middle << 8);
            int z = len + (last << 2);
            return shiftMix(y * K2 ^ z * K3) * K2;
        }
        return K2;
    }

    /** Hashes {@code len} bytes at {@code pos} for {@code 17 <= len <= 32}. */
    private static long hashLen17to32(byte[] s, int pos, int len) {
        long a = fetch64(s, pos) * K1;
        long b = fetch64(s, pos + 8);
        long c = fetch64(s, pos + len - 8) * K2;
        long d = fetch64(s, pos + len - 16) * K0;
        return hashLen16(rotate(a - b, 43) + rotate(c, 30) + d,
                a + rotate(b ^ K3, 20) - c + len);
    }

    /** Hashes {@code len} bytes at {@code pos} for {@code 33 <= len <= 64}. */
    private static long hashLen33to64(byte[] s, int pos, int len) {
        // Index just past the last byte of the input; every tail read is
        // relative to this so that a non-zero pos is honoured.
        int end = pos + len;

        long z = fetch64(s, pos + 24);
        long a = fetch64(s, pos) + (len + fetch64(s, end - 16)) * K0;
        long b = rotate(a + z, 52);
        long c = rotate(a, 37);
        a += fetch64(s, pos + 8);
        c += rotate(a, 7);
        a += fetch64(s, pos + 16);
        long vf = a + z;
        long vs = b + rotate(a, 31) + c;

        a = fetch64(s, pos + 16) + fetch64(s, end - 32);
        z = fetch64(s, end - 8);
        b = rotate(a + z, 52);
        c = rotate(a, 37);
        a += fetch64(s, end - 24);
        c += rotate(a, 7);
        a += fetch64(s, end - 16);
        long wf = a + z;
        long ws = b + rotate(a, 31) + c;

        long r = shiftMix((vf + ws) * K2 + (wf + vs) * K0);
        return shiftMix(r * K0 + vs) * K2;
    }

    /**
     * Computes a 16-byte (two-word) hash from four input words and two seeds.
     *
     * @return a two-element array: {@code [first, second]}
     */
    private static long[] weakHashLen32WithSeeds(
            long w, long x, long y, long z,
            long a, long b) {
        a += w;
        b = rotate(b + a + z, 21);
        long c = a;
        a += x;
        a += y;
        b += rotate(a, 44);
        return new long[] {a + z, b + c};
    }

    /**
     * Computes a 16-byte hash for the 32 bytes {@code s[pos] .. s[pos + 31]}
     * combined with the seeds {@code a} and {@code b}.
     */
    private static long[] weakHashLen32WithSeeds(byte[] s, int pos, long a, long b) {
        return weakHashLen32WithSeeds(
                fetch64(s, pos),
                fetch64(s, pos + 8),
                fetch64(s, pos + 16),
                fetch64(s, pos + 24),
                a,
                b);
    }

    /**
     * Computes the 64-bit CityHash of {@code len} bytes of {@code s} starting
     * at {@code pos}, following ClickHouse's contrib/cityhash102/src/city.cc.
     *
     * <p>The result is the raw 64-bit pattern; Java prints it as a signed
     * value, so interpret it as unsigned when comparing with ClickHouse's
     * {@code UInt64} output.
     *
     * @param s   the bytes to hash
     * @param pos index of the first byte to hash
     * @param len number of bytes to hash; a length of zero yields a fixed constant
     * @return the 64-bit hash value
     * @throws ArrayIndexOutOfBoundsException if a read falls outside {@code s}
     */
    public static long cityHash64(byte[] s, int pos, int len) {
        if (len <= 16) {
            return hashLen0to16(s, pos, len);
        }
        if (len <= 32) {
            return hashLen17to32(s, pos, len);
        }
        if (len <= 64) {
            return hashLen33to64(s, pos, len);
        }

        // For inputs over 64 bytes the tail is hashed first; the loop then
        // carries 56 bytes of state: v, w, x, y and z.
        long x = fetch64(s, pos);
        long y = fetch64(s, pos + len - 16) ^ K1;
        long z = fetch64(s, pos + len - 56) ^ K0;
        long[] v = weakHashLen32WithSeeds(s, pos + len - 64, len, y);
        long[] w = weakHashLen32WithSeeds(s, pos + len - 32, len * K1, K0);
        z += shiftMix(v[1]) * K1;
        x = rotate(z + x, 39) * K1;
        y = rotate(y, 33) * K1;

        // Round (len - 1) down to a multiple of 64 and consume 64-byte chunks.
        // len > 64 here, so at least one chunk is always processed.
        int remaining = (len - 1) & ~63;
        int cursor = pos;
        do {
            x = rotate(x + y + v[0] + fetch64(s, cursor + 16), 37) * K1;
            y = rotate(y + v[1] + fetch64(s, cursor + 48), 42) * K1;
            x ^= w[1];
            y ^= v[0];
            z = rotate(z ^ w[0], 33);
            v = weakHashLen32WithSeeds(s, cursor, v[1] * K1, x + w[0]);
            w = weakHashLen32WithSeeds(s, cursor + 32, z + w[1], y);
            // Swap x and z.
            long swap = x;
            x = z;
            z = swap;
            cursor += 64;
            remaining -= 64;
        } while (remaining != 0);

        return hashLen16(hashLen16(v[0], w[0]) + shiftMix(y) * K1 + z,
                hashLen16(v[1], w[1]) + x);
    }
}

测试代码如下:

/**
 * Demo entry point: hashes the string "16" and prints the result as an
 * unsigned decimal number.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    // Encode explicitly as UTF-8 instead of relying on the platform default
    // charset, so the hashed bytes are the same on every JVM.
    byte[] input = "16".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    // Derive the length from the array rather than hard-coding it.
    long hash = CityHash64.cityHash64(input, 0, input.length);
    System.out.println("64cityHash:" + DataType.toUnsignedLong(hash));
}

其中DataType.toUnsignedLong()方法把Java的有符号long按无符号64位整数解释,并以BigDecimal形式返回(Java没有无符号的long类型),具体实现代码如下:

/**
 * Interprets the 64 bits of {@code value} as an unsigned integer.
 *
 * @param value a signed long carrying an unsigned 64-bit pattern
 * @return the corresponding non-negative value, in the range 0 .. 2^64-1
 */
public static BigDecimal toUnsignedLong(long value) {
    // Start from the low 63 bits; for non-negative input this is the value itself.
    BigDecimal unsigned = BigDecimal.valueOf(value & Long.MAX_VALUE);
    if (value < 0) {
        // The sign bit was set, which is worth 2^63 when read as unsigned.
        BigDecimal signBitWeight = BigDecimal.valueOf(Long.MAX_VALUE).add(BigDecimal.valueOf(1));
        unsigned = unsigned.add(signBitWeight);
    }
    return unsigned;
}

上述测试代码运行结果为:

64cityHash:696724486834661759

Clickhouse自带的CityHash64算法:

ubuntu :) SELECT cityHash64('16') AS CityHash, toTypeName(CityHash) AS type;

SELECT
    cityHash64('16') AS CityHash,
    toTypeName(CityHash) AS type

┌───────────CityHash─┬─type───┐
│ 696724486834661759 │ UInt64 │
└────────────────────┴────────┘

1 rows in set. Elapsed: 0.001 sec.


 



  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值