php, apache, perl, bsddb都使用time33哈希.
最简单的版本
uint32_t time33(char const *str, int len)
{
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = hash *33 + (unsigned long) str[i];
}
return hash;
}
这个版本最可以体现time33的算法思路,够简单。
把乘法操作换成位操作
unsigned long time33(char const *str, int len){
unsigned long hash = 0;
for (int i = 0; i < len; i++) {
hash = ((hash <<5) + hash) + (unsigned long) str[i];
}
return hash;
}
59个字符1000 0000次运行(gcc没有开启优化,因为开了优化后两个函数的实际代码会一样)
第一个:
real 0m4.389s
user 0m4.388s
sys 0m0.000s
第二个:
real 0m4.137s
user 0m4.120s
sys 0m0.000s
gcc –O2优化后是
real 0m1.367s
user 0m1.360s
sys 0m0.000s
php版本
inline unsigned time33(char const*str, int len)
{
unsigned long hash = 5381;
/* variant with the hash unrolled eight times */
for (; len >= 8; len -= 8) {
hash = ((hash << 5) + hash) + *str++;
hash = ((hash << 5) + hash) + *str++;
hash = ((hash << 5) + hash) + *str++;
hash = ((hash << 5) + hash) + *str++;
hash = ((hash << 5) + hash) + *str++;
hash = ((hash << 5) + hash) + *str++;
hash = ((hash << 5) + hash) + *str++;
hash = ((hash << 5) + hash) + *str++;
}
switch (len) {
case 7: hash = ((hash << 5) + hash) + *str++; /* fallthrough... */
case 6: hash = ((hash << 5) + hash) + *str++; /* fallthrough... */
case 5: hash = ((hash << 5) + hash) + *str++; /* fallthrough... */
case 4: hash = ((hash << 5) + hash) + *str++; /* fallthrough... */
case 3: hash = ((hash << 5) + hash) + *str++; /* fallthrough... */
case 2: hash = ((hash << 5) + hash) + *str++; /* fallthrough... */
case 1: hash = ((hash << 5) + hash) + *str++; break;
case 0: break;
}
return hash;
}
59个字符,1000 0000次
real 0m1.088s
user 0m1.068s
sys 0m0.000s
速度提升主要在循环展开, 对于短字符,这个是不明显的。
php版本的hash初始值是5381, 这个
Magic Constant 5381:
1. odd number
2. prime number
3. deficient number
4. 001/010/100/000/101 b
Apache版本
unsigned long time33(char const *str, int *len)
{
unsigned long hash = 0;const char *p=str;
if (*len<=0) {
for(p = str; *p; p++) {
hash = hash * 33 + *p;
}
*len = p - str;
}
else {
int i = *len;
for (p = str;i; i--, p++) {
hash = hash * 33 + *p;
}
}
return hash;
}
测试结果
real 0m1.418s
user 0m1.412s
sys 0m0.004s
综上,我的改进版本
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
//php版本
unsigned long time33(char const *str, int len=-1)
{
unsigned long hash = 5381;
/* variant with the hash unrolled eight times */
char const *p = str;
if (unlikely(len<0)) {
for(; *p; p++) {
hash = hash * 33 + *p;
}
return hash;
}#define TIME33_HASH_MIXED_CH() hash = ((hash<<5)+hash) + *p++
for (; len >= 8; len -= 8) {
TIME33_HASH_MIXED_CH(); //1
TIME33_HASH_MIXED_CH(); //2
TIME33_HASH_MIXED_CH(); //3
TIME33_HASH_MIXED_CH(); //4
TIME33_HASH_MIXED_CH(); //5
TIME33_HASH_MIXED_CH(); //6
TIME33_HASH_MIXED_CH(); //7
TIME33_HASH_MIXED_CH(); //8
}
switch (len) {
case 7: TIME33_HASH_MIXED_CH(); /* fallthrough... */
case 6: TIME33_HASH_MIXED_CH(); /* fallthrough... */
case 5: TIME33_HASH_MIXED_CH(); /* fallthrough... */
case 4: TIME33_HASH_MIXED_CH(); /* fallthrough... */
case 3: TIME33_HASH_MIXED_CH(); /* fallthrough... */
case 2: TIME33_HASH_MIXED_CH(); /* fallthrough... */
case 1: TIME33_HASH_MIXED_CH(); break;
case 0: break;
}
return hash;
}#undef TIME33_HASH_MIXED_CH
测试结果
real 0m1.072s
user 0m1.064s
sys 0m0.000s
测试过, 重复率在 1/2000。
为什么是33的倍数, PHP中注释是
- DJBX33A (Daniel J. Bernstein, Times 33 with Addition)
- This is Daniel J. Bernstein's popular `times 33' hash function as
- posted by him years ago on comp.lang.c. It basically uses a function
- like ``hash(i) = hash(i-1) * 33 + str[i]''. This is one of the best
- known hash functions for strings. Because it is both computed very
- fast and distributes very well.
- The magic of number 33, i.e. why it works better than many other
- constants, prime or not, has never been adequately explained by
- anyone. So I try an explanation: if one experimentally tests all
- multipliers between 1 and 256 (as RSE did now) one detects that even
- numbers are not useable at all. The remaining 128 odd numbers
- (except for the number 1) work more or less all equally well. They
- all distribute in an acceptable way and this way fill a hash table
- with an average percent of approx. 86%.
- If one compares the Chi^2 values of the variants, the number 33 not
- even has the best value. But the number 33 and a few other equally
- good numbers like 17, 31, 63, 127 and 129 have nevertheless a great
- advantage to the remaining numbers in the large set of possible
- multipliers: their multiply operation can be replaced by a faster
- operation based on just one shift plus either a single addition
- or subtraction operation. And because a hash function has to both
- distribute good _and_ has to be very fast to compute, those few
- numbers should be preferred and seems to be the reason why Daniel J.
- Bernstein also preferred it.
- -- Ralf S. Engelschall rse@engelschall.com
其它倍数
Ngix使用的是 time31
Tokyo Cabinet使用的是 time37
Bob在他的文章说,小写英文词汇适合33, 大小写混合使用65。time33比较适合的是英文词汇的hash.