/*
Some useful hash function.
It's not a particularly good hash function (<< 5 would be better than << 4),
but people believe in it because it comes from Dragon book.
*/
unsigned int
hashpjw ( const unsigned char * x, unsigned int len) // From Dragon book, p436
{
unsigned int h = 0 ;
unsigned int g;
for (; len > 0 ; len -- )
{
h = (h << 4 ) + * x ++ ;
if ((g = h & 0xf0000000 ) != 0 )
h = (h ^ (g >> 24 )) ^ g;
}
return h;
}
水煮coreutils 之三 完美hash
什么叫做完美的hash?
先可以参考下这篇文章
http://www.ibm.com/developerworks/cn/linux/l-gperf.html?ca=drs-cn
下面是我的理解,一次性计算出输入字符串的位置,没有重复值.复杂的说就是够造一个有限自动机了.
gperf 可以帮助我们做这件事情,第一是用这个比较简单,第二呢就是这个比较酷!
程序员总喜欢弄些奇奇怪怪的东西在代码里显示自己的卓尔不凡,虽然这个被现代软件工程所鄙视,但是自由软件要的就是
自由!在什么地方使用完美hash?命令行解析,前面的链接已经详细说明了,个人感觉在速度要求高,输入集合确定,查询效率要求高的地方使用完美hash比较合适,毕竟天下没有白吃的午餐,总是空间换速度!
用文字表达思想总是那么的不擅长,还是写个例子先,一切尽在不言中啊!
/* C code that goes verbatim in output */
#include < stdio.h >
#include < stdlib.h >
#include < string .h >
% }
struct tl{ const char * name ; const char * s2;};
%%
881050971 , " aaaaa "
881123701 , " bbbbb "
881122057 , " ccccc "
881044216 , " ddddd "
881052006 , " eeeee "
881160639 , " fffff "
881120640 , " ggggg "
%%
int main( int argc, char ** argv)
{
char str[ 32 ];
const struct tl * str2;
while (scanf( " %s " ,str))
{
str2 = in_word_set(str,strlen(str));
printf( " %s " ,str2 -> s2);
}
return 0 ;
}
运行命令:
那来see see tl.c
/* Command-line: gperf -t -L C tlh.gperf */
/* Computed positions: -k'4,8' */
#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35)
&& ( ' % ' == 37 ) && ( ' & ' == 38 ) && ( ' ' ' == 39 ) && ( ' ( ' == 40 )
&& ( ' ) ' == 41 ) && ( ' * ' == 42 ) && ( ' + ' == 43 ) && ( ' , ' == 44 )
&& ( ' - ' == 45 ) && ( ' . ' == 46 ) && ( ' / ' == 47 ) && ( ' 0 ' == 48 )
&& ( ' 1 ' == 49 ) && ( ' 2 ' == 50 ) && ( ' 3 ' == 51 ) && ( ' 4 ' == 52 )
&& ( ' 5 ' == 53 ) && ( ' 6 ' == 54 ) && ( ' 7 ' == 55 ) && ( ' 8 ' == 56 )
&& ( ' 9 ' == 57 ) && ( ' : ' == 58 ) && ( ' ; ' == 59 ) && ( ' < ' == 60 )
&& ( ' = ' == 61 ) && ( ' > ' == 62 ) && ( ' ? ' == 63 ) && ( ' A ' == 65 )
&& ( ' B ' == 66 ) && ( ' C ' == 67 ) && ( ' D ' == 68 ) && ( ' E ' == 69 )
&& ( ' F ' == 70 ) && ( ' G ' == 71 ) && ( ' H ' == 72 ) && ( ' I ' == 73 )
&& ( ' J ' == 74 ) && ( ' K ' == 75 ) && ( ' L ' == 76 ) && ( ' M ' == 77 )
&& ( ' N ' == 78 ) && ( ' O ' == 79 ) && ( ' P ' == 80 ) && ( ' Q ' == 81 )
&& ( ' R ' == 82 ) && ( ' S ' == 83 ) && ( ' T ' == 84 ) && ( ' U ' == 85 )
&& ( ' V ' == 86 ) && ( ' W ' == 87 ) && ( ' X ' == 88 ) && ( ' Y ' == 89 )
&& ( ' Z ' == 90 ) && ( ' [ ' == 91 ) && ( ' / ' == 92 ) && ( ' ] ' == 93 )
&& ( ' ^ ' == 94 ) && ( ' _ ' == 95 ) && ( ' a ' == 97 ) && ( ' b ' == 98 )
&& ( ' c ' == 99 ) && ( ' d ' == 100 ) && ( ' e ' == 101 ) && ( ' f ' == 102 )
&& ( ' g ' == 103 ) && ( ' h ' == 104 ) && ( ' i ' == 105 ) && ( ' j ' == 106 )
&& ( ' k ' == 107 ) && ( ' l ' == 108 ) && ( ' m ' == 109 ) && ( ' n ' == 110 )
&& ( ' o ' == 111 ) && ( ' p ' == 112 ) && ( ' q ' == 113 ) && ( ' r ' == 114 )
&& ( ' s ' == 115 ) && ( ' t ' == 116 ) && ( ' u ' == 117 ) && ( ' v ' == 118 )
&& ( ' w ' == 119 ) && ( ' x ' == 120 ) && ( ' y ' == 121 ) && ( ' z ' == 122 )
&& ( ' { ' == 123 ) && ( ' | ' == 124 ) && ( ' } ' == 125 ) && ( ' ~ ' == 126 ))
/* The character set is not based on ISO-646. */
error " gperf generated tables don't work with this execution character set. Please report a bug to <bug-gnu-gperf@gnu.org>. "
#endif
#line 1 "tlh.gperf"
/* C code that goes verbatim in output */
#include < stdio.h >
#include < stdlib.h >
#include < string .h >
#line 8 "tlh.gperf"
struct tl{ const char * name ; const char * s2;};
#define TOTAL_KEYWORDS 7
#define MIN_WORD_LENGTH 9
#define MAX_WORD_LENGTH 9
#define MIN_HASH_VALUE 9
#define MAX_HASH_VALUE 21
/* maximum key range = 13, duplicates = 0 */
#ifdef __GNUC__
__inline
#else
#ifdef __cplusplus
inline
#endif
#endif
static unsigned int
hash (str, len)
register const char * str;
register unsigned int len;
{
static unsigned char asso_values[] =
{
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 0 , 5 ,
2 , 22 , 4 , 7 , 2 , 22 , 0 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 , 22 ,
22 , 22 , 22 , 22 , 22 , 22 , 22
};
return len + asso_values[(unsigned char )str[ 7 ] + 1 ] + asso_values[(unsigned char )str[ 3 ]];
}
#ifdef __GNUC__
__inline
#ifdef __GNUC_STDC_INLINE__
__attribute__ ((__gnu_inline__))
#endif
#endif
struct tl *
in_word_set (str, len)
register const char * str;
register unsigned int len;
{
static struct tl wordlist[] =
{
{ "" }, { "" }, { "" }, { "" }, { "" }, { "" }, { "" }, { "" }, { "" },
#line 10 "tlh.gperf"
{ " 881050971 " , " aaaaa " },
{ "" },
#line 13 "tlh.gperf"
{ " 881044216 " , " ddddd " },
{ "" }, { "" },
#line 14 "tlh.gperf"
{ " 881052006 " , " eeeee " },
{ "" },
#line 12 "tlh.gperf"
{ " 881122057 " , " ccccc " },
{ "" },
#line 15 "tlh.gperf"
{ " 881160639 " , " fffff " },
#line 11 "tlh.gperf"
{ " 881123701 " , " bbbbb " },
{ "" },
#line 16 "tlh.gperf"
{ " 881120640 " , " ggggg " }
};
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
{
register int key = hash (str, len);
if (key <= MAX_HASH_VALUE && key >= 0 )
{
register const char * s = wordlist[key].name;
if ( * str == * s && ! strcmp (str + 1 , s + 1 ))
return & wordlist[key];
}
}
return 0 ;
}
#line 17 "tlh.gperf"
int main( int argc, char ** argv)
{
char str[ 32 ];
const struct tl * str2;
while (scanf( " %s " ,str))
{
str2 = in_word_set(str,strlen(str));
printf( " %s " ,str2 -> s2);
}
return 0 ;
}
生成代码没有太多出彩的地方,主要就是构建了asso_values 这个静态结构,以什么原则生成这个结构就要看gperf的代码了.
我们注意到,gperf 自动生成了两个函数hash 和 in_word_set ,hash 函数返回一个int值 对应每个key 返回值唯一,
in_word_set 返回key 对应的结构指针.
看gperf的代码的时候注意到它使用的一个hash函数
/*
Some useful hash function.
It's not a particularly good hash function (<< 5 would be better than << 4),
but people believe in it because it comes from Dragon book.
*/
unsigned int
hashpjw ( const unsigned char * x, unsigned int len) // From Dragon book, p436
{
unsigned int h = 0 ;
unsigned int g;
for (; len > 0 ; len -- )
{
h = (h << 4 ) + * x ++ ;
if ((g = h & 0xf0000000 ) != 0 )
h = (h ^ (g >> 24 )) ^ g;
}
return h;
}
有意思的是函数头的注释,不知道 Dragon book 是什么,在google上也没有看出什么来好像很有名的样子.
不行!以后要多写字 少贴些代码.