水煮coreutils 之二 hash
在查找效率要求高的系统中,hash是一个永远不变的话题,查找时间为O(1),把大家的眼光深深的锁定在上面再也走不开.天下没有
白吃的午餐,hash表也是视内存如粪土的东东,浪费内存起来也是大把大把的.空间换时间的思想在这里体现得淋漓尽致.
第一次正式的接触这类东东还是在看偶们公司的代码的时候,以前做这类事情都是用Map,但是Map的效率貌似O(log2 n),而且在耗
内存方面也毫不逊色.所以觉得这个东东还是有点先进的.
等到我看到了hash在coreutils中的实现,才觉得以前的认识是多么肤浅,真是五岳归来不看山了.
先来个hash函数开胃:
#define SIZE_BITS (sizeof (size_t) * CHAR_BIT)
/* Hash the NUL-terminated string X with Bruno Haible's rotate-and-add
   method (see http://www.haible.de/bruno/hashfunc.html), reducing the
   result modulo TABLESIZE so it can index a bucket array.  */
size_t
hash_pjw (const void *x, size_t tablesize)
{
  size_t h = 0;
  const char *p;

  /* Rotate the accumulator left by 9 bits, then add the next byte.  */
  for (p = x; *p != '\0'; p++)
    h = *p + ((h << 9) | (h >> (SIZE_BITS - 9)));

  return h % tablesize;
}
性能比较可以查看上面的网站,网上也有一个很好的hash函数效率比较的例子
http://hi.baidu.com/heiyeshuwu/blog/item/e0a49052015ab40d0cf3e3b4.html
关键是实现:
hash 函数虽然是核心,但是核心总是寥寥几行代码,根据不同的应用情况设计hash函数,有时候比这种传统的hash函数更加实用.
我们主要关注的是hash的实现.
我们来看一下关键部分,对于这类代码insert 和 delete操作才是精华部分,其它代码就没有解释的必要了.
/* If ENTRY matches an entry already in the table, return a pointer
to the entry from the table. Otherwise, insert ENTRY and return ENTRY.
Return NULL if the storage required for insertion cannot be allocated. */
void *
hash_insert (Hash_table * table, const void * entry)
{
void * data;
struct hash_entry * bucket;
/* The caller cannot insert a NULL entry. */
if ( ! entry)
abort ();
/* If there's a matching entry already in the table, return that. */
/* If the key hashes to a bucket that already holds a matching entry,
   hash_find_entry returns the stored user data and we return it here
   instead of inserting a duplicate.
   From hash_find_entry's own comment:
   This private function is used to help with insertion and deletion. When
   ENTRY matches an entry in the table, return a pointer to the corresponding
   user data and set *BUCKET_HEAD to the head of the selected bucket.
   Otherwise, return NULL. When DELETE is true and ENTRY matches an entry in
   the table, unlink the matching entry.
*/
if ((data = hash_find_entry (table, entry, & bucket, false )) != NULL)
return data;
/* ENTRY is not matched, it should be inserted. */
if (bucket -> data)
{
struct hash_entry * new_entry = allocate_entry (table);
if (new_entry == NULL)
return NULL;
/* Add ENTRY in the overflow of the bucket. */
/* Bucket head is occupied: push a new node onto the bucket's
   overflow chain.  */
new_entry -> data = ( void * ) entry;
new_entry -> next = bucket -> next;
bucket -> next = new_entry;
table -> n_entries ++ ;
return ( void * ) entry;
}
/* Add ENTRY right in the bucket head. */
/* Empty bucket: store the data directly in the bucket head.  */
bucket -> data = ( void * ) entry;
table -> n_entries ++ ;
table -> n_buckets_used ++ ;
/* If the growth threshold of the buckets in use has been reached, increase
the table size and rehash. There's no point in checking the number of
entries: if the hashing function is ill-conditioned, rehashing is not
likely to improve it. */
if (table -> n_buckets_used
> table -> tuning -> growth_threshold * table -> n_buckets)
{
/* Check more fully, before starting real work. If tuning arguments
became invalid, the second check will rely on proper defaults. */
check_tuning (table);
if (table -> n_buckets_used
> table -> tuning -> growth_threshold * table -> n_buckets)
{
const Hash_tuning * tuning = table -> tuning;
/* Compute the candidate size for the enlarged bucket array.  */
float candidate =
(tuning -> is_n_buckets
? (table -> n_buckets * tuning -> growth_factor)
: (table -> n_buckets * tuning -> growth_factor
* tuning -> growth_threshold));
/* SIZE_MAX is ((size_t) -1); give up if the candidate size does
   not fit in a size_t.  */
if (SIZE_MAX <= candidate)
return NULL;
/* If the rehash fails, arrange to return NULL. */
/* Rebuild the table at the new size.  */
if ( ! hash_rehash (table, candidate))
entry = NULL;
}
}
return ( void * ) entry;
}
代码本身的注释就很丰富,不用太多说明.
/* If ENTRY is already in the table, remove it and return the just-removed
data (the user may want to deallocate its storage). If ENTRY is not in the
table, don't modify the table and return NULL. */
/* Remove ENTRY from TABLE and return the removed user data, or NULL
   if ENTRY is not present.  */
void *
hash_delete (Hash_table * table, const void * entry)
{
void * data;
struct hash_entry * bucket;
/* The unlinking itself is done inside hash_find_entry (DELETE == true);
   everything below only shrinks the table when it becomes too sparse.  */
data = hash_find_entry (table, entry, & bucket, true );
if ( ! data)
return NULL;
table -> n_entries -- ;
if ( ! bucket -> data)
{
table -> n_buckets_used -- ;
/* If the shrink threshold of the buckets in use has been reached,
rehash into a smaller table. */
if (table -> n_buckets_used
< table -> tuning -> shrink_threshold * table -> n_buckets)
{
/* Check more fully, before starting real work. If tuning arguments
became invalid, the second check will rely on proper defaults. */
check_tuning (table);
if (table -> n_buckets_used
< table -> tuning -> shrink_threshold * table -> n_buckets)
{
const Hash_tuning * tuning = table -> tuning;
size_t candidate =
(tuning -> is_n_buckets
? table -> n_buckets * tuning -> shrink_factor
: (table -> n_buckets * tuning -> shrink_factor
* tuning -> growth_threshold));
hash_rehash (table, candidate);
}
}
}
return data;
}
我们注意到两个函数都调用了hash_find_entry
/* Look up ENTRY in TABLE and set *BUCKET_HEAD to the head of the bucket
   ENTRY hashes into.  Return the matching entry's user data, or NULL if
   there is no match.  When DELETE is true and a match is found, also
   unlink the matching entry from the bucket.
   NOTE(review): the return type (void *) appears to have been lost when
   this snippet was pasted — confirm against the original hash.c.  */
hash_find_entry (Hash_table * table, const void * entry,
struct hash_entry ** bucket_head, bool delete)
{
/* Compute which bucket ENTRY hashes into.  */
struct hash_entry * bucket
= table -> bucket + table -> hasher (entry, table -> n_buckets);
struct hash_entry * cursor;
if ( ! (bucket < table -> bucket_limit))
abort ();
* bucket_head = bucket;
/* Test for empty bucket. */
if (bucket -> data == NULL)
return NULL;
/* See if the entry is the first in the bucket. */
if (( * table -> comparator) (entry, bucket -> data))
{
void * data = bucket -> data;
if (delete)
{
if (bucket -> next)
{
struct hash_entry * next = bucket -> next;
/* Bump the first overflow entry into the bucket head, then save
the previous first overflow entry for later recycling. */
* bucket = * next;
/* free_entry merely pushes NEXT onto table->free_entry_list,
   recycling the node instead of returning it to malloc.  */
free_entry (table, next);
}
else
{
bucket -> data = NULL;
}
}
return data;
}
/* Scan the bucket overflow. */
for (cursor = bucket; cursor -> next; cursor = cursor -> next)
{
if (( * table -> comparator) (entry, cursor -> next -> data))
{
void * data = cursor -> next -> data;
if (delete)
{
struct hash_entry * next = cursor -> next;
/* Unlink the entry to delete, then save the freed entry for later
recycling. */
cursor -> next = next -> next;
free_entry (table, next);
}
return data;
}
}
/* No entry found. */
return NULL;
}
对于效率要求更高的的程序来说,收缩队列和扩充队列其实都可以不需要,不过这个就要自己去打造了,看这个代码中流露出一种
思想,函数调用的消耗远小于系统调用的消耗,内存分配,文件打开,读写都是相当耗资源的,反过来边界检查,指针变换,内存拷贝的消耗可以
忽略不计.
附录昨天我写的一个hash函数应用的例子:
#include " ../lib/hash.h "
#include " ../lib/hash-pjw.h "
#include < stdio.h >
#include < stdlib.h >
#include < string .h >
/* One record of the input file: S1 holds the lookup key (the text before
   the first space on the line), S2 holds the associated value (the rest
   of the line).  */
struct tl{
char s1[ 256 ];
char s2[ 256 ];
};
static bool hash_str_cmp( void * s1, void * s2)
{
return strcmp((( struct tl * )s1) -> s1,(( struct tl * )s2) -> s1) == 0 ;
}
static size_t hash_fun( const void * s,size_t t)
{
return hash_pjw((( struct tl * )s) -> s1,t);
}
/* Parse a line of the form "KEY VALUE..." into a freshly allocated
   struct tl: the text before the first space goes into s1, the text
   after the run of separator spaces goes into s2 (trailing newline
   from fgets is kept, as before).  The caller owns the result and
   must free () it.  Returns NULL on allocation failure or when the
   line contains no space separator — the original version scanned
   past the terminating NUL in that case (buffer overrun), and could
   also overflow the fixed 256-byte fields with long input.  */
struct tl *make_tl (char *s)
{
    struct tl *tmp = malloc (sizeof *tmp);
    if (tmp == NULL)
        return NULL;
    memset (tmp, 0, sizeof *tmp);

    /* Find the first space, stopping at the terminator instead of
       running off the end of the string.  */
    char *p = s;
    while (*p != ' ' && *p != '\0')
        p++;
    if (*p == '\0')
    {
        free (tmp);
        return NULL;
    }

    /* Copy the key, truncating so it always fits (and stays
       NUL-terminated thanks to the memset above).  */
    size_t keylen = (size_t) (p - s);
    if (keylen >= sizeof tmp->s1)
        keylen = sizeof tmp->s1 - 1;
    memcpy (tmp->s1, s, keylen);

    /* Skip the run of separator spaces.  */
    while (*p == ' ')
        p++;

    /* Bounded copy of the value instead of an unbounded strcpy.  */
    snprintf (tmp->s2, sizeof tmp->s2, "%s", p);
    return tmp;
}
/* Build a hash table keyed on the first field of each line of "tl.log",
   look up the key "888046", print its value, then dump table statistics.
   Unlike the original, every fallible call is checked: the original
   dereferenced a NULL FILE* if tl.log was missing and a NULL lookup
   result if the key was absent.  */
int main (int argc, char **argv)
{
    (void) argc;
    (void) argv;

    /* 25026 buckets, default tuning; `free' is the data_freer so
       entries are released by hash_free/hash_clear.  */
    Hash_table *myhash = hash_initialize (25026, NULL, hash_fun,
                                          hash_str_cmp, free);
    if (myhash == NULL)
    {
        fprintf (stderr, "hash_initialize failed\n");
        return 1;
    }

    FILE *fp = fopen ("tl.log", "r");
    if (fp == NULL)
    {
        perror ("tl.log");
        return 1;
    }

    char str[256];
    while (fgets (str, sizeof str, fp) != NULL)
    {
        struct tl *ptl = make_tl (str);
        /* Skip lines that could not be parsed or allocated.  */
        if (ptl == NULL)
            continue;
        if (hash_insert (myhash, ptl) == NULL)
        {
            /* Table could not grow; stop instead of leaking.  */
            free (ptl);
            fprintf (stderr, "hash_insert failed\n");
            fclose (fp);
            return 1;
        }
    }
    fclose (fp);

    printf ("-----------------------------------------------------\n");

    struct tl dd;
    strcpy (dd.s1, "888046");
    struct tl *s = hash_lookup (myhash, &dd);
    if (s != NULL)
        printf ("%s the value %s\n", dd.s1, s->s2);
    else
        printf ("%s not found\n", dd.s1);

    hash_print_statistics (myhash, stdout);
    return 0;
}
提取的文件
888022 881050971
888022 881123701
888026 881122057
888026 881044216
888028 881052006
888028 881160639
888029 881120640
888029 881034717
888031 881161788
888031 881044952
888033 881120535
888033 881024131