水煮coreutils 之二 hash

最新推荐文章于 2022-08-14 11:56:24 发布

tianjieshui

最新推荐文章于 2022-08-14 11:56:24 发布

阅读量763

点赞数

文章标签： table struct null delete insert statistics

本文链接：https://blog.csdn.net/tianjieshui/article/details/1932603

版权

水煮coreutils 之二 hash
在查找效率要求高的系统中,hash是一个永远不变的话题,查找时间为O(1),把大家的眼光深深的锁定在上面再也走不开.天下没有
白吃的午餐,hash表也是视内存入粪土的东东,浪费内存起来也是大把大把的.空间换时间的思想在这里体现的淋漓尽致.
第一次正式的接触这类东东还是在看偶们公司的代码的时候,以前做这类事情都是用Map,但是Map的效率貌似O(log2 n),而且在耗
内存方面也毫不逊色.所以觉得这个东东还是有点先进的.
    等到我看到了hash在coreutils的实现,以前的认识觉得是多么肤浅,真是五岳归来不看山了.
    先来个hash函数开胃:

  #include < limits.h >
      #define SIZE_BITS (sizeof (size_t) * CHAR_BIT)

/* A hash function for NUL-terminated char* strings using
   the method described by Bruno Haible.
   See http://www.haible.de/bruno/hashfunc.html.    */

size_t
hash_pjw ( const void * x, size_t tablesize)
{
   const char * s;
  size_t h = 0 ;

   for (s = x; * s; s ++ )
    h = * s + ((h << 9 ) | (h >> (SIZE_BITS - 9 )));

   return h % tablesize;
}

性能比较可以查看上面的网站,网上也有一个很好的hash函数效率比较的例子
http://hi.baidu.com/heiyeshuwu/blog/item/e0a49052015ab40d0cf3e3b4.html
关键是实现:
hash 函数虽然是核心,但是核心总是寥寥几行代码,根据不同的应用情况设计hash函数,有时候比这种传统的hash函数更加实用.
我们主要关注的是hash的实现.
我们来看一下关键部分,对于这类代码insert 和 delete操作才是精华部分,其它代码就没有解释的必要了.

/* If ENTRY matches an entry already in the hash table, return the pointer
   to the entry from the table.  Otherwise, insert ENTRY and return ENTRY.
   Return NULL if the storage required for insertion cannot be allocated.   */

void *
hash_insert (Hash_table * table, const void * entry)
{
   void * data;
   struct hash_entry * bucket;

   /* The caller cannot insert a NULL entry.   */
   if ( ! entry)
    abort ();

   /* If there's a matching entry already in the table, return that.   */
   /* 如果对应的key hash到的值有重复  bucket 会有值的仍然return NULL
  参考hash_find_entry的注释:
  This private function is used to help with insertion and deletion.  When
   ENTRY matches an entry in the table, return a pointer to the corresponding
   user data and set *BUCKET_HEAD to the head of the selected bucket.
   Otherwise, return NULL.  When DELETE is true and ENTRY matches an entry in
   the table, unlink the matching entry.
      */
   if ((data = hash_find_entry (table, entry, & bucket, false )) != NULL)
     return data;

   /* ENTRY is not matched, it should be inserted.   */

   if (bucket -> data)
    {
       struct hash_entry * new_entry = allocate_entry (table);

       if (new_entry == NULL)
return NULL;

       /* Add ENTRY in the overflow of the bucket.   */
       // 链表上加一个节点
      new_entry -> data = ( void * ) entry;
      new_entry -> next = bucket -> next;
      bucket -> next = new_entry;
      table -> n_entries ++ ;
       return ( void * ) entry;
    }

   /* Add ENTRY right in the bucket head.   */
    // 直接在bucket上加入数据
  bucket -> data = ( void * ) entry;
  table -> n_entries ++ ;
  table -> n_buckets_used ++ ;

   /* If the growth threshold of the buckets in use has been reached, increase
     the table size and rehash.  There's no point in checking the number of
     entries:  if the hashing function is ill-conditioned, rehashing is not
     likely to improve it.   */

   if (table -> n_buckets_used
       > table -> tuning -> growth_threshold * table -> n_buckets)
    {
       /* Check more fully, before starting real work.  If tuning arguments
  became invalid, the second check will rely on proper defaults.   */
      check_tuning (table);
       if (table -> n_buckets_used
    > table -> tuning -> growth_threshold * table -> n_buckets)
{
    const Hash_tuning * tuning = table -> tuning;
    // 计算数组大小
    float candidate =
     (tuning -> is_n_buckets
       ? (table -> n_buckets * tuning -> growth_factor)
      : (table -> n_buckets * tuning -> growth_factor
   * tuning -> growth_threshold));
// # define SIZE_MAX ((size_t) -1) 如果candidate 表达的数字大于size_t就退出
    if (SIZE_MAX <= candidate)
      return NULL;

    /* If the rehash fails, arrange to return NULL.   */
    // 重新建立hash队列
    if ( ! hash_rehash (table, candidate))
     entry = NULL;
}
    }

   return ( void * ) entry;
}

代码本身的注释就很丰富,不用太多说明.

/* If ENTRY is already in the table, remove it and return the just-deleted
   data (the user may want to deallocate its storage).  If ENTRY is not in the
   table, don't modify the table and return NULL.   */

void *
hash_delete (Hash_table * table, const void * entry)
{
   void * data;
   struct hash_entry * bucket;
// 删除操作在这里就完成了下面的是进行收缩hash_table的操作
  data = hash_find_entry (table, entry, & bucket, true );
   if ( ! data)
     return NULL;

  table -> n_entries -- ;
   if ( ! bucket -> data)
    {
      table -> n_buckets_used -- ;

       /* If the shrink threshold of the buckets in use has been reached,
  rehash into a smaller table.   */

       if (table -> n_buckets_used
    < table -> tuning -> shrink_threshold * table -> n_buckets)
{
    /* Check more fully, before starting real work.  If tuning arguments
      became invalid, the second check will rely on proper defaults.   */
   check_tuning (table);
    if (table -> n_buckets_used
        < table -> tuning -> shrink_threshold * table -> n_buckets)
     {
        const Hash_tuning * tuning = table -> tuning;
       size_t candidate =
  (tuning -> is_n_buckets
    ? table -> n_buckets * tuning -> shrink_factor
   : (table -> n_buckets * tuning -> shrink_factor
       * tuning -> growth_threshold));

       hash_rehash (table, candidate);
     }
}
    }

   return data;
}

我们注意到两个函数都调用了hash_find_entry

static void *
hash_find_entry (Hash_table * table, const void * entry,
    struct hash_entry ** bucket_head, bool delete)
{
// 计算出entry 在hash_table 中的位置
   struct hash_entry * bucket
     = table -> bucket + table -> hasher (entry, table -> n_buckets);
   struct hash_entry * cursor;

   if ( ! (bucket < table -> bucket_limit))
    abort ();

   * bucket_head = bucket;

   /* Test for empty bucket.   */
   if (bucket -> data == NULL)
     return NULL;

   /* See if the entry is the first in the bucket.   */
   if (( * table -> comparator) (entry, bucket -> data))
    {
       void * data = bucket -> data;

       if (delete)
{
    if (bucket -> next)
     {
        struct hash_entry * next = bucket -> next;

        /* Bump the first overflow entry into the bucket head, then save
   the previous first overflow entry for later recycling.   */
        * bucket = * next;
     // free_entry 只是把值插入到table->free_entry_list 这里有效利用了内存
       free_entry (table, next);
     }
    else
     {
       bucket -> data = NULL;
     }
}

       return data;
    }

   /* Scan the bucket overflow.   */
   for (cursor = bucket; cursor -> next; cursor = cursor -> next)
    {
       if (( * table -> comparator) (entry, cursor -> next -> data))
{
    void * data = cursor -> next -> data;

    if (delete)
     {
        struct hash_entry * next = cursor -> next;

        /* Unlink the entry to delete, then save the freed entry for later
   recycling.   */
       cursor -> next = next -> next;
       free_entry (table, next);
     }

    return data;
}
    }

   /* No entry found.   */
   return NULL;
}

对于效率要求更高的的程序来说,收缩队列和扩充队列其实都可以不需要,不过这个就要自己去打造了,看这个代码中流露出一种
思想,函数调用的消耗远小于系统调用的消耗,内存分配,文件打开,读写都是相当耗资源的,反过来边界检查,指针变换,内存拷贝的消耗可以
忽略不计.

附录昨天我写的一个hash函数应用的例子:

#define USE_DIFF_HASH 1
#include " ../lib/hash.h "
#include " ../lib/hash-pjw.h "
#include < stdio.h >
#include < stdlib.h >
#include < string .h >

struct tl{
char s1[ 256 ];
char s2[ 256 ];
};

static bool hash_str_cmp( void * s1, void * s2)
{
    return strcmp((( struct tl * )s1) -> s1,(( struct tl * )s2) -> s1) == 0 ;
}

static size_t hash_fun( const void * s,size_t t)
{
        return hash_pjw((( struct   tl * )s) -> s1,t);
}

struct tl * make_tl( char * s)
{
   struct tl * tmp = ( struct tl * )malloc( sizeof ( struct tl)) ;
  memset(tmp, 0 , sizeof ( struct tl)) ;
   char * p = s ;
   int n = 0 ;
   while ( * p != ' ' )
{
        p ++ ;
}
        memcpy(tmp -> s1,s,p - s);

         while ( * p == ' ' )
         {
                p ++ ;
        }
        strcpy(tmp -> s2,p);
         return tmp ;
}
int main( int argc , char ** argv)
{

Hash_table * myhash = hash_initialize ( 25026 ,NULL,hash_fun,hash_str_cmp,free);
char str[ 256 ];
struct tl * ptl = NULL ;
FILE * fp = fopen( " tl.log " , " r " );
while (NULL != fgets(str, 256 ,fp))
{
        ptl = make_tl(str);
        hash_insert(myhash,ptl);
//       printf("%s 44444 %s ",ptl->s1,ptl->s2);
}
fclose(fp);
printf( " ----------------------------------------------------- " );
struct tl dd ;
struct tl * s;
strcpy(dd.s1, " 888046 " );
s = hash_lookup (myhash, & dd);
printf( " %s the value %s " ,dd.s1,s -> s2);
hash_print_statistics(myhash,stdout);
return 0 ;
}

提取的文件

root # [ / root / coreutils- 5.2 . 1 / tltest] more tl . log

888022 881050971

888022 881123701

888026 881122057

888026 881044216

888028 881052006

888028 881160639

888029 881120640

888029 881034717

888031 881161788

888031 881044952

888033 881120535

888033 881024131

tianjieshui

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
水煮coreutils 之二 hash

水煮coreutils 之二 hash 在查找效率要求高的系统中,hash是一个永远不变的话题,查找时间为O(1),把大家的眼光深深的锁定在上面再也走不开.天下没有白吃的午餐,hash表也是视内存入粪土的东东,浪费内存起来也是大把大把的.空间换时间的思想在这里体现的淋漓尽致. 第一次正式的接触这类东东还是在看偶们公司的代码的时候,以前做这类事情都是用Map,但是Map的效率貌似O(log2 n),
复制链接

扫一扫