/******************************************************************
* file: topk.c
* brief: Use a heap and a hash table to find the N most frequently
*        occurring values in a very large data set
* yejing@2015.3.1 1.0 created
******************************************************************/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Index arithmetic for a binary heap stored in an array with its root at
 * index 1: the children of slot i are 2i and 2i+1, its parent is i/2.
 * Every expansion is fully parenthesized so the macros can be used inside
 * larger expressions (e.g. right_child(i) * 2 or x % parent(i)).
 */
#define parent(i) ((i) / 2)
#define left_child(i) ((i) << 1)
#define right_child(i) (((i) << 1) + 1)

/* Number of buckets in the hash table (2^22). */
#define HASH_TBL_SIZE 4194304
/* Number of usable heap slots; presumably the "N" of top-N -- confirm. */
#define HEAP_SIZE 32

/* Key type stored in the hash table. */
typedef int hash_key_t;
/* Payload type stored in a heap entry. */
typedef int heap_data_t;
/*
 * One entry in a hash bucket's singly linked chain.
 * The self-reference must use the struct's own tag (_hash_node_t);
 * naming a different tag would declare an unrelated incomplete type and
 * make every chain link an incompatible pointer.
 */
typedef struct _hash_node_t{
    hash_key_t key;              /* value being counted */
    int num;                     /* number of times key has been inserted */
    struct _hash_node_t* next;   /* next node in the same bucket, or NULL */
}hash_node_t;

/* Bucket heads. Static storage duration, so every bucket starts as NULL. */
static hash_node_t* hash_tbl[HASH_TBL_SIZE];
/*
 * One heap entry: a payload value together with the count the heap is
 * ordered by.
 */
typedef struct _min_heap_t{
    heap_data_t data;   /* payload value */
    int num;            /* count associated with data; the heap ordering key */
}min_heap_t;

/*
 * Heap storage. Declared as an object (not a typedef name) because the rest
 * of the file indexes it directly. It has HEAP_SIZE + 1 slots so that
 * indices 1..HEAP_SIZE are usable with the root at index 1; slot 0 is unused.
 * Static storage duration, so every entry starts zeroed.
 */
static min_heap_t heap[HEAP_SIZE + 1];
/*
 * Map a key to a bucket index in the range [0, HASH_TBL_SIZE).
 *
 * The key is reduced modulo the table size (not the other way round, which
 * would divide by zero for key == 0 and could yield an index outside the
 * table). The key is converted to unsigned first so that negative keys
 * cannot produce a negative remainder.
 */
int hash(hash_key_t key){
    return (int)((unsigned int)key % HASH_TBL_SIZE);
}
/*
 * Allocate a chain node for `key` with its count initialized to 1 and no
 * successor.
 *
 * Returns the new node, or NULL if the allocation fails. The caller owns
 * the returned memory.
 */
hash_node_t* alloc_a_node(hash_key_t key){
    /* sizeof *node is the size of the struct; sizeof(node) would only be
     * the size of a pointer and the field writes below would overflow. */
    hash_node_t* node = malloc(sizeof *node);
    if(!node)
        return NULL;

    node->key = key;
    node->next = NULL;
    node->num = 1;
    return node;
}
/*
 * Record one occurrence of `key` in hash_tbl.
 *
 * If the key is already present in its bucket chain, its count is
 * incremented. Otherwise a new node (count 1) is allocated and pushed at
 * the head of the chain.
 *
 * Allocation failure is fatal: a message is written to stderr and the
 * process aborts. This is done explicitly rather than with assert(0),
 * which compiles to nothing under NDEBUG and would let the NULL node be
 * dereferenced.
 */
void node_insert(hash_key_t key){
    int bucket = hash(key);
    hash_node_t* node;

    /* Look for an existing entry for this key. */
    for(node = hash_tbl[bucket]; node; node = node->next){
        if(node->key == key){
            node->num++;
            return;
        }
    }

    /* Not found: create a node and link it in front of the chain. */
    node = alloc_a_node(key);
    if(!node){
        fprintf(stderr, "node_insert: out of memory\n");
        abort();
    }
    node->next = hash_tbl[bucket];
    hash_tbl[bucket] = node;
}
/*
 * Sift the entry at index `cursor` down until the subtree rooted there
 * satisfies the min-heap property on `num`.
 *
 * min_heap: heap array with the root at index 1 (slot 0 is not used).
 * size:     number of entries in the heap; valid indices are 1..size.
 * cursor:   index of the entry to sift down.
 *
 * Both subtrees of `cursor` are expected to already be min-heaps.
 * Does nothing if min_heap is NULL or cursor is below 1.
 */
void heap_min_heaprify(min_heap_t* min_heap, int size, int cursor){
    if(!min_heap || cursor < 1)
        return;

    for(;;){
        int left = left_child(cursor);
        int right = left + 1;
        int smallest = cursor;
        min_heap_t tmp;

        /* <= size: index `size` is the last valid slot of a 1-based heap. */
        if(left <= size && min_heap[left].num < min_heap[smallest].num)
            smallest = left;
        if(right <= size && min_heap[right].num < min_heap[smallest].num)
            smallest = right;

        /* Neither child is smaller: the heap property holds here. */
        if(smallest == cursor)
            return;

        tmp = min_heap[smallest];
        min_heap[smallest] = min_heap[cursor];
        min_heap[cursor] = tmp;

        /* Continue from the slot the entry moved into. */
        cursor = smallest;
    }
}
/*
 * Rearrange min_heap[1..size] into a min-heap.
 *
 * min_heap: heap array with the root at index 1; taken by pointer so the
 *           caller's array is the one that gets reordered.
 * size:     number of entries in the heap.
 *
 * Calls heap_min_heaprify on every index from size/2 down to 1.
 */
void build_min_heap(min_heap_t* min_heap, int size){
    int i;

    for(i = size / 2; i >= 1; --i)
        heap_min_heaprify(min_heap, size, i);
}
void topk_main(){
hash_node_t* tmp = NULL;
int i = 0;
for(i = 0; i < HASH_TBL_SIZE; ++i){
tmp = hash_table[i];
if(tmp > min_heap[i].num){
heap[1].num = tmp->num;
heap[1].data = tmp->key;
heap_min_heaprify(heap, HEAP_SIZE, 1);
}
tmp = tmp->next;
}
return;
}
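/*
 * Article title: Finding the N most frequent numbers in a data set (top-K).
 * Source page note: latest recommended article published 2022-04-14 20:52:45.
 */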