1. Overview
The job of vocab_count.c is to build the vocabulary. Its input is the entire corpus; its output is the vocabulary, formatted as one word per line together with that word's frequency in the corpus (as shown below).
//vocab.txt
the 1061396
of 593677
and 14567
The output key-value pairs are sorted by count in descending order. Since C has no ready-made dict data structure, one has to be implemented by hand.
2. Source Code Analysis
First, let's look at how GloVe stores words:
typedef struct vocabulary { // one vocabulary entry
char *word;
long long count;
} VOCAB;
typedef struct hashrec { // hash-table record for storing the entries above
char *word;
long long count;
struct hashrec *next;
} HASHREC;
Each record stores the word string and its count. As in word2vec, this code has to handle hash collisions. GloVe resolves them by chaining: words with the same hash value are linked together in a list, which is what the HASHREC type is for. Below are the comparison functions used for sorting the vocabulary.
/* string comparison, equivalent to strcmp */
int scmp( char *s1, char *s2 ) {
while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
return *s1 - *s2;
}
/* higher count first; for equal counts, the lexicographically smaller word comes first */
int CompareVocabTie(const void *a, const void *b) {
long long c;
if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
else return (scmp(((VOCAB *) a)->word,((VOCAB *) b)->word));
}
/* sort by count only (unstable: the relative order of equal-count words is unspecified) */
int CompareVocab(const void *a, const void *b) {
long long c;
if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
else return 0;
}
Computing a word's hash value:
/* Simple bitwise hash function */
unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
char c;
unsigned int h;
h = seed;
for ( ; (c = *word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2));
return (unsigned int)((h & 0x7fffffff) % tsize);
}
Initializing the hash table
HASHREC **ht implements the dict: it lets us go from a word's hash value to the word's record quickly. It is an array of linked lists whose size, TSIZE, is the upper bound on the hash value. To find a word in this array of chains, first compute the word's hash, say 15, then search for the word in the linked list at ht[15].
/* Create hash table, initialise pointers to NULL */
HASHREC ** inithashtable() {
int i;
HASHREC **ht;
ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );
for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL; // note: every bucket must start out NULL
return ht;
}
The hash-table insertion algorithm
/* Search hash table for given string, insert if not found */
void hashinsert(HASHREC **ht, char *w) {
HASHREC *htmp, *hprv;
unsigned int hval = HASHFN(w, TSIZE, SEED);
for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
// The loop above searches for the right position (htmp): it stops either at the
// end of the chain or at a record whose word equals w.
// hprv tracks the node just before htmp, which makes the list insertion easy.
if (htmp == NULL) {
htmp = (HASHREC *) malloc( sizeof(HASHREC) );
htmp->word = (char *) malloc( strlen(w) + 1 );
strcpy(htmp->word, w);
htmp->count = 1;
htmp->next = NULL;
if ( hprv==NULL )
ht[hval] = htmp;
else
hprv->next = htmp;
}
else {
/* new records are not moved to front */
htmp->count++;
if (hprv != NULL) { // a small speed trick: move the record just accessed to the head of the chain (the intuition: the same word often reappears soon)
/* move to front on access */
hprv->next = htmp->next;
htmp->next = ht[hval];
ht[hval] = htmp;
}
}
return;
}
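A tiny hypothetical demo of the insertion path (assumes the definitions above are in the same file):
/* Hypothetical demo: a repeated word's count is incremented, not re-inserted. */
#include <stdio.h>

int main(void) {
    HASHREC **ht = inithashtable();
    hashinsert(ht, "the");
    hashinsert(ht, "cat");
    hashinsert(ht, "the");   // second insertion of "the" bumps its count to 2
    unsigned int hval = HASHFN("the", TSIZE, SEED);
    printf("%s %lld\n", ht[hval]->word, ht[hval]->count);  // prints "the 2" (assuming "cat" hashed to a different bucket)
    return 0;
}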
The following function reads words from a file one at a time:
int get_word(char *word, FILE *fin) {
int i = 0, ch;
for ( ; ; ) {
ch = fgetc(fin);
if (ch == '\r') continue;
if (i == 0 && ((ch == '\n') || (ch == EOF))) {
word[i] = 0;
return 1;
}
if (i == 0 && ((ch == ' ') || (ch == '\t'))) continue; // skip leading space
if ((ch == EOF) || (ch == ' ') || (ch == '\t') || (ch == '\n')) {
if (ch == '\n') ungetc(ch, fin); // push the '\n' we just read back onto the input stream
break;
}
if (i < MAX_STRING_LENGTH - 1)
word[i++] = ch; // don't allow words to exceed MAX_STRING_LENGTH
}
word[i] = 0; //null terminate
// avoid truncation destroying a multibyte UTF-8 char except if only thing on line (so the i > x tests won't overwrite word[0])
// see https://en.wikipedia.org/wiki/UTF-8#Description
if (i == MAX_STRING_LENGTH - 1 && (word[i-1] & 0x80) == 0x80) {
if ((word[i-1] & 0xC0) == 0xC0) {
word[i-1] = '\0';
} else if (i > 2 && (word[i-2] & 0xE0) == 0xE0) {
word[i-2] = '\0';
} else if (i > 3 && (word[i-3] & 0xF8) == 0xF0) {
word[i-3] = '\0';
}
}
return 0;
}
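get_word can be exercised on its own. The harness below is hypothetical and assumes POSIX fmemopen plus the get_word and MAX_STRING_LENGTH definitions above:
/* Hypothetical test harness for get_word using an in-memory FILE (POSIX fmemopen). */
#include <stdio.h>
#include <string.h>

int main(void) {
    char buf[] = "the quick\nfox";
    FILE *fin = fmemopen(buf, strlen(buf), "r");
    char word[MAX_STRING_LENGTH + 1];
    while (!feof(fin)) {
        if (get_word(word, fin)) printf("(newline or EOF marker)\n");
        else                     printf("word: %s\n", word);
    }
    fclose(fin);
    return 0;
}
Expected output: "word: the", "word: quick", the newline marker, then "word: fox".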
The main processing function: it builds the hash table, sorts its key-value pairs, and writes them to the output.
int get_counts() {
long long i = 0, j = 0, vocab_size = 12500;
// char format[20];
char str[MAX_STRING_LENGTH + 1];
HASHREC **vocab_hash = inithashtable();
HASHREC *htmp;
VOCAB *vocab;
FILE *fid = stdin; // read from stdin: the program is run from the command line with the corpus redirected to standard input
fprintf(stderr, "BUILDING VOCABULARY\n");
if (verbose > 1) fprintf(stderr, "Processed %lld tokens.", i);
while ( ! feof(fid)) {
// Insert all tokens into hashtable
int nl = get_word(str, fid);
if (nl) continue; // just a newline marker or feof
if (strcmp(str, "<unk>") == 0) {
fprintf(stderr, "\nError, <unk> vector found in corpus.\nPlease remove <unk>s from your corpus (e.g. cat text8 | sed -e 's/<unk>/<raw_unk>/g' > text8.new)");
return 1;
}
hashinsert(vocab_hash, str);
if (((++i)%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[11G%lld tokens.", i);
}
if (verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", i);
vocab = malloc(sizeof(VOCAB) * vocab_size); // build the array used for sorting
for (i = 0; i < TSIZE; i++) { // Migrate vocab to array
htmp = vocab_hash[i];
while (htmp != NULL) {
vocab[j].word = htmp->word;
vocab[j].count = htmp->count;
j++;
if (j>=vocab_size) {
vocab_size += 2500;
vocab = (VOCAB *)realloc(vocab, sizeof(VOCAB) * vocab_size); //空间不够时,对数组扩充
}
htmp = htmp->next;
}
}
if (verbose > 1) fprintf(stderr, "Counted %lld unique words.\n", j);
if (max_vocab > 0 && max_vocab < j) // if the actual vocabulary exceeds max_vocab: first sort all words by count, truncate to max_vocab, then re-sort the survivors with alphabetical tie-breaking
qsort(vocab, j, sizeof(VOCAB), CompareVocab);
else max_vocab = j;
qsort(vocab, max_vocab, sizeof(VOCAB), CompareVocabTie); //After (possibly) truncating, sort (possibly again), breaking ties alphabetically
for (i = 0; i < max_vocab; i++) {
if (vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary
if (verbose > 0) fprintf(stderr, "Truncating vocabulary at min count %lld.\n",min_count);
break;
}
printf("%s %lld\n",vocab[i].word,vocab[i].count);
}
if (i == max_vocab && max_vocab < j) if (verbose > 0) fprintf(stderr, "Truncating vocabulary at size %lld.\n", max_vocab);
fprintf(stderr, "Using vocabulary of size %lld.\n\n", i);
return 0;
}
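The truncation logic near the end is subtle enough to deserve a toy illustration. When max_vocab < j, the first qsort with the unstable CompareVocab orders everything by count, the array is logically cut at max_vocab, and only then does CompareVocabTie impose deterministic alphabetical tie-breaking on the kept words. A hypothetical standalone demo (assumes the types and comparators above are in scope):
/* Hypothetical demo of the two-stage truncation sort. */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    VOCAB v[] = { {"b", 2}, {"d", 5}, {"c", 2}, {"a", 2} };
    long long max_vocab = 2;
    qsort(v, 4, sizeof(VOCAB), CompareVocab);             // coarse sort by count only
    qsort(v, max_vocab, sizeof(VOCAB), CompareVocabTie);  // tie-break only the kept top-2
    for (long long i = 0; i < max_vocab; i++) printf("%s %lld\n", v[i].word, v[i].count);
    /* "d 5" is printed first; which single count-2 word survives the cut is
       unspecified, because CompareVocab does not order equal counts. */
    return 0;
}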
int find_arg(char *str, int argc, char **argv) { // find the position of a keyword argument on the command line
int i;
for (i = 1; i < argc; i++) {
if (!scmp(str, argv[i])) {
if (i == argc - 1) { // error out if no value follows the keyword argument
printf("No argument given for %s\n", str);
exit(1);
}
return i;
}
}
return -1;
}
int main(int argc, char **argv) {
int i;
if (argc == 1) {
printf("Simple tool to extract unigram counts\n");
printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n");
printf("Usage options:\n");
printf("\t-verbose <int>\n");
printf("\t\tSet verbosity: 0, 1, or 2 (default)\n");
printf("\t-max-vocab <int>\n");
printf("\t\tUpper bound on vocabulary size, i.e. keep the <int> most frequent words. The minimum frequency words are randomly sampled so as to obtain an even distribution over the alphabet.\n");
printf("\t-min-count <int>\n");
printf("\t\tLower limit such that words which occur fewer than <int> times are discarded.\n");
printf("\nExample usage:\n");
printf("./vocab_count -verbose 2 -max-vocab 100000 -min-count 10 < corpus.txt > vocab.txt\n");
return 0;
}
if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
if ((i = find_arg((char *)"-max-vocab", argc, argv)) > 0) max_vocab = atoll(argv[i + 1]);
if ((i = find_arg((char *)"-min-count", argc, argv)) > 0) min_count = atoll(argv[i + 1]);
return get_counts();
}
3. Usage
./vocab_count -min-count 5 -verbose 2 < test.txt > vocab.txt
- -min-count: the minimum number of occurrences a word needs in order to be kept
- -verbose 2: verbose output (prints progress information while the program runs)
- -max-vocab: not given here; defaults to 0, meaning the vocabulary size is unlimited
- <: redirects the text file to the program's standard input
- >: redirects the program's printed output to the output file
4. Takeaways
- Saw a new way for a C program to parse command-line arguments
- Gained a deeper understanding of hash-collision resolution
- The truncate-then-re-sort trick behind the max-vocab parameter is quite clever
- Practice with shell input/output redirection
5. Possible Improvements
The program's final output is just a sorted vocabulary file, and running it single-threaded is slow. One improvement would be to count with multiple threads, or to borrow the map-reduce idea: count shards of the corpus in parallel, then merge the partial counts and produce the sorted output with a merge-sort-like pass, as sketched below.
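A minimal sketch of that map-reduce idea, assuming the corpus has been pre-split into shard files. The shard names, the hashinsertn helper, and the two-thread layout are all hypothetical; the sketch reuses HASHREC, inithashtable, hashinsert, get_word, scmp, and the constants from vocab_count.c, and must be linked with -lpthread:
/* Hypothetical map-reduce style counter: each thread counts one shard into a
   private hash table ("map"), then the tables are merged ("reduce") before
   the usual flatten-and-sort step from get_counts(). */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { const char *path; HASHREC **ht; } SHARD;

void *count_shard(void *arg) {            // map: count one shard privately, no locking needed
    SHARD *s = (SHARD *) arg;
    char word[MAX_STRING_LENGTH + 1];
    FILE *fid = fopen(s->path, "r");
    while (!feof(fid))
        if (!get_word(word, fid)) hashinsert(s->ht, word);
    fclose(fid);
    return NULL;
}

void hashinsertn(HASHREC **ht, char *w, long long n) {  // like hashinsert, but adds n at once
    unsigned int hval = HASHFN(w, TSIZE, SEED);
    HASHREC *htmp, *hprv;
    for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0;
         hprv = htmp, htmp = htmp->next);
    if (htmp == NULL) {
        htmp = (HASHREC *) malloc(sizeof(HASHREC));
        htmp->word = (char *) malloc(strlen(w) + 1);
        strcpy(htmp->word, w);
        htmp->count = n;
        htmp->next = NULL;
        if (hprv == NULL) ht[hval] = htmp; else hprv->next = htmp;
    } else htmp->count += n;
}

void merge_into(HASHREC **dst, HASHREC **src) {  // reduce: fold src counts into dst
    for (int i = 0; i < TSIZE; i++)
        for (HASHREC *h = src[i]; h != NULL; h = h->next)
            hashinsertn(dst, h->word, h->count);
}

int main(void) {
    const char *paths[] = { "shard0.txt", "shard1.txt" };  // hypothetical shard files
    pthread_t tid[2];
    SHARD shards[2];
    for (int t = 0; t < 2; t++) {
        shards[t].path = paths[t];
        shards[t].ht = inithashtable();
        pthread_create(&tid[t], NULL, count_shard, &shards[t]);
    }
    HASHREC **merged = inithashtable();
    for (int t = 0; t < 2; t++) {
        pthread_join(tid[t], NULL);
        merge_into(merged, shards[t].ht);
    }
    /* merged can now be flattened into a VOCAB array and sorted
       exactly as get_counts() does */
    return 0;
}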