The word2vec code only ever treats the surrounding words as context. Many papers have since proposed ways to enrich the context and obtain higher-quality, or differently-flavored, word vectors. word2vecf is one such extension of word2vec: where word2vec takes a raw corpus as input, word2vecf takes word-context pairs, which supports arbitrary definitions of center word and context. For example, to bring in dependency information we only need to supply a file of word-context pairs extracted from dependency parses; word2vecf itself needs no modification at all. Since word2vecf differs little from word2vec, we will just walk through the code briefly.
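Concretely, each line of the training file holds one pair, the word followed by its context, separated by whitespace. The lines below are only an illustration of what dependency contexts might look like; the labeling scheme is entirely up to whoever generates the pairs:
discovers scientist/nsubj
discovers star/dobj
scientist discovers/nsubj-1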
First, install_word2vecf.sh in the scripts folder. It downloads word2vecf and compiles the C files; all the source code and the compiled executables end up in the word2vecf folder.
#!/bin/sh
mkdir word2vecf
wget https://bitbucket.org/yoavgo/word2vecf/get/1b94252a58d4.zip
unzip 1b94252a58d4.zip
rm 1b94252a58d4.zip
mv yoavgo-word2vecf-1b94252a58d4/*.c word2vecf/.
mv yoavgo-word2vecf-1b94252a58d4/*.h word2vecf/.
mv yoavgo-word2vecf-1b94252a58d4/makefile word2vecf/.
rm -r yoavgo-word2vecf-1b94252a58d4
make -C word2vecf
Next, the makefile. What we ultimately need is the word2vecf executable, and we can see it is built from three files: word2vecf.c, vocab.c and io.c. Those three files are all we have to read.
CC = gcc
#The -Ofast might not work with older versions of gcc; in that case, use -O2
#CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result
CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result

all: word2vec word2phrase distance word-analogy compute-accuracy word2vecf count_and_filter

count_and_filter: count_and_filter.c vocab.c io.c
	$(CC) vocab.c count_and_filter.c io.c -o count_and_filter $(CFLAGS)
word2vec : word2vec.c
	$(CC) word2vec.c -o word2vec $(CFLAGS)
word2vecf : word2vecf.c vocab.c io.c
	$(CC) word2vecf.c vocab.c io.c -o word2vecf $(CFLAGS)
word2phrase : word2phrase.c
	$(CC) word2phrase.c -o word2phrase $(CFLAGS)
distance : distance.c
	$(CC) distance.c -o distance $(CFLAGS)
word-analogy : word-analogy.c
	$(CC) word-analogy.c -o word-analogy $(CFLAGS)
compute-accuracy : compute-accuracy.c
	$(CC) compute-accuracy.c -o compute-accuracy $(CFLAGS)
	chmod +x *.sh

clean:
	rm -rf word2vec word2phrase distance word-analogy compute-accuracy count_and_filter word2vecf
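Once everything is built, a typical run (following the word2vecf README; the file names here are placeholders) first derives the two vocabulary files from the pair file with count_and_filter, then trains on the same pairs:
./count_and_filter -train dep.contexts -cvocab cv -wvocab wv -min-count 100
./word2vecf -train dep.contexts -wvocab wv -cvocab cv -output vecs -size 200 -negative 15 -threads 10
The wv and cv files written by count_and_filter are exactly what ReadVocab, discussed below, reads back in.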
First, io.c. Its only job is to read one whitespace-delimited word from a file; word2vec and GloVe use nearly identical code. Note that unlike word2vec's ReadWord, this version does not turn a newline into a special </s> token; a newline is just another separator.
void ReadWord(char *word, FILE *fin, int MAX_STRING) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue; // skip carriage returns
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) break;     // whitespace ends the word we were reading
      else continue;        // still skipping leading whitespace
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--; // Truncate too long words
  }
  word[a] = 0; // terminate the string
}
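As a minimal sketch of how ReadWord gets used (a hypothetical driver; the prototype is declared by hand instead of including io.h, and pairs.txt is a made-up file name):
#include <stdio.h>

void ReadWord(char *word, FILE *fin, int max_len); /* from io.c */

int main(void) {
  char word[100];
  FILE *fin = fopen("pairs.txt", "rb"); /* hypothetical input file */
  if (fin == NULL) return 1;
  while (1) {
    ReadWord(word, fin, 100); /* fills word[] with the next token */
    if (feof(fin)) break;
    printf("token: %s\n", word);
  }
  fclose(fin);
  return 0;
}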
Next, the vocabulary code; the header first.
struct vocab_word { // we only use cn, the word's count, and word, its spelling
  long long cn;
  int *point;                 // point/code/codelen are Huffman-tree leftovers
  char *word, *code, codelen; // from word2vec and go unused here
};
struct vocabulary { // all vocabulary state lives in one struct; the mechanism is exactly word2vec's
  struct vocab_word *vocab;   // the word array
  int *vocab_hash;            // hash table for fast word lookup
  long long vocab_max_size;   // capacity of vocab (starts at 1000); grown in chunks when full,
                              // so we do not reallocate on every single added word
  long vocab_size;            // number of words currently in the vocabulary
  long long word_count;       // corpus size, i.e. the sum of all the word counts
};
Below are the declarations of the vocabulary-building functions. Building a vocabulary in C is clearly far more work than it would be in Python, but word2vecf's version is written more cleanly than word2vec's.
int ReadWordIndex(struct vocabulary *v, FILE *fin);
inline int GetWordHash(struct vocabulary *v, char *word);
int SearchVocab(struct vocabulary *v, char *word);
int AddWordToVocab(struct vocabulary *v, char *word);
void SortAndReduceVocab(struct vocabulary *v, int min_count);
struct vocabulary *CreateVocabulary();
void SaveVocab(struct vocabulary *v, char *vocab_file);
struct vocabulary *ReadVocab(char *vocab_file);
void EnsureVocabSize(struct vocabulary *v);
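Before diving into the implementations, here is a minimal, hypothetical sketch of how these functions fit together, assuming the header above is available as vocab.h:
#include <stdio.h>
#include "vocab.h"

int main(void) {
  struct vocabulary *v = CreateVocabulary();
  char *words[] = {"the", "cat", "the"};
  for (int i = 0; i < 3; i++) {
    int idx = SearchVocab(v, words[i]);      // -1 if the word is not there yet
    if (idx < 0) idx = AddWordToVocab(v, words[i]);
    v->vocab[idx].cn++;                      // counting occurrences is the caller's job
  }
  SortAndReduceVocab(v, 1);                  // sort by count, drop words with cn < 1
  printf("%ld words, %lld tokens\n", v->vocab_size, v->word_count);
  return 0;
}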
Now the implementations in vocab.c. GetWordHash returns a word's hash value using the simplest possible rolling hash, hash(w) = (w[0]*257^(n-1) + ... + w[n-1]) mod vocab_hash_size; for instance hash("ab") = ('a'*257 + 'b') mod vocab_hash_size.
// Returns hash value of a word
inline int GetWordHash(struct vocabulary *v, char *word) {
  unsigned long long hash = 0;
  char *b = word;
  //for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  //hash = FastHash(word, strlen(word)) % vocab_hash_size;
  while (*b != 0) hash = hash * 257 + *(b++);
  hash = hash % vocab_hash_size;
  return hash;
}
CreateVocabulary allocates and initializes an empty vocabulary.
struct vocabulary *CreateVocabulary() {
  struct vocabulary *v = malloc(sizeof(struct vocabulary));
  long long a;
  v->vocab_max_size = 1000; // initial capacity of the vocab array
  v->vocab_size = 0;        // no words yet
  v->vocab = (struct vocab_word *)calloc(v->vocab_max_size, sizeof(struct vocab_word)); // the word array
  v->vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));                          // the hash table
  for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1; // -1 marks an empty slot
  return v;
}
Inserting a word into the vocabulary:
// Adds a word to the vocabulary
int AddWordToVocab(struct vocabulary *v, char *word) {
  //static long collide = 0;
  //static long nocollide = 0;
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  v->vocab[v->vocab_size].word = (char *)calloc(length, sizeof(char)); // store the word string
  strcpy(v->vocab[v->vocab_size].word, word);
  v->vocab[v->vocab_size].cn = 0; // its count starts at zero
  v->vocab_size++;                // one more word in the vocabulary
  // Reallocate memory if needed
  if (v->vocab_size + 2 >= v->vocab_max_size) { // grow the array just before it runs out of room
    v->vocab_max_size += 1000;
    v->vocab = (struct vocab_word *)realloc(v->vocab, v->vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(v, word); // now update the hash table
  //if (v->vocab_hash[hash] != -1) { collide += 1; } else { nocollide += 1; }
  //if ((collide + nocollide) % 100000 == 0) printf("%d %d %f collisions\n\n",collide, nocollide, (float)collide/(collide+nocollide));
  while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; // linear probing on collisions
  v->vocab_hash[hash] = v->vocab_size - 1;
  return v->vocab_size - 1; // return the word's position (its id) in vocab
}
Looking up a word's position in vocab, i.e. its id:
int SearchVocab(struct vocabulary *v, char *word) {
  unsigned int hash = GetWordHash(v, word); // start at the word's hash slot
  while (1) {
    if ((v->vocab_hash)[hash] == -1) return -1; // empty slot: the word is not in the vocabulary
    if (!strcmp(word, v->vocab[v->vocab_hash[hash]].word)) return v->vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size; // probe the next slot
  }
  return -1; // unreachable
}
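ReadWordIndex, declared in the header but not shown in this walkthrough, just glues ReadWord and SearchVocab together; presumably it looks like word2vec's version:
int ReadWordIndex(struct vocabulary *v, FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin, MAX_STRING);
  if (feof(fin)) return -1;
  return SearchVocab(v, word); // -1 for out-of-vocabulary words
}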
Shrinking the vocabulary by dropping infrequent words, with the same mechanism as in word2vec: min_reduce is static, so every call raises the pruning threshold by one.
// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab(struct vocabulary *v) {
  static int min_reduce = 1; // words at or below this count are dropped
  printf("reducevocab\n");
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < v->vocab_size; a++) if (v->vocab[a].cn > min_reduce) { // one pass: compact the survivors to the front
    v->vocab[b].cn = v->vocab[a].cn;
    v->vocab[b].word = v->vocab[a].word;
    b++;
  } else free(v->vocab[a].word);
  v->vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1; // rebuild the hash table
  for (a = 0; a < v->vocab_size; a++) {
    // Hash will be re-computed, as it is not actual
    hash = GetWordHash(v, v->vocab[a].word);
    while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    v->vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}
Next, the function that both sorts the vocabulary and filters out infrequent words. Note that the sort starts at index 1, keeping whatever sits at position 0 (the </s> token in word2vec) in place.
void SortAndReduceVocab(struct vocabulary *v, int min_count) {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary (by count, descending) and keep </s> at the first position
  qsort(&(v->vocab[1]), v->vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1;
  size = v->vocab_size;
  v->word_count = 0;
  for (a = 0; a < size; a++) { // walk the vocabulary: drop infrequent words, re-hash the rest
    // Words occurring less than min_count times will be discarded from the vocab
    if (v->vocab[a].cn < min_count) {
      v->vocab_size--;
      free(v->vocab[v->vocab_size].word);
    } else {
      // Hash will be re-computed, as after the sorting it is not actual
      hash = GetWordHash(v, v->vocab[a].word);
      while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      v->vocab_hash[hash] = a;
      v->word_count += v->vocab[a].cn;
    }
  }
  v->vocab = (struct vocab_word *)realloc(v->vocab, (v->vocab_size + 1) * sizeof(struct vocab_word));
}
word2vecf builds its vocabularies by reading vocabulary files rather than by scanning the corpus the way word2vec does.
struct vocabulary *ReadVocab(char *vocabfile) {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(vocabfile, "rb"); // the file format is one word per line, followed by its count
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  struct vocabulary *v = CreateVocabulary(); // start from an empty vocabulary
  while (1) { // read every word; the reduce mechanism is not needed here
    ReadWord(word, fin, MAX_STRING); // read one word
    if (feof(fin)) break;
    a = AddWordToVocab(v, word);
    fscanf(fin, "%lld%c", &v->vocab[a].cn, &c); // read the word's count; c swallows the newline
    i++;
  }
  SortAndReduceVocab(v, 0); // finally sort the words by count; with min_count 0 nothing is filtered
  printf("Vocab size: %ld\n", v->vocab_size);
  printf("Word count: %lld\n", v->word_count);
  return v;
}
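For reference, a vocabulary file consumed by ReadVocab would look something like this (the words and counts here are made up):
the 1061396
of 593677
dog 10234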
Finally, word2vecf.c itself. Below is the heart of TrainModel; the recipe is the same as word2vec's: build the vocabularies, then launch the training threads. The one difference is that there are two vocabularies, one for the center words and one for the contexts.
long a, b, c, d;
FILE *fo;
FILE *fo2;
file_size = GetFileSize(train_file);
pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
printf("Starting training using file %s\n", train_file);
starting_alpha = alpha;
wv = ReadVocab(wvocab_file); // the word (center-word) vocabulary
cv = ReadVocab(cvocab_file); // the context vocabulary
InitNet(wv, cv);
InitUnigramTable(cv);
start = clock();
for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
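InitNet and InitUnigramTable are not shown in the snippet above. The unigram table used for negative sampling is essentially word2vec's, except that it is built from whichever vocabulary is passed in, here the context vocabulary; a sketch under that assumption (unitable and table_size are the globals used later in TrainModelThread):
void InitUnigramTable(struct vocabulary *v) {
  long long a, i;
  double power = 0.75, train_words_pow = 0, d1;
  unitable = (int *)malloc(table_size * sizeof(int));
  // raise counts to the 0.75 power, as in word2vec, to flatten the distribution
  for (a = 0; a < v->vocab_size; a++) train_words_pow += pow(v->vocab[a].cn, power);
  i = 0;
  d1 = pow(v->vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    unitable[a] = i; // word i owns a stretch of the table proportional to cn^0.75
    if (a / (double)table_size > d1) {
      i++;
      d1 += pow(v->vocab[i].cn, power) / train_words_pow;
    }
    if (i >= v->vocab_size) i = v->vocab_size - 1;
  }
}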
Last, the training itself. The biggest difference from word2vec is that the input is read pair by pair.
void *TrainModelThread(void *id) {
  int ctxi = -1, wrdi = -1;
  long long d;
  long long word_count = 0, last_word_count = 0;
  long long l1, l2, c, target, label;
  unsigned long long next_random = (unsigned long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb"); // the training file is the pair file
  long long start_offset = file_size / (long long)num_threads * (long long)id;
  long long end_offset = file_size / (long long)num_threads * (long long)(id+1);
  int iter;
  //printf("thread %d %lld %lld \n",id, start_offset, end_offset);
  for (iter=0; iter < numiters; ++iter) {
    fseek(fi, start_offset, SEEK_SET);
    // if not binary:
    // the offsets do not fall on line boundaries, so skip ahead to the next
    // newline and start from a complete pair
    while (fgetc(fi) != '\n') { }; //TODO make sure its ok
    printf("thread %d %lld\n", id, ftell(fi));
    long long train_words = wv->word_count;
    while (1) { //HERE @@@
      // TODO set alpha scheduling based on number of examples read.
      // The conceptual change is the move from word_count to pair_count
      if (word_count - last_word_count > 10000) { // periodically report progress and decay alpha
        word_count_actual += word_count - last_word_count;
        last_word_count = word_count;
        if ((debug_mode > 1)) {
          now=clock();
          printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
            word_count_actual / (real)(numiters*train_words + 1) * 100,
            word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
          fflush(stdout);
        }
        alpha = starting_alpha * (1 - word_count_actual / (real)(numiters*train_words + 1));
        if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
      }
      if (feof(fi) || ftell(fi) > end_offset) break;
      for (c = 0; c < layer1_size; c++) neu1[c] = 0;
      for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
      wrdi = ReadWordIndex(wv, fi); // read the first word of the pair, its id comes from the word vocabulary
      ctxi = ReadWordIndex(cv, fi); // read the second word of the pair, its id comes from the context vocabulary
      word_count++; //TODO ?
      if (wrdi < 0 || ctxi < 0) continue; // skip pairs with out-of-vocabulary words
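      // What follows mirrors word2vec's subsampling, applied to both sides of the
      // pair: a pair survives with probability p(w) = (sqrt(f(w)/t) + 1) * t / f(w),
      // where f(w) = cn / word_count is the word's relative frequency and t is the
      // -sample threshold, so very frequent words are discarded more aggressively.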
      if (sample > 0) { // decide whether to subsample (skip) this pair
        real ran = (sqrt(wv->vocab[wrdi].cn / (sample * wv->word_count)) + 1) * (sample * wv->word_count) / wv->vocab[wrdi].cn;
        next_random = next_random * (unsigned long long)25214903917 + 11;
        if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        ran = (sqrt(cv->vocab[ctxi].cn / (sample * cv->word_count)) + 1) * (sample * cv->word_count) / cv->vocab[ctxi].cn;
        next_random = next_random * (unsigned long long)25214903917 + 11;
        if (ran < (next_random & 0xFFFF) / (real)65536) continue;
      }
      //fread(&wrdi, 4, 1, fi);
      //fread(&ctxi, 4, 1, fi);
      // NEGATIVE SAMPLING, exactly as in word2vec
      l1 = wrdi * layer1_size;
      for (d = 0; d < negative + 1; d++) {
        if (d == 0) {
          target = ctxi; // the true context is the one positive example
          label = 1;
        } else {
          next_random = next_random * (unsigned long long)25214903917 + 11;
          target = unitable[(next_random >> 16) % table_size]; // draw a negative context
          if (target == 0) target = next_random % (cv->vocab_size - 1) + 1;
          if (target == ctxi) continue;
          label = 0;
        }
        l2 = target * layer1_size;
        f = 0;
        for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; // dot product of word and context vectors
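        // g below is the gradient times the learning rate, (label - sigmoid(f)) * alpha,
        // with the sigmoid looked up in the precomputed expTable and clipped at +/-MAX_EXP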
        if (f > MAX_EXP) g = (label - 1) * alpha;
        else if (f < -MAX_EXP) g = (label - 0) * alpha;
        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; // accumulate the update for the word vector
        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; // update the context vector
      }
      // Learn weights input -> hidden
      for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; // apply the accumulated update to the word vector
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}