和绝大多数词向量模型不同,GloVe 的目标是通过训练词向量和上下文向量,使它们能够重构共现矩阵。GloVe 训练部分的代码风格和 word2vec 训练部分的代码风格如出一辙,有了之前阅读 word2vec 的基础,很容易就能看懂 GloVe 是怎么做的。GloVe 在三元组(word1、word2、val)上进行训练,三元组的数据结构和之前一样:
/* One co-occurrence triple; the training input file is read as a flat
   sequence of these records. */
typedef struct cooccur_rec {
    int word1;  /* first word of the pair (presumably a vocabulary index; confirm against the writer) */
    int word2;  /* second word of the pair (same caveat as word1) */
    real val;   /* value stored for this pair */
} CREC;
我们首先看看 GloVe 中要训练的参数。从 initialize_parameters 可以发现,它和 word2vec 中的参数基本一样,都是一份词向量参数和一份上下文向量参数。区别在于维度:GloVe 的每个向量多出一维用来存放 bias。下面是代码。
/*
 * Allocate and initialise the trainable parameters.
 *
 * W holds 2 * vocab_size rows (the factor 2 covers both the word vectors and
 * the context word vectors); each row has vector_size components plus one
 * bias term.  gradsq has the same layout and stores the per-parameter
 * gradient accumulator used by AdaGrad.
 *
 * Exits the process with status 1 if either allocation fails.
 */
void initialize_parameters() {
    /* Row width including the bias slot.  A local is used so the global
       vector_size is never left in a temporarily modified state. */
    const long long stride = (long long)vector_size + 1;
    const long long rows = 2 * vocab_size;
    /* NOTE(review): (stride + 1) values are reserved per row, one more than
       the loops below touch.  The size is kept as it was because code outside
       this function may have been written against it; confirm before
       shrinking. */
    const size_t bytes = (size_t)rows * (size_t)(stride + 1) * sizeof(real);
    long long row, col, i;
    int rc;

    /* 128-byte aligned allocation; might perform better than malloc.
       posix_memalign reports failure through its return value and does not
       promise to null the output pointer, so the return code is checked too. */
    rc = posix_memalign((void **)&W, 128, bytes);
    if (rc != 0 || W == NULL) {
        fprintf(stderr, "Error allocating memory for W\n");
        exit(1);
    }
    rc = posix_memalign((void **)&gradsq, 128, bytes);
    if (rc != 0 || gradsq == NULL) {
        fprintf(stderr, "Error allocating memory for gradsq\n");
        exit(1);
    }

    /* Random initialisation scaled by 1 / stride.  The column-outer /
       row-inner order is kept on purpose: it preserves which rand() draw
       lands in which slot, so a given seed yields the same initial W. */
    for (col = 0; col < stride; col++) {
        for (row = 0; row < rows; row++) {
            W[row * stride + col] = (rand() / (real)RAND_MAX - 0.5) / stride;
        }
    }

    /* So initial value of eta is equal to initial learning rate. */
    for (i = 0; i < rows * stride; i++) {
        gradsq[i] = 1.0;
    }
}
下面是 train_glove 函数:它开启多个线程,每个线程调用 glove_thread 函数进行训练,套路和 word2vec 一样。
/* Train model */
int train_glove() {
long long a, file_size;
int save_params_return_code;
int b;
FILE *fin;
real total_cost = 0;
fprintf(stderr, "TRAINING MODEL\n");
fin = fopen(input_file, "rb");//打开被打乱的三元组文件
if (fin == NULL) {fprintf(stderr,"Unable to open cooccurrence file %s.\n",input_file); return 1;}
fseeko(fin, 0, SEEK_END);
file_size = ftello(fin);
num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's//一共有多少个三元组
fclose(fin);
fprintf(stderr,"Read %lld lines.\n", num_lines);
if (verbose > 1) fprintf(stderr,"Initializing parameters...");
initialize_parameters();
if (verbose > 1) fprintf(stderr,"done.\n");
if (verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size);
if (verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size);
if (verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max);
if (verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha);
pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));//多线程
lines_per_thread = (long long *) malloc(num_threads * sizeof(long long));//每个线程处理的三元组个数
time_t rawtime;
struct tm *info;