词向量源码解析：（3.5）GloVe源码解析之glove

最新推荐文章于 2024-09-14 07:09:25 发布

Sailing_ZhaoZhe

最新推荐文章于 2024-09-14 07:09:25 发布

阅读量2k

点赞数 3

分类专栏：词向量

本文链接：https://blog.csdn.net/u011793737/article/details/77941983

版权

本文详细解析了GloVe的源码实现，特别是训练部分，介绍了词向量和上下文向量的训练参数、初始化、多线程训练过程，以及采用的adagrad优化算法。

摘要由CSDN通过智能技术生成

和绝大多数的词向量模型不同，GloVe的目标是通过训练词向量和上下文向量，使得它们能够重构共现矩阵。GloVe训练部分的代码风格和word2vec训练部分的代码风格如出一辙，有了之前阅读word2vec源码的基础，很容易就能看懂GloVe是怎么做的。GloVe在共现三元组（word1、word2以及对应的共现值val）上进行训练，三元组的数据结构依然和之前一样。

/* One co-occurrence record (a "triple"). The training input file is treated
 * as a flat array of these records: its size in bytes is divided by
 * sizeof(CREC) to obtain the number of records, so the field order and types
 * define the on-disk layout. */
typedef struct cooccur_rec {
int word1; // presumably the index of the first word of the pair -- TODO confirm against the code that writes the file
int word2; // presumably the index of the second (context) word -- TODO confirm
real val; // value stored for the (word1, word2) pair; `real` is a floating-point typedef declared elsewhere in the file
} CREC;

我们首先看看GloVe中要训练的参数。从initialize_parameters可以看出，它和word2vec中的参数基本一样，都是一份词向量参数和一份上下文向量参数；区别在于GloVe的每个向量多了一维用来存放bias。下面是代码。

/*
 * Allocate and initialise the trainable parameters.
 *
 * Two global buffers are (re)assigned:
 *   W      - word vectors followed by context word vectors, 2 * vocab_size
 *            rows in total, each row holding vector_size components plus one
 *            trailing bias slot;
 *   gradsq - one accumulator per entry of W, initialised to 1.0.
 *
 * W is filled with values drawn from rand(), shifted to be centred on zero
 * and scaled down by the row width (vector_size + 1).
 *
 * Takes no arguments and returns nothing; reads the globals vocab_size and
 * vector_size. On allocation failure an error is printed to stderr and the
 * process exits with status 1.
 */
void initialize_parameters() {
    /* Row width: the vector components plus one extra slot for the bias. */
    const long long row_len = (long long)vector_size + 1;
    /* Word vectors and context word vectors live in the same buffer. */
    const long long num_rows = 2 * vocab_size;
    long long row, col, i;
    int rc;

    /* Allocate space for word vectors and context word vectors, and the
     * corresponding gradsq. 128-byte alignment might perform better than
     * plain malloc.
     * NOTE(review): (row_len + 1) reserves one more slot per row than this
     * function initialises; the size is kept as in the original in case code
     * elsewhere relies on it -- confirm before shrinking. */
    rc = posix_memalign((void **)&W, 128, num_rows * (row_len + 1) * sizeof(real));
    /* posix_memalign reports failure through its return value; the pointer
     * alone is not a reliable failure indicator. */
    if (rc != 0 || W == NULL) {
        fprintf(stderr, "Error allocating memory for W\n");
        exit(1);
    }
    rc = posix_memalign((void **)&gradsq, 128, num_rows * (row_len + 1) * sizeof(real));
    if (rc != 0 || gradsq == NULL) {
        fprintf(stderr, "Error allocating memory for gradsq\n");
        exit(1);
    }

    /* Column-major traversal is kept on purpose: it preserves which rand()
     * draw lands in which element, so a given seed yields the same W. */
    for (col = 0; col < row_len; col++) {
        for (row = 0; row < num_rows; row++) {
            W[row * row_len + col] = (rand() / (real)RAND_MAX - 0.5) / row_len;
        }
    }

    /* So initial value of eta is equal to initial learning rate. Every entry
     * gets the same constant, so a flat pass is equivalent. */
    for (i = 0; i < num_rows * row_len; i++) {
        gradsq[i] = 1.0;
    }
}

下面是train_glove函数，它开启多个线程调用glove_thread函数进行训练，和word2vec的套路一样。

/* Train model */
int train_glove() {
long long a, file_size;
int save_params_return_code;
int b;
FILE *fin;
real total_cost = 0;

fprintf(stderr, "TRAINING MODEL\n");

fin = fopen(input_file, "rb");//打开被打乱的三元组文件
if (fin == NULL) {fprintf(stderr,"Unable to open cooccurrence file %s.\n",input_file); return 1;}
fseeko(fin, 0, SEEK_END);
file_size = ftello(fin);
num_lines = file_size/(sizeof(CREC)); // Assuming the file isn't corrupt and consists only of CREC's//一共有多少个三元组
fclose(fin);
fprintf(stderr,"Read %lld lines.\n", num_lines);
if (verbose > 1) fprintf(stderr,"Initializing parameters...");
initialize_parameters();
if (verbose > 1) fprintf(stderr,"done.\n");
if (verbose > 0) fprintf(stderr,"vector size: %d\n", vector_size);
if (verbose > 0) fprintf(stderr,"vocab size: %lld\n", vocab_size);
if (verbose > 0) fprintf(stderr,"x_max: %lf\n", x_max);
if (verbose > 0) fprintf(stderr,"alpha: %lf\n", alpha);
pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));//多线程
lines_per_thread = (long long *) malloc(num_threads * sizeof(long long));//每个线程处理的三元组个数

time_t rawtime;
struct tm *info;