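// Suggested reading: compare these annotations against the original word2vec.c; the parts marked in red are the added Chinese comments and the code they describe. (added by lijiawei)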
//
//
//
//
//
//
//
//
//
//
//
//
//
// Size in bytes of the fixed file-name buffers declared in this file
// (train_file, output_file, save_vocab_file, read_vocab_file).
#define MAX_STRING 100
// NOTE(review): presumably the number of entries in the precomputed expTable;
// the code that fills the table is not visible here - confirm.
#define EXP_TABLE_SIZE 1000
// NOTE(review): presumably the bound on |x| covered by expTable - confirm
// against the code that builds and indexes the table.
#define MAX_EXP 6
// NOTE(review): presumably the maximum number of words handled as one
// sentence during training - confirm at the point of use.
#define MAX_SENTENCE_LENGTH 1000
// NOTE(review): presumably the maximum Huffman code length stored per word
// (vocab_word.code / vocab_word.point) - confirm at the allocation site.
#define MAX_CODE_LENGTH 40
// NOTE(review): presumably the number of slots in vocab_hash - confirm at the
// allocation site.
const int vocab_hash_size = 30000000;
// Floating-point type used for the model arrays (syn0, syn1, syn1neg,
// expTable) and for alpha, starting_alpha and sample.
typedef float real;
//@brief Record kept for every distinct word found in the input file.
struct vocab_word {
  // Number of times the word occurs.
  long long cn;
  // Path from the root of the Huffman tree down to this word: an array that
  // holds, for each parent node on the path, that node's index in the
  // vocabulary.
  int *point;
  // Text of the word.
  char *word;
  // Huffman code of the word.
  char *code;
  // Length of the Huffman code. Note this is a plain char, not a pointer.
  char codelen;
};
// Fixed-size (MAX_STRING bytes) file-name buffers.
// NOTE(review): presumably the training-corpus path and the output path -
// confirm where they are filled in.
char train_file[MAX_STRING], output_file[MAX_STRING];
// NOTE(review): presumably the paths used to save / load the vocabulary -
// confirm where they are filled in.
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
//@brief Array of vocab_word records, one per word of the input file; this is
// the "vocabulary" of the paper.
struct vocab_word *vocab;
// Run-time options and their default values.
// NOTE(review): the exact meaning of each option is defined by code outside
// this view - confirm before relying on the names alone.
int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1;
//@brief Lookup array relating the hash code of a word's text to the word's
// position in the vocab array.
// The hash code of the word's text is used as the index into this array.
int *vocab_hash;
// vocab_size is the number of entries in vocab (it bounds the loops in
// InitUnigramTable).
// NOTE(review): vocab_max_size is presumably the allocated capacity of vocab
// and layer1_size the length of one feature vector - confirm.
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
// NOTE(review): counters / settings whose use lies outside this view - confirm.
long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;
// NOTE(review): presumably the learning rate, its initial value, and the
// sub-sampling threshold - confirm at the point of use.
real alpha = 0.025, starting_alpha, sample = 0;
//@brief Model arrays (translated from the original annotation):
// syn0     - matrix of input feature vectors of the words; for cbow the input
//            feature vectors of the context words are summed.
// syn1     - effectively the matrix W of the "Wx" term in the paper, where x
//            is the input feature vector taken from syn0.
// syn1neg  - same role as syn1, used for negative sampling.
// expTable - table of exp(x) values for the logistic function. It is computed
//            in advance so that, when evaluating
//            P(v | W_{t-1}, ..., W_{t-n+1}) = p(x) = exp(x) / (1 + exp(x))
//            for a word, the value is looked up instead of recomputed, i.e.
//            p(x) = exp(Wx) / (1 + exp(Wx))
//                 = exp(syn0*syn1) / (1 + exp(syn0*syn1)).
real *syn0, *syn1, *syn1neg, *expTable;
// NOTE(review): presumably the clock() value taken when training starts - confirm.
clock_t start;
// NOTE(review): presumably the hierarchical-softmax switch and the
// negative-sampling setting - confirm at the point of use.
int hs = 1, negative = 0;
// Number of int entries allocated for `table` by InitUnigramTable.
const int table_size = 1e8;
// Array of vocab indices allocated and filled by InitUnigramTable.
int *table;
//@brief Allocates and fills the global `table` with vocab indices.
// Word k receives a share of the table_size slots proportional to
// vocab[k].cn ^ 0.75, and the slots of each word are contiguous, in vocab order.
// NOTE(review): presumably the table is later indexed at random to draw
// negative samples (see syn1neg / negative) - confirm at the point of use.
//
// Reads:  vocab, vocab_size, table_size.
// Writes: table (freshly malloc'ed; a previously assigned buffer is not freed).
// Terminates the process with exit(1) if the table cannot be allocated.
// If the vocabulary is empty, or every count is zero, the table is filled
// with 0 instead of dividing by zero.
void InitUnigramTable() {
  int a, i;
  const double power = 0.75;
  // Accumulate in double: an integer accumulator would truncate every term,
  // and a float running threshold stops growing once the per-word increments
  // fall below float precision near 1.0.
  double train_words_pow = 0;
  double d1;

  table = (int *)malloc(table_size * sizeof(int));
  if (table == NULL) {
    fprintf(stderr, "Memory allocation failed\n");
    exit(1);
  }

  for (a = 0; a < vocab_size; a++) train_words_pow += pow((double)vocab[a].cn, power);

  if (train_words_pow <= 0) {
    // Nothing to distribute: avoid the 0/0 below and leave a defined table.
    for (a = 0; a < table_size; a++) table[a] = 0;
    return;
  }

  i = 0;
  // d1 is the cumulative share of the table owned by words 0..i.
  d1 = pow((double)vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    // Advance to the next word once the current position passes the cumulative
    // share. The bound check comes first so vocab[] is never read at index
    // vocab_size; the last word simply keeps all remaining slots.
    if (i < vocab_size - 1 && a / (double)table_size > d1) {
      i++;
      d1 += pow((double)vocab[i].cn, power) / train_words_pow;
    }
  }
}
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
//@brief 此处纯读字面到word字符串中
//@param word 存字面
//@param fin 输入文件流
void ReadWord(char *word, FILE *fin) {
int a = 0, ch;
while (!feof(fin)) {
ch = fgetc(fin);
if (ch == 13) continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
if (a > 0) {
if (ch