word2vec核心代码注释

建议对照word2vec.c看注释,标红部分为中文注释以及相应代码,added by lijiawei

//  Copyright 2013 Google Inc. All Rights Reserved.

//

//  Licensed under the Apache License, Version 2.0 (the "License");

//  you may not use this file except in compliance with the License.

//  You may obtain a copy of the License at

//

//      http://www.apache.org/licenses/LICENSE-2.0

//

//  Unless required by applicable law or agreed to in writing, software

//  distributed under the License is distributed on an "AS IS" BASIS,

//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

//  See the License for the specific language governing permissions and

//  limitations under the License.

 

// Capacity, in chars, of the fixed-size file-name buffers (train_file, output_file, ...).
#define MAX_STRING 100

// NOTE(review): presumably the number of entries in the precomputed expTable -- confirm where the table is allocated.
#define EXP_TABLE_SIZE 1000

// NOTE(review): presumably the bound on |x| covered by expTable -- confirm where the table is filled.
#define MAX_EXP 6

// NOTE(review): presumably the maximum number of words handled as one sentence -- usage is outside this excerpt.
#define MAX_SENTENCE_LENGTH 1000

// NOTE(review): presumably the maximum Huffman code length of a word (see vocab_word.code) -- usage is outside this excerpt.
#define MAX_CODE_LENGTH 40

 

const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary

 

typedef float real;                    // Precision of float numbers

 

/**
 * @brief One vocabulary entry: a distinct word found in the input file.
 *
 * Field meanings follow the original annotation of this file.
 */
struct vocab_word {
  /* Number of times the word occurs. */
  long long cn;
  /* Path from the Huffman-tree root to this word: an array holding the
   * vocabulary index of each parent node along the way. */
  int *point;
  /* Literal text of the word. */
  char *word;
  /* Huffman code of the word. */
  char *code;
  /* Length of the Huffman code. */
  char codelen;
};

 

/* File-name buffers, MAX_STRING chars each.
 * NOTE(review): roles presumably match the names -- consumers are outside this excerpt. */
char train_file[MAX_STRING];
char output_file[MAX_STRING];
char save_vocab_file[MAX_STRING];
char read_vocab_file[MAX_STRING];

/* Array of vocabulary entries, one per distinct word of the input file
 * (the "vocabulary" of the paper, per the original annotation). */
struct vocab_word *vocab;

/* Integer settings and their initial values.
 * NOTE(review): their consumers are outside this excerpt. */
int binary = 0;
int cbow = 0;
int debug_mode = 2;
int window = 5;
int min_count = 5;
int num_threads = 1;
int min_reduce = 1;

/* Hash table over the vocabulary (per the original annotation): the hash of a
 * word's text is the index into this array, and the stored value is that
 * word's position in vocab. */
int *vocab_hash;

/* vocab_size is the number of valid entries in vocab (InitUnigramTable
 * iterates over exactly that many).
 * NOTE(review): vocab_max_size is presumably the allocated capacity of vocab,
 * and layer1_size the feature-vector length -- confirm. */
long long vocab_max_size = 1000;
long long vocab_size = 0;
long long layer1_size = 100;

/* Counters. NOTE(review): maintained outside this excerpt. */
long long train_words = 0;
long long word_count_actual = 0;
long long file_size = 0;
long long classes = 0;

/* NOTE(review): presumably learning-rate and sub-sampling parameters -- confirm. */
real alpha = 0.025;
real starting_alpha;
real sample = 0;

/* Model arrays, as described by the original annotation:
 *  - syn0:     input feature vectors of the words; for CBOW the input feature
 *              vectors of the context words are summed.
 *  - syn1:     the weight matrix W of the paper, i.e. the W in W*x where x is
 *              an input feature vector taken from syn0.
 *  - syn1neg:  counterpart of syn1 used for negative sampling.
 *  - expTable: precomputed table of the logistic function
 *              p(x) = exp(x) / (1 + exp(x)), looked up instead of evaluated,
 *              with x = syn0 * syn1.
 */
real *syn0;
real *syn1;
real *syn1neg;
real *expTable;

clock_t start;

/* NOTE(review): presumably switches for hierarchical softmax and for the
 * number of negative samples -- confirm. */
int hs = 1;
int negative = 0;

/* table holds table_size vocabulary indices and is allocated and filled by
 * InitUnigramTable(). */
const int table_size = 1e8;
int *table;

 

/**
 * @brief Allocates and fills the global sampling table `table`.
 *
 * `table` receives `table_size` vocabulary indices. Word i occupies a share of
 * consecutive slots proportional to cn_i^0.75 / sum_j(cn_j^0.75), so drawing a
 * uniformly random slot yields words according to that smoothed unigram
 * distribution.
 * NOTE(review): presumably consumed by negative sampling -- the reader of
 * `table` is outside this excerpt.
 *
 * Reads the globals `vocab`, `vocab_size` and `table_size`; writes `table`.
 * Terminates the process if the table cannot be allocated. If the vocabulary
 * is empty (or every count is zero) the table is filled with index 0.
 */
void InitUnigramTable() {
  int a, i;
  /* Accumulate in double: an integer accumulator would truncate the
   * fractional part of every pow() term and skew the distribution. */
  double train_words_pow = 0;
  double d1, power = 0.75;

  table = (int *)malloc(table_size * sizeof(int));
  if (table == NULL) {
    fprintf(stderr, "Memory allocation failed\n");
    exit(1);
  }

  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);

  /* Nothing to distribute: avoid reading vocab[0] and dividing by zero. */
  if (vocab_size <= 0 || train_words_pow <= 0) {
    for (a = 0; a < table_size; a++) table[a] = 0;
    return;
  }

  i = 0;
  /* d1 is the cumulative probability mass of words 0..i. */
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    /* Move to the next word once this slot's position passes the cumulative
     * mass. The bound check comes first so vocab[] is never indexed past its
     * last valid entry; rounding leftovers stay on the last word. A double
     * ratio is used because float cannot resolve 1e8 distinct positions. */
    if (i < vocab_size - 1 && a / (double)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
  }
}

 

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries

 

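//@brief Reads only the literal characters of one word into the string `word`

//@param word buffer that receives the word's characters

//@param fin input file stream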

void ReadWord(char *word, FILE *fin) {

int a = 0, ch;

while (!feof(fin)) {

ch = fgetc(fin);

if (ch == 13) continue;

if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {

if (a > 0) {

if (ch

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值