word2vec核心代码注释

最新推荐文章于 2022-10-17 10:58:09 发布

beck_zhou

最新推荐文章于 2022-10-17 10:58:09 发布

阅读量6.6k

点赞数 1

分类专栏：中文分词（分词/人名识别（命名实体识别）/词性标注）

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/zhoubl668/article/details/24319591

版权

建议对照word2vec.c看注释,标红部分为中文注释以及相应代码,added by lijiawei

// Copyright 2013 Google Inc. All Rights Reserved.

//

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

//

// http://www.apache.org/licenses/LICENSE-2.0

//

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

// Size in bytes of the fixed char buffers that hold file names
// (train_file, output_file, save_vocab_file, read_vocab_file).
#define MAX_STRING 100

// NOTE(review): presumably the number of entries in the precomputed expTable;
// its use is not visible in this excerpt - confirm.
#define EXP_TABLE_SIZE 1000

// NOTE(review): presumably the bound of the input range covered by expTable;
// its use is not visible in this excerpt - confirm.
#define MAX_EXP 6

// NOTE(review): presumably the maximum number of words handled as one
// sentence; its use is not visible in this excerpt - confirm.
#define MAX_SENTENCE_LENGTH 1000

// NOTE(review): presumably the maximum Huffman code length per word
// (see vocab_word.code / codelen); its use is not visible here - confirm.
#define MAX_CODE_LENGTH 40

// NOTE(review): presumably the number of slots in vocab_hash; the allocation
// is not visible in this excerpt - confirm.
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary

// Floating-point type used for the model parameters declared as `real`.
typedef float real; // Precision of float numbers

// One vocabulary entry for a word found in the input file.
//   cn      - number of times the word occurs
//   point   - per the original annotation: the path from the Huffman-tree root
//             to this word, stored as the vocabulary indices of its parent nodes
//   word    - the text of the word
//   code    - the word's Huffman code
//   codelen - length of that Huffman code
struct vocab_word {
  long long cn;
  int *point;
  char *word;
  char *code;
  char codelen;
};

// File-name buffers, MAX_STRING bytes each.
// NOTE(review): judging by the names these are the training input, the output
// file, and the vocabulary save/load paths - confirm against the callers.
char train_file[MAX_STRING];
char output_file[MAX_STRING];
char save_vocab_file[MAX_STRING];
char read_vocab_file[MAX_STRING];

// Array of vocab_word entries, one per word of the input file
// (the "vocabulary" of the paper, per the original annotation).
struct vocab_word *vocab;

// Integer settings with their default values.
int binary = 0;
int cbow = 0;
int debug_mode = 2;
int window = 5;
int min_count = 5;
int num_threads = 1;
int min_reduce = 1;

// Hash table over the vocabulary: per the original annotation, the hash of a
// word's text is the index into this array and the stored value is the word's
// position in vocab.
int *vocab_hash;

// Vocabulary capacity / current size, and the layer-1 size.
long long vocab_max_size = 1000;
long long vocab_size = 0;
long long layer1_size = 100;

// Counters and sizes; all start at zero.
long long train_words = 0;
long long word_count_actual = 0;
long long file_size = 0;
long long classes = 0;

// Real-valued settings.
real alpha = 0.025;
real starting_alpha;
real sample = 0;

// Model arrays (descriptions translated from the original annotation):
//   syn0     - input feature vectors of the words; for CBOW the vectors of
//              the context words are summed
//   syn1     - the W matrix of "Wx" in the paper, where x is the input feature
//              vector taken from syn0
//   syn1neg  - same role as syn1, used for negative sampling
//   expTable - precomputed table for the logistic function
//              p(x) = exp(x) / (1 + exp(x)), looked up instead of recomputed
//              when evaluating P(v | w_{t-1}, ..., w_{t-n+1}), i.e.
//              exp(syn0*syn1) / (1 + exp(syn0*syn1))
real *syn0;
real *syn1;
real *syn1neg;
real *expTable;

clock_t start;

int hs = 1;
int negative = 0;

// Number of entries in `table` (allocated by InitUnigramTable).
const int table_size = 1e8;
int *table;

// Allocates the global `table` (table_size ints) and fills it with vocabulary
// indices so that word i occupies a share of the table proportional to
// vocab[i].cn ^ 0.75. Indices appear in increasing order.
// NOTE(review): presumably this table is what negative sampling draws from;
// the consumer is not visible in this excerpt - confirm.
//
// Reads:  vocab, vocab_size, table_size.
// Writes: table (newly allocated; the caller owns it).
// Exits the process with status 1 if the allocation fails.
void InitUnigramTable() {
  int a, i;
  // Accumulate in floating point: an integer accumulator would truncate the
  // fractional part of every pow() term that is added.
  double train_words_pow = 0;
  double d1, power = 0.75;

  table = malloc((size_t)table_size * sizeof *table);
  if (table == NULL) {
    fprintf(stderr, "Memory allocation failed\n");
    exit(1);
  }

  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);

  // Empty vocabulary or zero total mass: there is no distribution to build
  // and the division below would be by zero. Leave a zero-filled table.
  if (vocab_size <= 0 || train_words_pow <= 0) {
    for (a = 0; a < table_size; a++) table[a] = 0;
    return;
  }

  i = 0;
  // d1 is the cumulative share of words 0..i.
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    // Advance to the next word once this slot's position passes the cumulative
    // share. Never step past the last word: doing so would read
    // vocab[vocab_size], one element beyond the vocabulary.
    if (i < vocab_size - 1 && a / (double)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
  }
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries

//@brief 此处纯读字面到word字符串中

//@param word 存字面

//@param fin 输入文件流

void ReadWord(char *word, FILE *fin) {

int a = 0, ch;

while (!feof(fin)) {

ch = fgetc(fin);

if (ch == 13) continue;

if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {

if (a > 0) {

if (ch

最低0.47元/天解锁文章

关注

1
点赞
踩
6

收藏

觉得还不错? 一键收藏
5
评论
word2vec核心代码注释

建议对照word2vec.c看注释,标红部分为中文注释以及相应代码,added by lijiawei// Copyright 2013 Google Inc. All Rights Reserved.//// Licensed under the Apache License, Version 2.0 (the "License");// you may
复制链接

扫一扫

专栏目录

评论 5

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。