// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  // min1i/min2i hold the indices of the two smallest unmerged nodes; point[] temporarily
  // stores the indices of one word's ancestors (its path up to the root)
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH]; // code[] holds the 0/1 bit sequence of one word
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // initialize count[]: the left half holds each word's occurrence count; the right half
  // (reserved for future internal nodes) is initialized to "infinity"
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
  // Because the vocabulary is sorted by decreasing count, pos1 can scan the left half of
  // count[] from its end toward its beginning while pos2 scans the right half from its
  // beginning; the smaller of count[pos1] and count[pos2] is always the global minimum.
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // The loop below builds the Huffman tree bottom-up, creating one internal node per
  // iteration. "a" indexes the new internal node in the right half of count[]; a full
  // binary tree with vocab_size leaves has exactly vocab_size - 1 internal nodes, so the
  // loop runs vocab_size - 1 times.
  for (a = 0; a < vocab_size - 1; a++) {
    // Find the two smallest unmerged nodes, min1i and min2i. First, min1i:
    if (pos1 >= 0) { // pos1 has not moved past the left edge of count[]
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    // Then min2i:
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    // The two children are found; their combined count becomes the new parent's count
    count[vocab_size + a] = count[min1i] + count[min2i];
    // Record the parent's position in count[]
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    // Label the branch to the second-smallest child with bit 1; binary[] was
    // zero-initialized by calloc, so the first child's bit stays 0
    binary[min2i] = 1;
  }
  // Now assign a binary code to each vocabulary word "a"
  for (a = 0; a < vocab_size; a++) {
    b = a; // "b" walks up the parent chain, starting from the word itself
    i = 0; // "i" counts the ancestors visited, i.e. the code length
    // walk bottom-up from the leaf toward the root
    while (1) {
      code[i] = binary[b];
      point[i] = b; // point[] records the path from the word itself up to (but not including) the root
      i++;
      b = parent_node[b];
      // stop at the root: with vocab_size - 1 internal nodes, the root sits at index
      // vocab_size * 2 - 2 in count[]
      if (b == vocab_size * 2 - 2) break;
    } // code[] and point[] are now filled bottom-up (leaf to root)
    vocab[a].codelen = i; // i is the code length
    // Next, reverse the temporary code[] and point[] into top-down order. point[] entries
    // are shifted by -vocab_size so they index rows of syn1 (one row per internal node);
    // the root therefore becomes vocab_size * 2 - 2 - vocab_size = vocab_size - 2.
    vocab[a].point[0] = vocab_size - 2;
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      // point[0] is the word itself (index < vocab_size), so point[0] - vocab_size is
      // negative; it lands in vocab[a].point[i] (one slot past the last code bit) and
      // serves only as a sentinel, never as a syn1 row.
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}
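
To make the two-pointer construction concrete, here is a minimal standalone sketch (not part of word2vec; the four-word vocabulary, its counts, and all names are made up) that runs the same algorithm and prints each word's Huffman code. With counts {9, 6, 4, 2} it prints the prefix-free codes 0, 11, 101, 100, so the most frequent word gets the shortest code:

#include <stdio.h>

#define TOY_VOCAB 4

int main(void) {
  long long count[TOY_VOCAB * 2 + 1];
  long long binary[TOY_VOCAB * 2 + 1] = {0};
  long long parent[TOY_VOCAB * 2 + 1] = {0};
  long long toy_cn[TOY_VOCAB] = {9, 6, 4, 2}; // sorted by decreasing count, like vocab[]
  long long a, b, i, min1i, min2i, pos1 = TOY_VOCAB - 1, pos2 = TOY_VOCAB;
  char code[64];
  for (a = 0; a < TOY_VOCAB; a++) count[a] = toy_cn[a];
  for (a = TOY_VOCAB; a < TOY_VOCAB * 2; a++) count[a] = 1e15;
  for (a = 0; a < TOY_VOCAB - 1; a++) {
    // pick the two smallest remaining nodes, exactly as CreateBinaryTree() does
    if (pos1 >= 0 && count[pos1] < count[pos2]) min1i = pos1--; else min1i = pos2++;
    if (pos1 >= 0 && count[pos1] < count[pos2]) min2i = pos1--; else min2i = pos2++;
    count[TOY_VOCAB + a] = count[min1i] + count[min2i];
    parent[min1i] = TOY_VOCAB + a;
    parent[min2i] = TOY_VOCAB + a;
    binary[min2i] = 1; // the second-smallest child gets bit 1
  }
  for (a = 0; a < TOY_VOCAB; a++) {
    // climb from the leaf to the root, collecting bits bottom-up
    for (b = a, i = 0; b != TOY_VOCAB * 2 - 2; b = parent[b]) code[i++] = (char)('0' + binary[b]);
    printf("word %lld (count %lld): ", a, toy_cn[a]);
    while (i-- > 0) putchar(code[i]); // print reversed, i.e. root-first
    putchar('\n');
  }
  return 0;
}
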
Training process:
// Train the CBOW architecture
if (cbow) {
  // in -> hidden: sum the context words' embeddings element-wise to form the
  // hidden-layer vector neu1[]. "b" is a random offset that shrinks the window,
  // so the effective window size varies per target word.
  for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
    c = sentence_position - window + a;
    if (c < 0) continue;
    if (c >= sentence_length) continue;
    last_word = sen[c]; // last_word is the vocabulary index of one context word
    if (last_word == -1) continue;
    // accumulate this context word's embedding
    for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
  }
  // Hierarchical softmax: the target word is replaced by the internal nodes (its
  // ancestors) on its Huffman path. Each ancestor's feature vector in syn1 doubles as a
  // hidden->output weight vector and is scored against the context sum neu1[].
  if (hs) for (d = 0; d < vocab[word].codelen; d++) { // for each ancestor on the path
    f = 0;
    // vocab[word].point[d] is the index of one ancestor, i.e. a row of the syn1 matrix
    l2 = vocab[word].point[d] * layer1_size; // start of that ancestor's feature vector
    // Propagate hidden -> output: dot product of the ancestor's vector syn1[l2..]
    // with the context sum neu1[]
    for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
    // skip this node if f falls outside the range of the precomputed sigmoid table
    if (f <= -MAX_EXP) continue;
    else if (f >= MAX_EXP) continue;
    else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // f = sigmoid(f)
    // 'g' is the gradient multiplied by the learning rate: the node's target label is
    // 1 - code[d], so g = (label - sigmoid(f)) * alpha
    g = (1 - vocab[word].code[d] - f) * alpha;
    // Propagate errors output -> hidden
    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
    // Learn weights hidden -> output; these weights are the ancestor's feature vector
    for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
  }
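  /* Note (not in the original source): why g has this form. For internal node d with
     target label t = 1 - vocab[word].code[d], the log-likelihood term being maximized is
         log(sigmoid(f))        if t = 1
         log(1 - sigmoid(f))    if t = 0
     and in both cases its derivative with respect to f is t - sigmoid(f). Scaling by the
     learning rate gives g = (1 - code[d] - sigmoid(f)) * alpha, which the two loops above
     apply to the ancestor's weights (syn1) and to the accumulated input error (neu1e). */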
  // NEGATIVE SAMPLING: draw negative examples
  if (negative > 0) for (d = 0; d < negative + 1; d++) { // one positive example plus "negative" sampled negatives
    if (d == 0) { // the positive example is the target word itself
      target = word;
      label = 1;
    } else {
      // advance the linear congruential generator and sample from the unigram table
      next_random = next_random * (unsigned long long)25214903917 + 11;
      target = table[(next_random >> 16) % table_size];
      // avoid index 0 (the </s> token in word2vec's vocabulary); resample uniformly
      if (target == 0) target = next_random % (vocab_size - 1) + 1;
      if (target == word) continue; // skip samples that collide with the positive word
      label = 0;
    }
    l2 = target * layer1_size; // start of target's vector in the negative-sampling weights syn1neg
    f = 0;
    for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
    // g = (label - sigmoid(f)) * alpha; outside the table's range the sigmoid saturates,
    // so it is clamped to 1 (f > MAX_EXP) or 0 (f < -MAX_EXP)
    if (f > MAX_EXP) g = (label - 1) * alpha;
    else if (f < -MAX_EXP) g = (label - 0) * alpha;
    else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; // accumulate error for the inputs
    for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; // update the output weights
  }
  // hidden -> in: propagate the accumulated error neu1e[] back to every context word
  for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
    c = sentence_position - window + a;
    if (c < 0) continue;
    if (c >= sentence_length) continue;
    last_word = sen[c];
    if (last_word == -1) continue;
    // update the embedding of each context word
    for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
  }