// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  // min1i/min2i hold the indices of the two smallest unmerged nodes; point[] temporarily
  // stores the indices of one word's ancestors (its path up to the root)
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH]; // code[] holds the 0/1 bit sequence of one word
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // initialize count[]: the left half holds each word's occurrence count; the right half
  // (reserved for future internal nodes) is initialized to "infinity"
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
  // Because the vocabulary is sorted by decreasing count, pos1 can scan the left half of
  // count[] from its end toward its beginning while pos2 scans the right half from its
  // beginning; the smaller of count[pos1] and count[pos2] is always the global minimum.
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // The loop below builds the Huffman tree bottom-up, creating one internal node per
  // iteration. "a" indexes the new internal node in the right half of count[]; a full
  // binary tree with vocab_size leaves has exactly vocab_size - 1 internal nodes, so the
  // loop runs vocab_size - 1 times.
  for (a = 0; a < vocab_size - 1; a++) {
    // Find the two smallest unmerged nodes, min1i and min2i. First, min1i:
    if (pos1 >= 0) { // pos1 has not moved past the left edge of count[]
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    // Then min2i:
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    // The two children are found; their combined count becomes the new parent's count
    count[vocab_size + a] = count[min1i] + count[min2i];
    // Record the parent's position in count[]
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    // Label the branch to the second-smallest child with bit 1; binary[] was
    // zero-initialized by calloc, so the first child's bit stays 0
    binary[min2i] = 1;
  }
  // Now assign a binary code to each vocabulary word "a"
  for (a = 0; a < vocab_size; a++) {
    b = a; // "b" walks up the parent chain, starting from the word itself
    i = 0; // "i" counts the ancestors visited, i.e. the code length
    // walk bottom-up from the leaf toward the root
    while (1) {
      code[i] = binary[b];
      point[i] = b; // point[] records the path from the word itself up to (but not including) the root
      i++;
      b = parent_node[b];
      // stop at the root: with vocab_size - 1 internal nodes, the root sits at index
      // vocab_size * 2 - 2 in count[]
      if (b == vocab_size * 2 - 2) break;
    } // code[] and point[] are now filled bottom-up (leaf to root)
    vocab[a].codelen = i; // i is the code length
    // Next, reverse the temporary code[] and point[] into top-down order. point[] entries
    // are shifted by -vocab_size so they index rows of syn1 (one row per internal node);
    // the root therefore becomes vocab_size * 2 - 2 - vocab_size = vocab_size - 2.
    vocab[a].point[0] = vocab_size - 2;
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      // point[0] is the word itself (index < vocab_size), so point[0] - vocab_size is
      // negative; it lands in vocab[a].point[i] (one slot past the last code bit) and
      // serves only as a sentinel, never as a syn1 row.
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}
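
To make the two-pointer construction concrete, here is a minimal standalone sketch (not part of word2vec; the four-word vocabulary, its counts, and all names are made up) that runs the same algorithm and prints each word's Huffman code. With counts {9, 6, 4, 2} it prints the prefix-free codes 0, 11, 101, 100, so the most frequent word gets the shortest code:

#include <stdio.h>

#define TOY_VOCAB 4

int main(void) {
  long long count[TOY_VOCAB * 2 + 1];
  long long binary[TOY_VOCAB * 2 + 1] = {0};
  long long parent[TOY_VOCAB * 2 + 1] = {0};
  long long toy_cn[TOY_VOCAB] = {9, 6, 4, 2}; // sorted by decreasing count, like vocab[]
  long long a, b, i, min1i, min2i, pos1 = TOY_VOCAB - 1, pos2 = TOY_VOCAB;
  char code[64];
  for (a = 0; a < TOY_VOCAB; a++) count[a] = toy_cn[a];
  for (a = TOY_VOCAB; a < TOY_VOCAB * 2; a++) count[a] = 1e15;
  for (a = 0; a < TOY_VOCAB - 1; a++) {
    // pick the two smallest remaining nodes, exactly as CreateBinaryTree() does
    if (pos1 >= 0 && count[pos1] < count[pos2]) min1i = pos1--; else min1i = pos2++;
    if (pos1 >= 0 && count[pos1] < count[pos2]) min2i = pos1--; else min2i = pos2++;
    count[TOY_VOCAB + a] = count[min1i] + count[min2i];
    parent[min1i] = TOY_VOCAB + a;
    parent[min2i] = TOY_VOCAB + a;
    binary[min2i] = 1; // the second-smallest child gets bit 1
  }
  for (a = 0; a < TOY_VOCAB; a++) {
    // climb from the leaf to the root, collecting bits bottom-up
    for (b = a, i = 0; b != TOY_VOCAB * 2 - 2; b = parent[b]) code[i++] = (char)('0' + binary[b]);
    printf("word %lld (count %lld): ", a, toy_cn[a]);
    while (i-- > 0) putchar(code[i]); // print reversed, i.e. root-first
    putchar('\n');
  }
  return 0;
}
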
Training process:
// Train the CBOW architecture
if (cbow) {
  // in -> hidden: sum the context words' embeddings element-wise to form the
  // hidden-layer vector neu1[]. "b" is a random offset that shrinks the window,
  // so the effective window size varies per target word.
  for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
    c = sentence_position - window + a;
    if (c < 0) continue;
    if (c >= sentence_length) continue;
    last_word = sen[c]; // last_word is the vocabulary index of one context word
    if (last_word == -1) continue;
    // accumulate this context word's embedding
    for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
  }
  // Hierarchical softmax: the target word is replaced by the internal nodes (its
  // ancestors) on its Huffman path. Each ancestor's feature vector in syn1 doubles as a
  // hidden->output weight vector and is scored against the context sum neu1[].
  if (hs) for (d = 0; d < vocab[word].codelen; d++) { // for each ancestor on the path
    f = 0;
    // vocab[word].point[d] is the index of one ancestor, i.e. a row of the syn1 matrix
    l2 = vocab[word].point[d] * layer1_size; // start of that ancestor's feature vector
    // Propagate hidden -> output: dot product of the ancestor's vector syn1[l2..]
    // with the context sum neu1[]
    for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
    // skip this node if f falls outside the range of the precomputed sigmoid table
    if (f <= -MAX_EXP) continue;
    else if (f >= MAX_EXP) continue;
    else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // f = sigmoid(f)
    // 'g' is the gradient multiplied by the learning rate: the node's target label is
    // 1 - code[d], so g = (label - sigmoid(f)) * alpha
    g = (1 - vocab[word].code[d] - f) * alpha;
    // Propagate errors output -> hidden
    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
    // Learn weights hidden -> output; these weights are the ancestor's feature vector
    for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
  }
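  /* Note (not in the original source): why g has this form. For internal node d with
     target label t = 1 - vocab[word].code[d], the log-likelihood term being maximized is
         log(sigmoid(f))        if t = 1
         log(1 - sigmoid(f))    if t = 0
     and in both cases its derivative with respect to f is t - sigmoid(f). Scaling by the
     learning rate gives g = (1 - code[d] - sigmoid(f)) * alpha, which the two loops above
     apply to the ancestor's weights (syn1) and to the accumulated input error (neu1e). */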
  // NEGATIVE SAMPLING: draw negative examples
  if (negative > 0) for (d = 0; d < negative + 1; d++) { // one positive example plus "negative" sampled negatives
    if (d == 0) { // the positive example is the target word itself
      target = word;
      label = 1;
    } else {
      // advance the linear congruential generator and sample from the unigram table
      next_random = next_random * (unsigned long long)25214903917 + 11;
      target = table[(next_random >> 16) % table_size];
      // avoid index 0 (the </s> token in word2vec's vocabulary); resample uniformly
      if (target == 0) target = next_random % (vocab_size - 1) + 1;
      if (target == word) continue; // skip samples that collide with the positive word
      label = 0;
    }
    l2 = target * layer1_size; // start of target's vector in the negative-sampling weights syn1neg
    f = 0;
    for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
    // g = (label - sigmoid(f)) * alpha; outside the table's range the sigmoid saturates,
    // so it is clamped to 1 (f > MAX_EXP) or 0 (f < -MAX_EXP)
    if (f > MAX_EXP) g = (label - 1) * alpha;
    else if (f < -MAX_EXP) g = (label - 0) * alpha;
    else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; // accumulate error for the inputs
    for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; // update the output weights
  }
  // hidden -> in: propagate the accumulated error neu1e[] back to every context word
  for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
    c = sentence_position - window + a;
    if (c < 0) continue;
    if (c >= sentence_length) continue;
    last_word = sen[c];
    if (last_word == -1) continue;
    // update the embedding of each context word
    for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
  }