"""
原始 embedding,添加 position-embedding,
multihead-attention: embedding->ff(q,k,v)-->split+concat-->mask--> attention-->scaled-dot-product-->softmax(qk)--> softmax(qk)*value-->residual,input+output-->layer-normalization
feadworad: fc+relu+fc -->residual,input+output-->layer-normalization
应用: ff组合,
residual+ln,进行layer 归一化
dot-product 后的 根据维度 scale
qkv 是 attention, qkv 相同,则是 self-attention
qk-mask,得到的是二维矩阵,是token【i】和token{0,1,2,...k}的关系映射
mask掉 pad-token, 使得最后 attention*embedding得到的 token-embedding 不受影响
softmax ,特别小的数,使得不受pad部分影响
"""
def load_vocab(vocab_fpath):
'''Loads vocabulary file and returns idx<->token maps
vocab_fpath: string. vocabulary file path.
Note that these are reserved
0: <pad>, 1: <unk>, 2: <s>, 3: </s>
Returns
two dictionaries.
'''
    # One token per line (optionally followed by a count); keep only the token.
    with open(vocab_fpath, 'r') as f:
        vocab = [line.split()[0] for line in f.read().splitlines()]
token2idx = {token: idx for idx, token in enumerate(vocab)}
idx2token = {idx: token for idx, token in enumerate(vocab)}
return token2idx, idx2token
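
# Minimal, self-contained usage sketch for load_vocab. The vocabulary layout below
# (one token per line, the four reserved tokens first, optional counts after a
# space) is an assumption based on the docstring; the temp-file path is hypothetical
# and created on the fly.
def _load_vocab_demo():
    import os
    import tempfile
    lines = ["<pad>", "<unk>", "<s>", "</s>", "hello 120", "world 87"]
    with tempfile.NamedTemporaryFile("w", suffix=".vocab", delete=False) as f:
        f.write("\n".join(lines))
        fpath = f.name
    try:
        token2idx, idx2token = load_vocab(fpath)
        assert token2idx["<pad>"] == 0 and idx2token[4] == "hello"
    finally:
        os.remove(fpath)
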
import tensorflow as tf
import numpy as np
def get_token_embeddings(vocab_size, num_units, zero_pad=True):
'''Constr