from pathlib import Path
用Path(...)构造的路径对象(Windows下是WindowsPath,其他系统下是PosixPath)可以直接用/运算符拼接路径字符串
eg:main_path = Path(args.main_path)
model_path = main_path / args.model_path
定义了一种继承了torchtext的类型DataField
从而有对象TRG和SRC
并定义类型GraphField,有对象GRA
再有Dataset类对象train_data,path='train.bpe'
构造一个元组列表
newfields = [('src', SRC), ('trg', TRG), ('extra_0', GRA)]
这三个字段各对应一个文件:en是英文文本,de是德语文本,graph是对齐信息,每行形如“1000092795.jpg 0-0 1-0 2-0 3-0 4-0 8-1 9-1”(图像文件名,后跟若干“词位置-视觉对象编号”对)
然后分别取这样的对应的每一行,用Example.fromlist整理
从而在数据集中有29000个example对象
然后
alldata = train_data.examples
train_data.examples = list(filter(lambda ex: len(ex.src) <= args.max_len and
len(ex.trg) <= args.max_len, alldata))
再有Dataset类对象dev_data,path='val.bpe',同样有三元组
接着SRC.build_vocab,TRG.build_vocab做了一下词库并保存
有data.BucketIterator对象train_real(又叫train_iter),用来迭代train_data
data.Iterator对象dev_real(又叫dev),用来迭代dev_data
主程序
输入:
args
alpha=0.6
batch_size=2000
beam_size=1
boxfeat=('bpe_data/train.res.pkl', 'bpe_data/val.res.pkl')
boxprobs='bpe_data/boxporbs.pkl'
corpus_prex='bpe_data/train.bpe'
d_hidden=256
d_model=128
dec_dp=0.5
decoding_path='decoding'
delay=1
drop_ratio=0.1
enc_dp=0.5
eval_every=2000
grad_clip=-1.0
img_dp=0.5
initnn='standard'
input_drop_ratio=0.5
lang=('en', 'de', 'graph')
length_ratio=2
load_from=None
load_vocab=False
lr=1.0
lrdecay=False
main_path='./'
max_len=100
maximum_steps=80000
mode='train'
model=''
model_path='models'
n_enclayers=3
n_heads=4
n_layers=4
objdim=2048
optimizer='Noam'
params='user'
patience=0
pool=100
ref='bpe_data/val.lc.norm.tok.de'
resume=False
save_every=5000
seed=1234
share_embed=False
share_vocab=False
smoothing=0.1
src_vocab=8506
test=None
trg_vocab=9392
valid='bpe_data/val.bpe'
vocab='bpe_data/'
vocab_size=40000
warmup=4000
writetrans='decoding/modelname.devtrans'
train_iter torchtext的一种iterator
dev torchtext的一种iterator
src NormalField对象
tgt NormalField对象
checkpoint None
best_bleu = 0.0
best_iter = 0
offset = 0
srcpadid = 1
tgtpadid = 1
设置标签平滑
smoothing_value = smoothing / (args.trg_vocab - 2)
# 1.0649627263045795e-05
one_hot = torch.full((args.trg_vocab,), smoothing_value)
one_hot[tgtpadid] = 0
one_hot = one_hot.unsqueeze(0).cuda()
读取train.res.pkl有allboxfeats,是个29000长的字典,key是图片文件名。value是[**, 2048]的numpy数组(从后面reshape(-1, 5, objdim)的用法看,第一维是phrase_num*5,即每个短语/视觉节点对应5个候选bounding box;2048是每个候选框的特征维度)
读取val.res.pkl有valboxfeats,是个1014长的字典,结构一样,这里面的图像独立于allboxfeats
读取boxporbs.pkl有boxprobs,是个32014长的字典,不过这个的value是个list,长度为phrase_num*5(例如4个短语对应长20);元素是0.0x量级的小数,从后面img_boxprobs >= thre的用法看,应该是各候选框的概率/置信度,用来按阈值thre筛选候选框(待确认)
topk = 5
thre = 0.0
objdim = args.objdim # 2048
Adam (
Parameter Group 0
amsgrad: False
betas: (0.9, 0.98)
eps: 1e-09
lr: 0.001
weight_decay: 0
)
遍历train_iter取每个batch(看设置,在循环内部iters加上offset,当前offset是0倒没啥影响,另外这个iter的遍历是一直持续到maxsteps)
此时在迭代器规则下构造minibatch,现有设置的batchsize*100,即200000,data有29000个没问题
将数据分成大小为100*batch_size的块,使用sort_key对每个块中的示例进行排序,然后对这些示例进行批处理并对批处理进行shuffle。
batch_size_fn=lambda new, count, sofar: count
(三个参数依次是:新加入的样本new、当前batch内的样本数count、当前的有效batch大小sofar;返回值是加入该样本后的有效batch大小,这里直接返回样本数。总之下文的136就是minibatch后得到的实际batch大小,即句子数)
通过torchtext中的规则迭代到GRA.process这个函数,此时的输入是136个类似'1910013808.jpg 1-0 4-1 5-1 9-2 12-3'
这样的东西组成的列表(前面是图像名,后面应该是节点对)
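,(关于136和batchsize=2000的关系:136×15=2040≈2000,推测实际训练时传入的batch_size_fn是按token数(含padding)计数的,即batch_size=2000指每个batch的token数而不是句子数——待对照代码确认)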
取出文件名后剩下的节点对整理成[**,2]形式的二维list(其中**是节点对数)
最终返回3个list:
batch_imgs:存储这些所有图像名
batch_alighs:存储所有图像的所有节点对,三层list
region_num:存储每张图像的视觉对象(region)个数,取值为batch_alighs中该图像最后一对节点的后一个元素(对象编号,从0开始)再加1
例如:[[0, 0], [1, 0], [2, 0], [3, 0], [5, 1], [6, 1], [7, 1], [11, 2], [12, 2]]对应的是3
最后从数据集中能取出来的东西train_batch如下
[torchtext.data.batch.Batch of size 136]
[.src]:[torch.cuda.LongTensor of size 136x15 (GPU 0)]
#[136, 15]
[.trg]:[torch.cuda.LongTensor of size 136x13 (GPU 0)]
#[136, 13]
[.extra_0]:("['1910013808.jpg', '6216794312.jpg', ..., '275173204.jpg']", '[[[1, 0], [4, 1], ..., [10, 3]]]', '[4, 4, ..., 4]')
#136个图像名,136个若干不等量组节点对(应该是对应136个图),136个大概2~5的数字(也就是上面的region_num)
prepare_sources(train_batch.src, srcpadid, args.share_vocab)
输入:
train_batch.src data [136, 15]
srcpadid padid 1
args.share_vocab share_vocab False # 默认是False
如果share_vocab为True,就把data的第一列去掉
通过比较data和padid得到的bool张量构造得到masks[136, 1, 15]
输出:
sources data [136, 15]
source_masks masks
prepare_targets(train_batch.trg, tgtpadid)
输入:
train_batch.trg [136, 13]
tgtpadid
去掉data最后一列得到tgt_input[136,12]
对tgt_input和tgtpadid做类似masks的做法得到tgt_input_mask[136,1,12] # 不过这次进一步处理成了01张量
make_subsequent_mask(tgt_input_mask.size(2))#12
构造个shape(12,12)的全1矩阵再转下三角并unsqueeze成[1, 12, 12]subsequent_mask
tgt_input_mask = tgt_input_mask & subsequent_mask
[136, 12, 12]
去掉data第一列得到tgt_output[136, 12]
n_tokens = (tgt_output != tgtpadid).detach().sum()
输出:
target_inputs tgt_input
target_outputs tgt_output
target_ipmasks tgt_input_mask
n_tokens =1590
imgs, aligns, regions_num = train_batch.extra_0
imgs 136长的list,内容是图像名
aligns [136,**]的list,内容0~10
regions_num 136长的list
# B Tobj
obj_feat = sources.new_zeros(sources.size(0), max(regions_num), topk, objdim).float()
# [136, 5, 5, 2048]
# B 1 Tobj*topk
obj_mask = source_masks.new_zeros(sources.size(0), max(regions_num)*topk)
# [136, 25]
# B Tx Tobj*topk
matrix = sources.new_zeros(sources.size(0), sources.size(1), max(regions_num)*topk).float()
# [136, 15, 25]
遍历imgs,对每个图像img#当前图像的索引ib
# phrase_num, 5, 2048 (numpy)
boxfeat = torch.tensor(allboxfeats[img]).reshape(-1, 5, objdim)
# [4, 5, 2048]
# phrase_num * 5
img_boxprobs = torch.tensor(boxprobs[img])
# 长20的tensor
ge_thre = (img_boxprobs >= thre).byte()
# keep top 1
ge_thre[list(range(0, ge_thre.size(0), 5))] = 1
obj_mask[ib, :ge_thre.size(0)] = ge_thre
obj_feat[ib, :boxfeat.size(0)] = boxfeat[:, :topk]
for item in aligns[ib]:#遍历当前ib图像的所有节点对,每个item是[词位置, 对象编号]的二元list
## item: text_word_id, object_id
objixs = sources.new_tensor([n+item[1] * topk for n in range(topk)])
matrix[ib, item[0], objixs] = ge_thre[objixs].float().cuda()
# batch_size, objectnum, objdim
obj_feat = obj_feat.view(sources.size(0), -1, objdim)
#[136, 25, 2048]
obj_mask = obj_mask.unsqueeze(1)
#[136, 1, 25]
输入:
src sources [136, 15]
tgt target_inputs [136, 12]
src_mask source_masks [136,1,15]
tgt_mask target_ipmasks [136,12,12]
obj_feat [136,25,2048]
None
obj_mask [136,1,25]
matrix [136,15,25]
EncoderDecoder(
(encoder): GATEncoder(
输入:
src [136, 15]
src_mask[136,1,15]
obj_feat [136,25,2048]
None
obj_mask [136,1,25]
matrix [136,15,25]
src先如下处理
Sequential(
(0): Embeddings(
即m=nn.Embedding(8506, 128),且让这个嵌入变换的参数初始化为均值为0,方差为1/128
tmp=m(src)*sqrt(128)
)
(1): PositionalEncoding(
构造一个pe#[1, 5000, 128]
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
# compute once in log space
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float)
* -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
tmp=tmp+self.pe[:, :x.size(1)]
(dropout): Dropout(p=0.5, inplace=False)
x=dropout(tmp)#[136, 15, 128]
)
)
(trans_obj): Sequential(输入:obj_feats
(0): Linear(in_features=2048, out_features=128, bias=True)
(1): ReLU()
(2): Dropout(p=0.5, inplace=False)
(3): Linear(in_features=128, out_features=128, bias=True)
(4): ReLU()
(5): Dropout(p=0.5, inplace=False)
输出o#[136,25,128]
)
matrix先放缩后转置得到matrix4obj[136, 25, 15, 1]
batch=136, objn=25, xn=15
for i in range(2):
(mhatt_x): ModuleList(输入:3个x(作为q,k,v)和mask
(i): MultiHeadedAttention(#其实每层都是这个样子
Linear(in_features=128, out_features=128, bias=True)
q,k,v都经过这个线性变换后再view再转置得到q,k,v#[136, 4, 15, 32]
从而经过多头注意力公式得到x#[136, 4, 15, 32]
并转置并view成#[136, 15, 128]
最后再经过一个上面的线性变换输出x_trans
)
)
(res4mes_x): ModuleList(输入:mhatt_x的输入x和x_trans
(i): SublayerConnectionv2(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
newx=self.norm(x + self.dropout(x_trans))
)
)
(mhatt_o): ModuleList(输入:o[136, 25, 128]
#类似的处理
(i): MultiHeadedAttention(
Linear(in_features=128, out_features=128, bias=True)
)
)
(res4mes_o): ModuleList(
(i): SublayerConnectionv2(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
)
)输出newo[136, 25, 128]
然后unsqueeze出newx_ep[136, 15, 25, 128],o_ep[136, 15, 25, 128],x_ep[136, 25, 15, 128],newo_ep[136, 25, 15, 128]
拼接后是[136, 15, 25, 256]输入下面的mhatt_x2o后sigmoid
(mhatt_x2o): ModuleList(
(i): Linear(in_features=256, out_features=128, bias=True)
)输出x2o_gates[136, 15, 25, 128]
x2o = (x2o_gates * matrix * o_ep).sum(2)#[136, 15, 128]
(mhatt_o2x): ModuleList(
(i): Linear(in_features=256, out_features=128, bias=True)
)
o2x = (o2x_gates * matrix4obj * x_ep).sum(2)#[136, 25, 128]
(xgate): ModuleList(输入newx, x2o
(i): SublayerConnectionv2(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
)
)输出newx[136, 15, 128]
(ogate): ModuleList(输入newo, o2x
(i): SublayerConnectionv2(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
)
)输出newo[136, 25, 128]
(ffn_x): ModuleList(
(i): PositionwiseFeedForward(
(w_1): Linear(in_features=128, out_features=256, bias=True)
(w_2): Linear(in_features=256, out_features=128, bias=True)
tmp=w_2(relu(w_1(newx)))
)
)
(res4ffn_x): ModuleList(
(i): SublayerConnectionv2(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
x=norm(newx + dropout(tmp))
)
)
(ffn_o): ModuleList(
(i): PositionwiseFeedForward(
(w_1): Linear(in_features=128, out_features=256, bias=True)
(w_2): Linear(in_features=256, out_features=128, bias=True)
)
)
(res4ffn_o): ModuleList(
(i): SublayerConnectionv2(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
)
)输出hx, ho
)
(decoder): Decoder(
输入
hx
src_mask
tgt
ho
obj_mask
tgt_mask
tgt先如下处理
Sequential(
(0): Embeddings(
即m=nn.Embedding(9392, 128),且让这个嵌入变换的参数初始化为均值为0,方差为1/128
tmp=m(tgt)*sqrt(128)
)
(1): PositionalEncoding(
构造一个pe#[1, 5000, 128]
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
# compute once in log space
div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float)
* -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
tmp=tmp+self.pe[:, :x.size(1)]
(dropout): Dropout(p=0.5, inplace=False)
y=dropout(tmp)#[136, 12, 128]
)
)
(layers): ModuleList(输入y, hx, src_mask, tgt_mask
for i in range(4)
(i): DecoderLayer(
(self_attn): MultiHeadedAttention(输入三个y,和tgt_mask
(linears): ModuleList(
(0): Linear(in_features=128, out_features=128, bias=True)
(1): Linear(in_features=128, out_features=128, bias=True)
(2): Linear(in_features=128, out_features=128, bias=True)
(3): Linear(in_features=128, out_features=128, bias=True)
)
)#输出tmp[136, 12, 128]
(sublayer): ModuleList(
(0): SublayerConnection(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
)
y=norm(y + dropout(tmp))#[136, 12, 128]
)
(src_attn): MultiHeadedAttention(输入y, hx, hx, src_mask
(linears): ModuleList(
(0): Linear(in_features=128, out_features=128, bias=True)
(1): Linear(in_features=128, out_features=128, bias=True)
(2): Linear(in_features=128, out_features=128, bias=True)
(3): Linear(in_features=128, out_features=128, bias=True)
)
)#输出tmp[136, 12, 128]
(sublayer): ModuleList(
(1): SublayerConnection(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
)
y=norm(y + dropout(tmp))#[136, 12, 128]
)
(ffn): PositionwiseFeedForward(
(w_1): Linear(in_features=128, out_features=256, bias=True)
(w_2): Linear(in_features=256, out_features=128, bias=True)
tmp=w_2(relu(w_1(y)))
)
(sublayer): ModuleList(
(2): SublayerConnection(
(norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.5, inplace=False)
)
y=norm(y + dropout(tmp))#[136, 12, 128]
)
)
)
)
(generator): Generator(输入y
(proj): Linear(in_features=128, out_features=9392, bias=False)
log_softmax
)#输出[136, 12, 9392]
)
把前面
smoothing_value = smoothing / (args.trg_vocab - 2)
one_hot= torch.full((args.trg_vocab,), smoothing_value)
one_hot[tgtpadid] = 0
这样构造得到的one_hot[1, 9392]repeat成[136, 12, 9392]再通过.scatter_和masked_fill_得到truth_p
将其与outputs做KLDivLoss()损失