import json
def compare_image_ids(json_data1, json_data2):
    """Return the image ids that occur in exactly one of the two datasets.

    Each argument is handed to ``extract_image_ids``; the ids it returns are
    de-duplicated per dataset and the symmetric difference of the two
    resulting sets is returned (an empty set means both datasets reference
    the same ids).
    """
    first_ids = set(extract_image_ids(json_data1))
    second_ids = set(extract_image_ids(json_data2))
    return first_ids ^ second_ids
def extract_image_ids(json_data):
    """Collect every image id referenced by a COCO-style dataset dict.

    Ids are read from ``json_data["images"][*]["id"]`` first and then from
    ``json_data["annotations"][*]["image_id"]``; either section may be absent.
    Every id is converted with ``int`` and duplicates are kept, in encounter
    order.

    Entries whose id is missing, ``None`` or an empty string are skipped.
    The id ``0`` is a valid id and is kept (a plain truthiness test would
    silently drop it).

    :param json_data: mapping with optional "images" / "annotations" lists.
    :return: list of integer image ids.
    :raises ValueError: if an id cannot be converted to ``int``.
    """
    image_ids = []
    for section, key in (("images", "id"), ("annotations", "image_id")):
        for entry in json_data.get(section, []):
            value = entry.get(key)
            # Only genuinely absent ids are skipped; 0 must survive.
            if value is None or value == "":
                continue
            image_ids.append(int(value))
    return image_ids
if __name__ == "__main__":
    # Replace these paths with the actual paths to your JSON files
    json_file_path1 = "path/to/your/first/file.json"
    json_file_path2 = "path/to/your/second/file.json"

    # Parse both annotation files.
    with open(json_file_path1, "r") as first_handle:
        json_data1 = json.load(first_handle)
    with open(json_file_path2, "r") as second_handle:
        json_data2 = json.load(second_handle)

    # Report the ids that appear in only one of the two files.
    different_image_ids = compare_image_ids(json_data1, json_data2)
    if not different_image_ids:
        print("Image_ids are the same in both JSON files.")
    else:
        print("Different image_ids:")
        for mismatched_id in different_image_ids:
            print(mismatched_id)
model = build_model(opt)
# Load the checkpoint onto the CPU so no GPU is required at load time.
pretrained_dict = torch.load(pretrained_checkpoint_path, map_location='cpu')
model_dict = model.state_dict()  # current parameters/buffers of the model
# Keep only checkpoint entries whose key exists in the model and does not
# contain the substring 'det'.
pretrained_dict = {
    key: value
    for key, value in pretrained_dict.items()
    if key in model_dict and 'det' not in key
}
model_dict.update(pretrained_dict)
# Load just the filtered subset with strict=False. The resulting weights are
# the same as loading the merged dict, but `missing` now lists the model keys
# the checkpoint did not provide. Loading the merged full state dict with
# strict=True always reported 0 missing / 0 unexpected, making the log useless.
missing, unexpected = model.load_state_dict(pretrained_dict, strict=False)
print(f"Loading weights for detector: missing: {len(missing)}, unexpected: {len(unexpected)}.")
import os
import shutil
import numpy as np
import json
# Paths
existing_images_dir = 'path_to_existing_images_directory'
new_images_dir = 'path_to_new_images_directory'
output_images_dir = 'path_to_output_images_directory'
existing_val_ids = np.load('val_ids.npy')

# Load existing data
with open('val.json', 'r') as json_file:
    existing_val_data = json.load(json_file)

# shutil.copy does not create the destination directory.
os.makedirs(output_images_dir, exist_ok=True)

# Ids for the new images continue from the number of images already present.
# The base is captured once: the images list grows inside the loop, so adding
# len(new_val_ids) to its *current* length would advance the id by 2 per image.
first_new_id = len(existing_val_data["images"])

# Add new data
new_val_ids = []
for i in range(1151, 1251):
    # Load caption from file
    caption_file_path = f'{i}.txt'
    with open(caption_file_path, 'r') as caption_file:
        caption = caption_file.read().strip()

    # Copy new image to output directory
    new_image_path = os.path.join(new_images_dir, f'{i}.jpg')
    output_image_path = os.path.join(output_images_dir, f'{i}.jpg')
    shutil.copy(new_image_path, output_image_path)

    # Add new image to data
    new_image_data = {
        "file_name": f"{i}.jpg",
        "id": first_new_id + len(new_val_ids),
        "caption": caption
    }
    existing_val_data["images"].append(new_image_data)
    new_val_ids.append(new_image_data["id"])

# Combine old and new IDs
updated_val_ids = np.concatenate((existing_val_ids, new_val_ids))

# Save updated IDs and data
np.save('val_ids.npy', updated_val_ids)
with open('val.json', 'w') as json_file:
    json.dump(existing_val_data, json_file, indent=4)
import json
import numpy as np
# Load the previous validation set (val.json).
with open('lzc_data/val.json', 'r') as f:
    val_data = json.load(f)

# Load the new test set (test_new.json) and its id array (test_new_ids.npy).
with open('lzc_data/test_new.json', 'r') as f:
    test_new_data = json.load(f)
test_new_ids = np.load("lzc_data/test_new_ids.npy")

# Append the new images and annotations to the previous set.
val_data['images'].extend(test_new_data['images'])
val_data['annotations'].extend(test_new_data['annotations'])

# Append the new ids to the previous ids. The previous ids must come from the
# stored id array, not from val_data['images'] (a list of image dicts), which
# would produce an object array mixing dicts and ids.
# NOTE(review): assumes the ids belonging to val.json are stored in
# lzc_data/val_ids.npy - confirm the file name.
previous_val_ids = np.load("lzc_data/val_ids.npy")
val_ids = np.concatenate((previous_val_ids, test_new_ids))

# Save the merged set as merged_val.json.
with open('lzc_data/merged_val.json', 'w') as f:
    json.dump(val_data, f)

# Save the merged ids as an .npy file.
np.save("lzc_data/merged_val_ids.npy", val_ids)
import json
import numpy as np
# Load the previous validation set (val.json).
with open('lzc_data/val.json', 'r') as f:
    val_data = json.load(f)

# Load the new test set (test_new.json) and its id array (test_new_ids.npy).
with open('lzc_data/test_new.json', 'r') as f:
    test_new_data = json.load(f)
test_new_ids = np.load("lzc_data/test_new_ids.npy")

# Append the new images and annotations to the previous set.
val_data['images'].extend(test_new_data['images'])
val_data['annotations'].extend(test_new_data['annotations'])

# Append the new ids to the previous ids. The previous ids must come from the
# stored id array, not from val_data['images'] (a list of image dicts), which
# would produce an object array mixing dicts and ids.
# NOTE(review): assumes the ids belonging to val.json are stored in
# lzc_data/val_ids.npy - confirm the file name.
previous_val_ids = np.load("lzc_data/val_ids.npy")
val_ids = np.concatenate((previous_val_ids, test_new_ids))

# Save the merged set as merged_val.json.
with open('lzc_data/merged_val.json', 'w') as f:
    json.dump(val_data, f)

# Save the merged ids as an .npy file.
np.save("lzc_data/merged_val_ids.npy", val_ids)
import json
import numpy as np
import random
# Load the training-set JSON file.
with open('lzc_data/train.json', 'r') as f:
    train_data = json.load(f)

# Image ids of the training set, in file order.
train_ids = [image['id'] for image in train_data['images']]

# Randomly draw 100 training images to form the new test set.
# random.sample raises ValueError if fewer than 100 images are available.
random_test_indices = random.sample(range(len(train_ids)), 100)
test_data_subset = [train_data['images'][i] for i in random_test_indices]
# Build the id set once; rebuilding a list for every annotation is quadratic.
test_image_ids = {item['id'] for item in test_data_subset}
test_annotations_subset = [
    annotation for annotation in train_data['annotations']
    if annotation['image_id'] in test_image_ids
]

# Write the new test set.
with open('lzc_data/test.json', 'w') as f:
    json.dump({
        'info': train_data['info'],
        'license': train_data['license'],
        'images': test_data_subset,
        'annotations': test_annotations_subset
    }, f)

# Save the ids of the new test images.
test_ids = [train_ids[i] for i in random_test_indices]
np.save("lzc_data/test_ids.npy", test_ids)

# Everything that was not sampled stays in the training set.
sampled_index_set = set(random_test_indices)
remaining_indices = [i for i in range(len(train_ids)) if i not in sampled_index_set]
train_data_remaining = [train_data['images'][i] for i in remaining_indices]
remaining_image_ids = {item['id'] for item in train_data_remaining}
train_annotations_remaining = [
    annotation for annotation in train_data['annotations']
    if annotation['image_id'] in remaining_image_ids
]

# Overwrite train.json with the reduced training set.
with open('lzc_data/train.json', 'w') as f:
    json.dump({
        'info': train_data['info'],
        'license': train_data['license'],
        'images': train_data_remaining,
        'annotations': train_annotations_remaining
    }, f)

# Save the ids of the remaining training images.
remaining_train_ids = [train_ids[i] for i in remaining_indices]
np.save("lzc_data/train_ids.npy", remaining_train_ids)
import json
import numpy as np
import random
# Load the training-set JSON file.
with open('lzc_data/train.json', 'r') as f:
    train_data = json.load(f)

# Image ids of the training set, in file order.
train_ids = [image['id'] for image in train_data['images']]

# Randomly draw 100 training images to form the new test set.
# random.sample raises ValueError if fewer than 100 images are available.
random_test_indices = random.sample(range(len(train_ids)), 100)
test_data_subset = [train_data['images'][i] for i in random_test_indices]
# Build the id set once; rebuilding a list for every annotation is quadratic.
test_image_ids = {item['id'] for item in test_data_subset}
test_annotations_subset = [
    annotation for annotation in train_data['annotations']
    if annotation['image_id'] in test_image_ids
]

# Write the new test set.
with open('lzc_data/test.json', 'w') as f:
    json.dump({
        'info': train_data['info'],
        'license': train_data['license'],
        'images': test_data_subset,
        'annotations': test_annotations_subset
    }, f)

# Save the ids of the new test images.
test_ids = [train_ids[i] for i in random_test_indices]
np.save("lzc_data/test_ids.npy", test_ids)

# Everything that was not sampled stays in the training set.
sampled_index_set = set(random_test_indices)
remaining_indices = [i for i in range(len(train_ids)) if i not in sampled_index_set]
train_data_remaining = [train_data['images'][i] for i in remaining_indices]
remaining_image_ids = {item['id'] for item in train_data_remaining}
train_annotations_remaining = [
    annotation for annotation in train_data['annotations']
    if annotation['image_id'] in remaining_image_ids
]

# Overwrite train.json with the reduced training set.
with open('lzc_data/train.json', 'w') as f:
    json.dump({
        'info': train_data['info'],
        'license': train_data['license'],
        'images': train_data_remaining,
        'annotations': train_annotations_remaining
    }, f)

# Save the ids of the remaining training images.
remaining_train_ids = [train_ids[i] for i in remaining_indices]
np.save("lzc_data/train_ids.npy", remaining_train_ids)
[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:6688 (errno: 98 - Address already in use).
[W socket.cpp:426] [c10d] The server socket has failed to bind to ?UNKNOWN? (errno: 98 - Address already in use).
[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
Error executing job with overrides: []
Traceback (most recent call last):
File "/mnt/all/lzc/grit/train_caption.py", line 212, in run_main
mp.spawn(main, nprocs=config.exp.ngpus_per_node, args=(config,))
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/mnt/all/lzc/grit/train_caption.py", line 29, in main
dist.init_process_group('nccl', 'env://', rank=rank, world_size=config.exp.world_size)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 754, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 246, in _env_rendezvous_handler
store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 177, in _create_c10d_store
return TCPStore(
RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:6688 (errno: 98 - Address already in use). The server socket has failed to bind to ?UNKNOWN? (errno: 98 - Address already in use).
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
import os
import shutil
import numpy as np
import json
with open('/mnt/all/dataset/360IC/coco_360_small/annotations/train.json', 'r') as f:
    train_data = json.load(f)

# Image ids of the training set; each image file is expected to be "<id>.jpg".
train_ids = [image['id'] for image in train_data['images']]

original_floder = '/mnt/all/dataset/360IC/coco_360_small/train2014'
target_floder = '/mnt/all/dataset/360IC/coco_360_small/train'

# shutil.copyfile does not create the destination directory.
os.makedirs(target_floder, exist_ok=True)

# Copy every training image from the source folder into the target folder.
for image_id in train_ids:
    image_filename = f'{image_id}.jpg'
    source_path = os.path.join(original_floder, image_filename)
    target_path = os.path.join(target_floder, image_filename)
    shutil.copyfile(source_path, target_path)
import json
import os
import random
import numpy as np
train_data = {
    'info': [],
    'license': [],
    'images': [],
    'annotations': []
}
val_data = {
    'info': [],
    'license': [],
    'images': [],
    'annotations': []
}

txt_path = '/mnt/all/lzc/360IC_caption/'
# Only "<number>.txt" caption files are converted; other directory entries
# are ignored instead of crashing int(). Sorting numerically makes the output
# order (and therefore the generated ids) reproducible, since os.listdir
# returns entries in arbitrary order.
txt_list = sorted(
    (name for name in os.listdir(txt_path)
     if name.endswith('.txt') and name[:-len('.txt')].isdigit()),
    key=lambda name: int(name[:-len('.txt')]),
)

# Caption (annotation) ids per split.
train_id = []
val_id = []

# One running counter for all captions. Restarting the counter from each
# image's number makes the ids of neighbouring images overlap, so ids would
# not be unique.
next_caption_id = 0

for txt in txt_list:
    image_key = txt[:-len('.txt')]
    with open(os.path.join(txt_path, txt), 'r') as f:
        captions = f.read().split('\n')

    v2_images = {
        'id': image_key,
        'file_name': image_key + '.jpg'
    }

    # Images numbered below 1000 go to the training split, the rest to val.
    if int(image_key) < 1000:
        split_data, split_ids = train_data, train_id
    else:
        split_data, split_ids = val_data, val_id

    split_data['images'].append(v2_images)
    for caption in captions:
        # Lines of 5 characters or fewer are treated as empty.
        if len(caption) > 5:
            next_caption_id += 1
            v2_annotation = {
                'image_id': image_key,
                'id': next_caption_id,
                # The first two characters of each line are dropped
                # (presumably a numbering prefix - confirm against the files).
                'caption': caption[2:]
            }
            split_ids.append(next_caption_id)
            split_data['annotations'].append(v2_annotation)

os.makedirs('lzc_data', exist_ok=True)
with open('lzc_data/train.json', 'w') as f:
    json.dump(train_data, f)
with open('lzc_data/val.json', 'w') as f:
    json.dump(val_data, f)
np.save("lzc_data/train_ids.npy", np.array(train_id))
np.save("lzc_data/val_ids.npy", np.array(val_id))
Traceback (most recent call last):
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/mnt/all/lzc/grit3cp/train_caption.py", line 122, in main
train_res = train_xe(
File "/mnt/all/lzc/grit3cp/engine/caption_engine.py", line 330, in train_xe
out = model(batch['samples'], batch['captions'])
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/all/lzc/grit3cp/models/caption/transformer.py", line 65, in forward
vis_inputs = self.detector(images)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/all/lzc/grit3cp/models/caption/detector.py", line 74, in forward
outputs['reg_feat'] = self.self_att2(reg_feat,reg_feat,reg_feat)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/all/lzc/grit3cp/models/attention_c.py", line 116, in forward
out = self.attention(queries, keys, values, attention_mask, attention_weights)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/mnt/all/lzc/grit3cp/models/attention_c.py", line 62, in forward
q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3) # (b_s, h, nq, d_k)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4500x512 and 1024x512)
File "/mnt/all/lzc/grit3cp/train_caption.py", line 214, in run_main
mp.spawn(main, nprocs=config.exp.ngpus_per_node, args=(config,))
File "/mnt/all/lzc/grit3cp/train_caption.py", line 221, in <module>
run_main()
class ScaledDotProductAttention(nn.Module):
    '''
    Multi-head scaled dot-product attention with learned q/k/v/output projections.
    '''

    def __init__(self, d_model, d_k, d_v, h, dropout=.1, comment=None):
        '''
        :param d_model: Input and output dimensionality of the model
        :param d_k: Dimensionality of queries and keys (per head)
        :param d_v: Dimensionality of values (per head)
        :param h: Number of heads
        :param dropout: Dropout probability applied to the attention weights
        :param comment: Free-form tag stored on the module (unused by forward)
        '''
        super(ScaledDotProductAttention, self).__init__()
        self.fc_q = nn.Linear(d_model, h * d_k)
        self.fc_k = nn.Linear(d_model, h * d_k)
        self.fc_v = nn.Linear(d_model, h * d_v)
        self.fc_o = nn.Linear(h * d_v, d_model)
        self.dropout = nn.Dropout(dropout)

        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v
        self.h = h

        self.init_weights()

        self.comment = comment

    def init_weights(self):
        '''Xavier-initialise the four projection weights and zero their biases.'''
        for layer in (self.fc_q, self.fc_k, self.fc_v, self.fc_o):
            nn.init.xavier_uniform_(layer.weight)
        for layer in (self.fc_q, self.fc_k, self.fc_v, self.fc_o):
            nn.init.constant_(layer.bias, 0)

    def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
        '''
        Computes multi-head attention of the queries over the keys/values.

        :param queries: Queries (b_s, nq, d_model)
        :param keys: Keys (b_s, nk, d_model)
        :param values: Values (b_s, nk, d_model)
        :param attention_mask: Accepted for interface compatibility but NOT applied:
            the masking step is disabled in this implementation.
        :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
        :return: Attended features (b_s, nq, d_model)
        :raises RuntimeError: if the attention map and the values cannot be combined
            (e.g. keys and values have different lengths).
        '''
        b_s, nq = queries.shape[:2]
        nk = keys.shape[1]

        q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = self.fc_k(keys).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1)  # (b_s, h, d_k, nk)
        v = self.fc_v(values).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3)  # (b_s, h, nk, d_v)

        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
        if attention_weights is not None:
            att = att * attention_weights
        att = torch.softmax(att, -1)
        att = self.dropout(att)

        try:
            out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        except RuntimeError as err:
            # Surface the real shape problem with context. Swallowing it would
            # leave `out` unbound and fail later with an unrelated error.
            raise RuntimeError(
                f"attention/value shape mismatch: att {tuple(att.shape)}, v {tuple(v.shape)}"
            ) from err
        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out
Traceback (most recent call last):
File "/home/computer/xtc/tsmbase/models/containers.py", line 71, in statefulness
yield
File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 162, in iter
word_logprob = self.model.step(t, self.selected_words, visual, None, mode='feedback', **kwargs)
File "/home/computer/xtc/tsmbase/models/transformer/transformer.py", line 79, in step
return self.decoder(it, self.enc_output, self.mask_enc) # 100%
File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/computer/xtc/tsmbase/models/transformer/decoders.py", line 86, in forward
seq = torch.arange(1, seq_len + 1).view(1, -1).expand(b_s, -1).to(input.device) # (b_s, seq_len)
RuntimeError: CUDA error: device-side assert triggered
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/computer/xtc/tsmbase/train_lzc.py", line 520, in <module>
scores = evaluate_metrics(model, dict_dataloader_val, text_field)
File "/home/computer/xtc/tsmbase/train_lzc.py", line 73, in evaluate_metrics
out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], 5, out_size=1)
File "/home/computer/xtc/tsmbase/models/captioning_model.py", line 81, in beam_search
return bs.apply(visual, out_size, return_probs, **kwargs)
File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "/home/computer/xtc/tsmbase/models/containers.py", line 73, in statefulness
self.disable_statefulness()
File "/home/computer/xtc/tsmbase/models/containers.py", line 63, in disable_statefulness
m.disable_statefulness()
File "/home/computer/xtc/tsmbase/models/containers.py", line 64, in disable_statefulness
self._reset_states()
File "/home/computer/xtc/tsmbase/models/containers.py", line 50, in _reset_states
self._buffers[name] = self._state_defaults[name].clone().detach().to(self._buffers[name].device)
RuntimeError: CUDA error: device-side assert triggered
/opt/conda/conda-bld/pytorch_1614378083779/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:115: operator(): block: [0,0,0], thread: [9,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1614378083779/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:115: operator(): block: [0,0,0], thread: [14,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1614378083779/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:115: operator(): block: [0,0,0], thread: [18,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
Epoch 0 - evaluation: 0%| | 0/50 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/computer/xtc/tsmbase/models/containers.py", line 71, in statefulness
yield
File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 162, in iter
word_logprob = self.model.step(t, self.selected_words, visual, None, mode='feedback', **kwargs)
File "/home/computer/xtc/tsmbase/models/transformer/transformer.py", line 79, in step
return self.decoder(it, self.enc_output, self.mask_enc) # 100%
File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/computer/xtc/tsmbase/models/transformer/decoders.py", line 87, in forward
seq = torch.arange(1, seq_len + 1).view(1, -1).expand(b_s, -1).to(input.device) # (b_s, seq_len)
RuntimeError: CUDA error: device-side assert triggered
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/computer/xtc/tsmbase/train_lzc.py", line 520, in <module>
scores = evaluate_metrics(model, dict_dataloader_val, text_field)
File "/home/computer/xtc/tsmbase/train_lzc.py", line 73, in evaluate_metrics
out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], 5, out_size=1)
File "/home/computer/xtc/tsmbase/models/captioning_model.py", line 81, in beam_search
return bs.apply(visual, out_size, return_probs, **kwargs)
File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "/home/computer/xtc/tsmbase/models/containers.py", line 73, in statefulness
self.disable_statefulness()
File "/home/computer/xtc/tsmbase/models/containers.py", line 63, in disable_statefulness
m.disable_statefulness()
File "/home/computer/xtc/tsmbase/models/containers.py", line 64, in disable_statefulness
self._reset_states()
File "/home/computer/xtc/tsmbase/models/containers.py", line 50, in _reset_states
self._buffers[name] = self._state_defaults[name].clone().detach().to(self._buffers[name].device)
RuntimeError: CUDA error: device-side assert triggered
with tqdm(desc=f'Epoch {epoch} - train', unit='it', total=len(dataloaders['train'])) as pbar:
for it, batch in enumerate(dataloaders['train']):
out = model(batch['samples'], batch['captions'])
optimizers['model'].zero_grad()
optimizers['backbone'].zero_grad()
captions_gt = batch['captions'][:, 1:].contiguous()
out = out[:, :-1].contiguous()
loss = loss_fn(out.view(-1, len(text_field.vocab)), captions_gt.view(-1))
loss.backward()
optimizers['model'].step()
optimizers['backbone'].step()
loss = gather_result(loss)
running_loss += loss.item()
pbar.set_postfix(loss=running_loss / (it + 1))
pbar.update()
if scheduler is not None:
lr = scheduler.step()
assert optimizers['model'].param_groups[0]['lr'] == lr, "LR scheduler doesn't work properly."
if rank == 0:
writer.add_scalar(
'backbone_lr',
optimizers['backbone'].param_groups[0]['lr'],
epoch * len(dataloaders['train']) + it,
)
writer.add_scalar(
'model_lr',
optimizers['model'].param_groups[0]['lr'],
epoch * len(dataloaders['train']) + it,
)
lr = optimizers['model'].param_groups[0]['lr']
# break
val_loss = evaluate_loss(model, dataloaders['valid'], loss_fn, text_field, epoch, writer)
if rank == 0:
save_checkpoint(
model=model,
optimizers=optimizers,
epoch=epoch,
scores=[],
best_ciders=(0, 0),
config=config,
filename='checkpoint_last.pth',
scheduler=scheduler,
)
torch.distributed.barrier()
return {
'loss': running_loss / len(dataloaders['train']),
'reward': 0,
'reward_baseline': 0,
'val_loss': val_loss,
}
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from models.common.swin_model import *
from utils.misc import nested_tensor_from_tensor_list, NestedTensor
from models.detection.det_module import build_det_module_with_config
from models.attention import MultiHeadAttention
class Detector(nn.Module):
    """Visual feature extractor producing grid features and, optionally, region features.

    The backbone is run on the input after it has been reshaped to
    ``(-1, 3, 384, 384)``; the last backbone feature map becomes ``gri_feat``.
    When ``use_reg_feat`` is set, the backbone features are projected with 1x1
    convolutions, passed through ``det_module`` and refined by a self-attention
    layer to give ``reg_feat``.

    NOTE: earlier experiments with a relative-position bias table and a 2-D
    sinusoidal positional encoding on the grid features were disabled and are
    not part of this implementation.
    """

    def __init__(
        self,
        backbone,
        det_module=None,
        use_gri_feat=True,
        use_reg_feat=True,
        hidden_dim=256,
    ):
        """
        :param backbone: feature extractor; must expose ``num_channels`` and
            return a sequence of feature maps when called.
        :param det_module: detection module called as ``det_module(srcs, masks)``;
            only used when ``use_reg_feat`` is true.
        :param use_gri_feat: stored flag (grid features are always produced).
        :param use_reg_feat: whether to build and run the region-feature branch.
        :param hidden_dim: output channels of the 1x1 input projections.
        """
        super().__init__()
        self.backbone = backbone
        self.use_gri_feat = use_gri_feat
        self.use_reg_feat = use_reg_feat
        # self_att is not used in forward but is kept so the module's
        # state_dict layout stays compatible with existing checkpoints.
        self.self_att = MultiHeadAttention(1024, 64, 64, 8, 0.1, can_be_stateful=False)
        self.self_att2 = MultiHeadAttention(512, 64, 64, 8, 0.1, can_be_stateful=False)

        if self.use_reg_feat:
            self.det_module = det_module
            self.input_proj = nn.ModuleList([
                nn.Sequential(
                    nn.Conv2d(backbone.num_channels[i], hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                ) for i in range(len(backbone.num_channels))
            ])

    def forward(self, images: NestedTensor):
        """Extract grid (and optionally region) features.

        :param images: ``NestedTensor`` with ``tensors`` (batched images) and
            ``mask`` (padding mask), or a list/tensor of images, which is
            converted with ``nested_tensor_from_tensor_list``.
            Per the original author's note the batched tensor is
            (B, 9, 384, 384), i.e. three stacked RGB views - TODO confirm.
        :return: dict with ``gri_feat`` and ``gri_mask`` and, when
            ``use_reg_feat`` is set, ``reg_feat`` and ``reg_mask``.
        """
        # The current CUDA device is deliberately left untouched here:
        # forcing device 0 inside forward breaks CPU execution and redirects
        # every non-zero rank to GPU 0 under multi-GPU training.
        if isinstance(images, (list, torch.Tensor)):
            # Use the converted NestedTensor below; otherwise raw inputs fail
            # on the `.tensors` access.
            images = nested_tensor_from_tensor_list([img for img in images])

        x = images.tensors
        B = x.shape[0]
        x = x.reshape([-1, 3, 384, 384])  # (B,9,384,384)->(3B,3,384,384)

        mask = images.mask  # padding mask [B, H, W]
        # Replicate the padding mask so it lines up with the 3x larger batch.
        mask = repeat(mask, 'b h w -> (repeat b) h w', repeat=3)

        features = self.backbone(x)

        # One padding mask per feature level, resized to that level's resolution.
        masks = [
            F.interpolate(mask[None].float(), size=f.shape[-2:]).to(torch.bool)[0] for f in features
        ]  # masks [[B, Hi, Wi]]

        outputs = {}
        # (3B,1024,6,6)->(3B,36,1024)->(B,108,1024)
        outputs['gri_feat'] = rearrange(features[-1], 'b c h w -> b (h w) c').reshape([B, -1, 1024])
        outputs['gri_mask'] = repeat(masks[-1], 'b h w -> b 1 1 (h w)').reshape(B, 1, 1, -1)

        if self.use_reg_feat:
            srcs = [self.input_proj[l](src) for l, src in enumerate(features)]
            hs, _, _ = self.det_module(srcs, masks)
            # Regroup the per-view outputs back into the original batch.
            reg_feat = hs[-1].reshape(B, -1, 512)
            outputs['reg_feat'] = self.self_att2(reg_feat, reg_feat, reg_feat)
            # All-False mask: no region entry is masked out.
            outputs['reg_mask'] = hs[-1].data.new_full((hs[-1].shape[0], 1, 1, hs[-1].shape[1]), 0).bool().reshape(B, 1, 1, -1)
        return outputs
def build_detector(config):
    """Build a ``Detector`` from ``config`` and load its checkpoint if present.

    Reads ``config.model.detector`` (optional ``pos_dim``, ``checkpoint``),
    ``config.model.frozen_stages``, ``config.model.d_model``,
    ``config.model.use_gri_feat`` and ``config.model.use_reg_feat``.

    If the checkpoint file exists, its ``'model'`` entry is loaded
    non-strictly and the number of missing/unexpected keys is printed;
    otherwise a notice is printed and the detector keeps its initial weights.

    :return: the constructed ``Detector``.
    """
    # Local import: this module's import header does not bring in `os`.
    import os

    pos_dim = getattr(config.model.detector, 'pos_dim', None)
    backbone, _ = swin_base_win7_384(
        frozen_stages=config.model.frozen_stages,
        pos_dim=pos_dim,
    )

    det_cfg = config.model.detector
    # The detection module is only needed for the region-feature branch.
    det_module = build_det_module_with_config(det_cfg) if config.model.use_reg_feat else None

    detector = Detector(
        backbone,
        det_module=det_module,
        hidden_dim=config.model.d_model,
        use_gri_feat=config.model.use_gri_feat,
        use_reg_feat=config.model.use_reg_feat,
    )

    checkpoint_path = config.model.detector.checkpoint
    if os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        missing, unexpected = detector.load_state_dict(checkpoint['model'], strict=False)
        print(f"Loading weights for detector: missing: {len(missing)}, unexpected: {len(unexpected)}.")
    else:
        # Make the fallback visible instead of silently using initial weights.
        print(f"Detector checkpoint not found at {checkpoint_path!r}; keeping initial weights.")
    return detector
# ------------------------------------------------------------------------
# GRIT: Faster and Better Image captioning Transformer
# Licensed under the Creative Commons Attribution.
# ------------------------------------------------------------------------
# Modified from Meshed Memory Transformer
# https://github.com/aimagelab/meshed-memory-transformer
# ------------------------------------------------------------------------
import numpy as np
import torch
from torch import nn
from models.caption.containers import Module
from einops import rearrange, repeat
def init_params(module):
    """Initialise the parameters of ``module`` in place, chosen by parameter name.

    The first matching rule wins:
    names containing 'weight' get Xavier-uniform values, names containing
    'bias' are set to 0, and names containing 'm_' (memory slots) are drawn
    from a normal distribution with mean 0 and std 0.01. Other parameters are
    left untouched.
    """
    for param_name, tensor in module.named_parameters():
        if 'weight' in param_name:
            nn.init.xavier_uniform_(tensor)
            continue
        if 'bias' in param_name:
            nn.init.constant_(tensor, 0)
            continue
        if 'm_' in param_name:  # for memory
            nn.init.normal_(tensor, mean=0, std=0.01)
class Attention(nn.Module):
'''
Scaled dot-product attention
'''
def __init__(self, d_model, n_heads, dropout=0.2, n_memories=0):
super().__init__()
self.fc_q = nn.Linear(d_model, d_model)
self.fc_k = nn.Linear(d_model, d_model)
self.fc_v = nn.Linear(d_model, d_model)
self.fc_o = nn.Linear(d_model, d_model)
# * adapted from Meshed-Memory Transformers; n_memories: # mem slots
if n_memories > 0:
self.m_k = nn.Parameter(torch.FloatTensor(1, n_memories, d_model))
self.m_v = nn.Parameter(torch.FloatTensor(1, n_memories, d_model))
self.dropout = nn.Dropout(p=dropout)
self.window_size = [6,18]
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), n_heads)) # 2*Wh-1 * 2*Ww-1, nH
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
first_column = coords[1][:, 0]
other_columns = coords[1][:, 1:]
coords[1] = torch.cat((other_columns,first_column.view(-1,1)),dim=1)
# m = 0
# if 1/10 < m < 1/4:
# coords[1][:, 5] = coords[1][:, 6]
# coords[1][:, 11],coords[1][:, 12] = coords[1][:, 12],coords[1][:, 11].clone()
# elif 1/4 < m < 5/12:
# coords[1][:, 4] = coords[1][:, 6]
# coords[1][:, 5] = coords[1][:, 7]
# coords[1][:, 10],coords[1][:, 12] = coords[1][:, 12],coords[1][:, 10].clone()
# coords[1][:, 11],coords[1][:, 13] = coords[1][:, 13],coords[1][:, 11].clone()
coords = torch.stack((coords[0],coords[1]), dim=0)
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer("relative_position_index", relative_position_index)
self.d_model = d_model
self.n_heads = n_heads
self.n_memories = n_memories
self.d_k = d_model // n_heads
self.apply(init_params)
def forward(self, q, k, v, attention_mask=None):
    """Multi-head attention with optional memory slots and a relative position bias.

    Args:
        q: queries, (b, nq, d_model).
        k: keys, (b, nk, d_model).
        v: values, (b, nk, d_model).
        attention_mask: optional mask; it is converted with ``.bool()`` and
            truthy positions are filled with ``-inf`` before the softmax.
            It must be broadcastable to the (b, n_heads, nq, nk) score
            tensor. Learned memory slots are never masked.

    Returns:
        Tensor of shape (b, nq, d_model).
    """
    nq, nk = q.shape[1], k.shape[1]
    has_memory = self.n_memories > 0
    scale = np.sqrt(self.d_k)

    q = rearrange(self.fc_q(q), 'b nq (head d) -> b head nq d', head=self.n_heads)
    k = self.fc_k(k)
    v = self.fc_v(v)
    if has_memory:
        # Append the learned memory slots after the real keys/values.
        m_k = repeat(self.m_k, '() m d_model -> b m d_model', b=q.shape[0]) * scale
        m_v = repeat(self.m_v, '() m d_model -> b m d_model', b=q.shape[0]) * np.sqrt(self.n_memories)
        k = torch.cat([k, m_k], 1)
        v = torch.cat([v, m_v], 1)
    k = rearrange(k, 'b nk (head d) -> b head d nk', head=self.n_heads)
    v = rearrange(v, 'b nv (head d) -> b head nv d', head=self.n_heads)

    scores = torch.matmul(q, k) / scale  # [b h nq nk(+m)]
    if attention_mask is not None:
        mask = attention_mask.bool()
        if has_memory:
            # Only the first nk columns are real keys; leave memory columns untouched.
            scores[:, :, :, :nk] = scores[:, :, :, :nk].masked_fill(mask, -np.inf)
        else:
            scores = scores.masked_fill(mask, -np.inf)

    p_attn = torch.softmax(scores, -1)
    p_attn = self.dropout(p_attn)

    # The bias table is indexed per token pair of one window, so it only
    # applies when both the query and the key sequence are exactly one
    # window long (previously hard-coded as 108 and checked on nq only,
    # which broke broadcasting once memory slots or a different nk were used).
    n_window_tokens = self.window_size[0] * self.window_size[1]
    if nq == n_window_tokens and nk == n_window_tokens:
        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = bias.view(n_window_tokens, n_window_tokens, -1)  # Wh*Ww, Wh*Ww, nH
        bias = bias.permute(2, 0, 1).contiguous().unsqueeze(0)  # 1, nH, Wh*Ww, Wh*Ww
        # NOTE(review): the bias is added to the post-softmax, post-dropout
        # weights rather than to the logits; kept unchanged so existing
        # checkpoints behave the same -- confirm this is intended.
        if has_memory:
            p_attn = torch.cat([p_attn[:, :, :, :nk] + bias, p_attn[:, :, :, nk:]], dim=-1)
        else:
            p_attn = p_attn + bias

    # [b h nq nk] * [b h nk dv] = [b h nq dv] -> [b nq h dv] -> [b nq h*dv]
    out = rearrange(torch.matmul(p_attn, v), 'b h nq dv -> b nq (h dv)')
    out = self.fc_o(out)  # (b, nq, d_model)
    return out
class MemoryAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional learned memory slots.

    Adapted from Meshed-Memory Transformers: when ``n_memories > 0`` a set of
    learned key/value vectors is appended to the projected keys and values.
    """

    def __init__(self, d_model, n_heads, n_memories, dropout=0.0):
        """Build the projections and, if requested, the memory slots.

        Args:
            d_model: feature size of queries, keys, values and the output.
            n_heads: number of attention heads; the per-head size is
                ``d_model // n_heads``.
            n_memories: number of learned memory slots (0 disables them).
            dropout: dropout probability applied to the attention weights.
        """
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_memories = n_memories
        self.d_k = d_model // n_heads

        # Same creation order as before: q, k, v, output projection.
        for proj_name in ('fc_q', 'fc_k', 'fc_v', 'fc_o'):
            setattr(self, proj_name, nn.Linear(d_model, d_model))

        if n_memories > 0:
            # NOTE(review): torch.FloatTensor(...) allocates uninitialised
            # storage; presumably init_params fills these -- confirm.
            memory_shape = (1, n_memories, d_model)
            self.m_k = nn.Parameter(torch.FloatTensor(*memory_shape))
            self.m_v = nn.Parameter(torch.FloatTensor(*memory_shape))

        self.dropout = nn.Dropout(p=dropout)
        self.apply(init_params)

    def forward(self, q, k, v, attention_mask=None, attention_weights=None):
        """Attend from ``q`` over ``k``/``v`` (plus memory slots, if any).

        Args:
            q: queries, (b, nq, d_model).
            k: keys, (b, nk, d_model).
            v: values, (b, nk, d_model).
            attention_mask: optional; truthy entries (after ``.bool()``) are
                masked out with ``-inf``. Memory slots are never masked.
            attention_weights: optional multiplicative weights applied to the
                scores of the real (non-memory) keys.

        Returns:
            Tensor of shape (b, nq, d_model).
        """
        batch = q.shape[0]
        n_keys = k.shape[1]
        use_memory = self.n_memories > 0
        scale = np.sqrt(self.d_k)

        queries = rearrange(self.fc_q(q), 'b nq (head d) -> b head nq d', head=self.n_heads)
        keys = self.fc_k(k)
        values = self.fc_v(v)
        if use_memory:
            mem_k = repeat(self.m_k, '() m d_model -> b m d_model', b=batch) * scale
            mem_v = repeat(self.m_v, '() m d_model -> b m d_model', b=batch) * np.sqrt(self.n_memories)
            keys = torch.cat([keys, mem_k], 1)
            values = torch.cat([values, mem_v], 1)
        keys = rearrange(keys, 'b nk (head d) -> b head d nk', head=self.n_heads)
        values = rearrange(values, 'b nv (head d) -> b head nv d', head=self.n_heads)

        scores = torch.matmul(queries, keys) / scale  # [b h nq nk(+m)]

        if attention_weights is not None:
            if use_memory:
                # Weight only the real keys; memory columns pass through.
                real, memory = scores[:, :, :, :n_keys], scores[:, :, :, n_keys:]
                scores = torch.cat([real * attention_weights, memory], dim=-1)
            else:
                scores = scores * attention_weights

        if attention_mask is not None:
            blocked = attention_mask.bool()
            if use_memory:
                scores[:, :, :, :n_keys] = scores[:, :, :, :n_keys].masked_fill(blocked, -np.inf)
            else:
                scores = scores.masked_fill(blocked, -np.inf)

        weights = self.dropout(torch.softmax(scores, dim=-1))
        # [b h nq nk] * [b h nk dv] = [b h nq dv] -> [b nq h dv] -> [b nq h*dv]
        context = rearrange(torch.matmul(weights, values), 'b h nq dv -> b nq (h dv)')
        return self.fc_o(context)  # (b, nq, d_model)
class MultiHeadAttention(Module):
    """Attention sub-layer: attention, dropout, residual connection, layer norm.

    With ``can_be_stateful`` set and the module in stateful mode, keys and
    values from earlier calls are kept in ``running_keys``/``running_values``
    so each call only has to supply the newest token.
    """

    def __init__(self, d_model, n_heads, dropout=.1, n_memories=0, can_be_stateful=False):
        super().__init__()
        self.attention = Attention(d_model=d_model, n_heads=n_heads, dropout=dropout, n_memories=n_memories)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        self.can_be_stateful = can_be_stateful
        if self.can_be_stateful:
            # Caches for previously computed K and V (fast step-wise inference).
            for state_name in ('running_keys', 'running_values'):
                self.register_state(state_name, torch.zeros((1, d_model)))

    def forward(self, queries, keys, values, attention_mask=None):
        """Run attention and return ``layer_norm(queries + dropout(attention))``.

        In stateful mode ``keys``/``values`` hold the current token ([B, 1, D])
        and are appended to the cached ones before attending.
        """
        if self.can_be_stateful and self._is_stateful:
            cached_keys = torch.cat([self.running_keys, keys], 1)
            cached_values = torch.cat([self.running_values, values], 1)
            if self.timestep == 0:
                # Drop the leading entry of the initial state on the first
                # step (presumably the zero placeholder registered above).
                cached_keys = cached_keys[:, 1:]
                cached_values = cached_values[:, 1:]
            self.running_keys = keys = cached_keys  # [B t D]
            self.running_values = values = cached_values  # [B t D]
            self.timestep += 1

        attended = self.dropout(self.attention(queries, keys, values, attention_mask))
        return self.layer_norm(queries + attended)