Comparing image_ids between two JSON files

import json

def compare_image_ids(json_data1, json_data2):
    image_ids1 = extract_image_ids(json_data1)
    image_ids2 = extract_image_ids(json_data2)

    set_image_ids1 = set(image_ids1)
    set_image_ids2 = set(image_ids2)

    different_image_ids = set_image_ids1.symmetric_difference(set_image_ids2)

    return different_image_ids

def extract_image_ids(json_data):
    image_ids = []

    # Extract image_ids from images
    for image in json_data.get("images", []):
        image_id = image.get("id")
        if image_id is not None:
            image_ids.append(int(image_id))

    # Extract image_ids from annotations
    for annotation in json_data.get("annotations", []):
        image_id = annotation.get("image_id")
        if image_id is not None:
            image_ids.append(int(image_id))

    return image_ids

if __name__ == "__main__":
    # Replace these paths with the actual paths to your JSON files
    json_file_path1 = "path/to/your/first/file.json"
    json_file_path2 = "path/to/your/second/file.json"

    with open(json_file_path1, "r") as file:
        json_data1 = json.load(file)

    with open(json_file_path2, "r") as file:
        json_data2 = json.load(file)

    different_image_ids = compare_image_ids(json_data1, json_data2)

    if different_image_ids:
        print("Different image_ids:")
        for image_id in different_image_ids:
            print(image_id)
    else:
        print("Image_ids are the same in both JSON files.")
model = build_model(opt)
pretrained_dict = torch.load(pretrained_checkpoint_path, map_location='cpu')  # load onto CPU
model_dict = model.state_dict()  # current model parameters
# Keep only the checkpoint weights we want (everything except the 'det' branch).
pretrained_dict = {key: value for key, value in pretrained_dict.items() if (key in model_dict and 'det' not in key)}
model_dict.update(pretrained_dict)
missing, unexpected = model.load_state_dict(model_dict, strict=True)
print(f"Loading weights for detector: missing: {len(missing)}, unexpected: {len(unexpected)}.")
import os
import shutil
import numpy as np
import json

# Paths
existing_images_dir = 'path_to_existing_images_directory'
new_images_dir = 'path_to_new_images_directory'
output_images_dir = 'path_to_output_images_directory'
existing_val_ids = np.load('val_ids.npy')

# Load existing data
with open('val.json', 'r') as json_file:
    existing_val_data = json.load(json_file)

# Add new data
new_val_ids = []
for i in range(1151, 1251):
    # Load caption from file
    caption_file_path = f'{i}.txt'
    with open(caption_file_path, 'r') as caption_file:
        caption = caption_file.read().strip()

    # Copy new image to output directory
    new_image_path = os.path.join(new_images_dir, f'{i}.jpg')
    output_image_path = os.path.join(output_images_dir, f'{i}.jpg')
    shutil.copy(new_image_path, output_image_path)

    # Add new image to data
    new_image_data = {
        "file_name": f"{i}.jpg",
        "id": len(existing_val_data["images"]) + len(new_val_ids),
        "caption": caption
    }
    existing_val_data["images"].append(new_image_data)
    new_val_ids.append(new_image_data["id"])

# Combine old and new IDs
updated_val_ids = np.concatenate((existing_val_ids, new_val_ids))

# Save updated IDs and data
np.save('val_ids.npy', updated_val_ids)
with open('val.json', 'w') as json_file:
    json.dump(existing_val_data, json_file, indent=4)
import json
import numpy as np

# Load the previous 150-image test set (val.json)
with open('lzc_data/val.json', 'r') as f:
    val_data = json.load(f)

# Load the new 100-image test set (test_new.json) and its IDs (test_new_ids.npy)
with open('lzc_data/test_new.json', 'r') as f:
    test_new_data = json.load(f)

test_new_ids = np.load("lzc_data/test_new_ids.npy")

# Append the new 100 test images and their annotations to the previous test set
val_data['images'].extend(test_new_data['images'])
val_data['annotations'].extend(test_new_data['annotations'])

# Append the new test-set IDs to the previous test-set IDs
# (the previous IDs live in val_ids.npy, not in the images list)
val_ids = np.concatenate((np.load('lzc_data/val_ids.npy'), test_new_ids))

# Save the merged test set as merged_val.json
with open('lzc_data/merged_val.json', 'w') as f:
    json.dump(val_data, f)

# Save the merged test-set IDs as an npy file
np.save("lzc_data/merged_val_ids.npy", val_ids)
import json
import numpy as np
import random

# Load the training-set json file
with open('lzc_data/train.json', 'r') as f:
    train_data = json.load(f)

# Collect the IDs of the training images
train_ids = [image['id'] for image in train_data['images']]

# Randomly sample 100 of the 1000 training images as the new test set
random_test_indices = random.sample(range(len(train_ids)), 100)
test_data_subset = [train_data['images'][i] for i in random_test_indices]
test_annotations_subset = [annotation for annotation in train_data['annotations'] if annotation['image_id'] in [item['id'] for item in test_data_subset]]

# Save the new test set
with open('lzc_data/test.json', 'w') as f:
    json.dump({
        'info': train_data['info'],
        'license': train_data['license'],
        'images': test_data_subset,
        'annotations': test_annotations_subset
    }, f)

# Save the IDs of the new 100 test images as an npy file
test_ids = [train_ids[i] for i in random_test_indices]
np.save("lzc_data/test_ids.npy", test_ids)

# The remaining 900 training images
remaining_indices = [i for i in range(len(train_ids)) if i not in random_test_indices]
train_data_remaining = [train_data['images'][i] for i in remaining_indices]
train_annotations_remaining = [annotation for annotation in train_data['annotations'] if annotation['image_id'] in [item['id'] for item in train_data_remaining]]

# Save the remaining 900 training images back to train.json
with open('lzc_data/train.json', 'w') as f:
    json.dump({
        'info': train_data['info'],
        'license': train_data['license'],
        'images': train_data_remaining,
        'annotations': train_annotations_remaining
    }, f)

# Save the IDs of the remaining 900 training images as an npy file
remaining_train_ids = [train_ids[i] for i in remaining_indices]
np.save("lzc_data/train_ids.npy", remaining_train_ids)
[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:6688 (errno: 98 - Address already in use).
[W socket.cpp:426] [c10d] The server socket has failed to bind to ?UNKNOWN? (errno: 98 - Address already in use).
[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
Error executing job with overrides: []
Traceback (most recent call last):
  File "/mnt/all/lzc/grit/train_caption.py", line 212, in run_main
    mp.spawn(main, nprocs=config.exp.ngpus_per_node, args=(config,))
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
    while not context.join():
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/mnt/all/lzc/grit/train_caption.py", line 29, in main
    dist.init_process_group('nccl', 'env://', rank=rank, world_size=config.exp.world_size)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 754, in init_process_group
    store, rank, world_size = next(rendezvous_iterator)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 246, in _env_rendezvous_handler
    store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 177, in _create_c10d_store
    return TCPStore(
RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:6688 (errno: 98 - Address already in use). The server socket has failed to bind to ?UNKNOWN? (errno: 98 - Address already in use).


Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
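errno 98 means the c10d rendezvous port (6688 here) is still held, usually by a previous run that has not fully exited. Killing the stale process or relaunching on an unused port resolves it. A minimal sketch for picking a free port from Python before mp.spawn, assuming the rendezvous reads the port from the MASTER_PORT environment variable (the usual setup for the 'env://' init method):

import os
import socket

def find_free_port() -> int:
    # Bind to port 0 so the OS assigns an unused port, then release it immediately.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]

os.environ["MASTER_PORT"] = str(find_free_port())  # set before dist.init_process_group('nccl', 'env://', ...)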
import os
import shutil
import numpy as np
import json


with open('/mnt/all/dataset/360IC/coco_360_small/annotations/train.json', 'r') as f:
    train_data = json.load(f)

# Collect the IDs of the training images
train_ids = [image['id'] for image in train_data['images']]


original_folder = '/mnt/all/dataset/360IC/coco_360_small/train2014'
target_folder = '/mnt/all/dataset/360IC/coco_360_small/train'
os.makedirs(target_folder, exist_ok=True)

for image_id in train_ids:
    image_filename = f'{image_id}.jpg'

    source_path = os.path.join(original_folder, image_filename)
    target_path = os.path.join(target_folder, image_filename)

    shutil.copyfile(source_path, target_path)
import json
import os
import numpy as np

train_data = {
    'info': [],
    'license': [],
    'images': [],
    'annotations': []
}

val_data = {
    'info': [],
    'license': [],
    'images': [],
    'annotations': []
}

txt_path = '/mnt/all/lzc/360IC_caption/'
txt_list = os.listdir(txt_path)

# annotation (caption) ids
train_id = []
val_id = []

for txt in txt_list:
    with open(os.path.join(txt_path, txt), 'r') as f:
        captions = f.read().split('\n')

    v2_images = {
        'id': txt.replace('.txt', ''),
        'file_name': txt.replace('.txt', '.jpg')
    }
    # annotation ids start from the image number taken from the file name,
    # so ids produced from different files may overlap
    count = int(txt.replace('.txt', ''))
    if count < 1000:
        train_data['images'].append(v2_images)
        for caption in captions:
            if len(caption) > 5:
                count = count + 1
                v2_annotation = {
                    'image_id': txt.replace('.txt', ''),
                    'id': count,
                    'caption': caption[2:]  # drop the 2-character prefix (assumed numbering like "1.")
                }
                train_id.append(count)
                train_data['annotations'].append(v2_annotation)
    else:
        val_data['images'].append(v2_images)
        for caption in captions:
            if len(caption) > 5:
                count = count + 1
                v2_annotation = {
                    'image_id': txt.replace('.txt', ''),
                    'id': count,
                    'caption': caption[2:]
                }
                val_id.append(count)
                val_data['annotations'].append(v2_annotation)


with open('lzc_data/train.json', 'w') as f:
    json.dump(train_data, f)

with open('lzc_data/val.json', 'w') as f:
    json.dump(val_data, f)

np.save("lzc_data/train_ids.npy", np.array(train_id))
np.save("lzc_data/val_ids.npy", np.array(val_id))


Traceback (most recent call last):
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/mnt/all/lzc/grit3cp/train_caption.py", line 122, in main
    train_res = train_xe(
  File "/mnt/all/lzc/grit3cp/engine/caption_engine.py", line 330, in train_xe
    out = model(batch['samples'], batch['captions'])
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
    return module_to_run(*inputs[0], **kwargs[0])
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/all/lzc/grit3cp/models/caption/transformer.py", line 65, in forward
    vis_inputs = self.detector(images)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/all/lzc/grit3cp/models/caption/detector.py", line 74, in forward
    outputs['reg_feat'] = self.self_att2(reg_feat,reg_feat,reg_feat)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/all/lzc/grit3cp/models/attention_c.py", line 116, in forward
    out = self.attention(queries, keys, values, attention_mask, attention_weights)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/all/lzc/grit3cp/models/attention_c.py", line 62, in forward
    q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/computer/.conda/envs/grit2/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4500x512 and 1024x512)
  File "/mnt/all/lzc/grit3cp/train_caption.py", line 214, in run_main
    mp.spawn(main, nprocs=config.exp.ngpus_per_node, args=(config,))
  File "/mnt/all/lzc/grit3cp/train_caption.py", line 221, in <module>
    run_main()
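Reading the shapes in the error: F.linear got an input with 512 features (mat1 is 4500x512) while the layer's weight expects 1024 input features (mat2 is the transposed weight, 1024x512). In other words, the self-attention applied to reg_feat was constructed with d_model=1024 although reg_feat is 512-dimensional. A toy reproduction with the ScaledDotProductAttention shown below (the shapes are illustrative, not the real batch):

import torch

reg_feat = torch.randn(2, 150, 512)                      # (B, N, 512), like hs[-1].reshape(B, -1, 512)

att_wrong = ScaledDotProductAttention(d_model=1024, d_k=64, d_v=64, h=8)
# att_wrong(reg_feat, reg_feat, reg_feat)                # -> "mat1 and mat2 shapes cannot be multiplied"

att_ok = ScaledDotProductAttention(d_model=512, d_k=64, d_v=64, h=8)
out = att_ok(reg_feat, reg_feat, reg_feat)
print(out.shape)                                         # torch.Size([2, 150, 512])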
import numpy as np
import torch
from torch import nn


class ScaledDotProductAttention(nn.Module):
    '''
    Scaled dot-product attention
    '''

    def __init__(self, d_model, d_k, d_v, h, dropout=.1, comment=None):
        '''
        :param d_model: Output dimensionality of the model
        :param d_k: Dimensionality of queries and keys
        :param d_v: Dimensionality of values
        :param h: Number of heads
        '''
        super(ScaledDotProductAttention, self).__init__()
        self.fc_q = nn.Linear(d_model, h * d_k)
        self.fc_k = nn.Linear(d_model, h * d_k)
        self.fc_v = nn.Linear(d_model, h * d_v)
        self.fc_o = nn.Linear(h * d_v, d_model)
        self.dropout = nn.Dropout(dropout)

        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v
        self.h = h

        self.init_weights()

        self.comment = comment

    def init_weights(self):
        nn.init.xavier_uniform_(self.fc_q.weight)
        nn.init.xavier_uniform_(self.fc_k.weight)
        nn.init.xavier_uniform_(self.fc_v.weight)
        nn.init.xavier_uniform_(self.fc_o.weight)
        nn.init.constant_(self.fc_q.bias, 0)
        nn.init.constant_(self.fc_k.bias, 0)
        nn.init.constant_(self.fc_v.bias, 0)
        nn.init.constant_(self.fc_o.bias, 0)

    def forward(self, queries, keys, values, attention_mask=None, attention_weights=None):
        '''
        Computes
        :param queries: Queries (b_s, nq, d_model)
        :param keys: Keys (b_s, nk, d_model)
        :param values: Values (b_s, nk, d_model)
        :param attention_mask: Mask over attention values (b_s, h, nq, nk). True indicates masking.
        :param attention_weights: Multiplicative weights for attention values (b_s, h, nq, nk).
        :return:
        '''
        b_s, nq = queries.shape[:2]
        nk = keys.shape[1]

        q = self.fc_q(queries).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = self.fc_k(keys).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1)  # (b_s, h, d_k, nk)
        v = self.fc_v(values).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3)  # (b_s, h, nk, d_v)

        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
        if attention_weights is not None:
            att = att * attention_weights
        # if attention_mask is not None:
        #     att = att.masked_fill(attention_mask, -np.inf)
        att = torch.softmax(att, -1)
        att = self.dropout(att)
        # if recorder.activate is True:
        #     recorder.record(att, comment=self.comment)
        try:
            out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        except RuntimeError:
            print(att.shape)  # debug aid: show the attention shape before re-raising
            raise
        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out
Traceback (most recent call last):
  File "/home/computer/xtc/tsmbase/models/containers.py", line 71, in statefulness
    yield
  File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
    visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
  File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 162, in iter
    word_logprob = self.model.step(t, self.selected_words, visual, None, mode='feedback', **kwargs)
  File "/home/computer/xtc/tsmbase/models/transformer/transformer.py", line 79, in step
    return self.decoder(it, self.enc_output, self.mask_enc)  # 100%
  File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/computer/xtc/tsmbase/models/transformer/decoders.py", line 86, in forward
    seq = torch.arange(1, seq_len + 1).view(1, -1).expand(b_s, -1).to(input.device)  # (b_s, seq_len)
RuntimeError: CUDA error: device-side assert triggered

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/computer/xtc/tsmbase/train_lzc.py", line 520, in <module>
    scores = evaluate_metrics(model, dict_dataloader_val, text_field)
  File "/home/computer/xtc/tsmbase/train_lzc.py", line 73, in evaluate_metrics
    out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], 5, out_size=1)
  File "/home/computer/xtc/tsmbase/models/captioning_model.py", line 81, in beam_search
    return bs.apply(visual, out_size, return_probs, **kwargs)
  File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
    visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
  File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/contextlib.py", line 131, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/computer/xtc/tsmbase/models/containers.py", line 73, in statefulness
    self.disable_statefulness()
  File "/home/computer/xtc/tsmbase/models/containers.py", line 63, in disable_statefulness
    m.disable_statefulness()
  File "/home/computer/xtc/tsmbase/models/containers.py", line 64, in disable_statefulness
    self._reset_states()
  File "/home/computer/xtc/tsmbase/models/containers.py", line 50, in _reset_states
    self._buffers[name] = self._state_defaults[name].clone().detach().to(self._buffers[name].device)
RuntimeError: CUDA error: device-side assert triggered
/opt/conda/conda-bld/pytorch_1614378083779/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:115: operator(): block: [0,0,0], thread: [9,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1614378083779/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:115: operator(): block: [0,0,0], thread: [14,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
/opt/conda/conda-bld/pytorch_1614378083779/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:115: operator(): block: [0,0,0], thread: [18,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && "index out of bounds"` failed.
Epoch 0 - evaluation:   0%|                                                                          | 0/50 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/computer/xtc/tsmbase/models/containers.py", line 71, in statefulness
    yield
  File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
    visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
  File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 162, in iter
    word_logprob = self.model.step(t, self.selected_words, visual, None, mode='feedback', **kwargs)
  File "/home/computer/xtc/tsmbase/models/transformer/transformer.py", line 79, in step
    return self.decoder(it, self.enc_output, self.mask_enc)  # 100%
  File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/computer/xtc/tsmbase/models/transformer/decoders.py", line 87, in forward
    seq = torch.arange(1, seq_len + 1).view(1, -1).expand(b_s, -1).to(input.device)  # (b_s, seq_len)
RuntimeError: CUDA error: device-side assert triggered

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/computer/xtc/tsmbase/train_lzc.py", line 520, in <module>
    scores = evaluate_metrics(model, dict_dataloader_val, text_field)
  File "/home/computer/xtc/tsmbase/train_lzc.py", line 73, in evaluate_metrics
    out, _ = model.beam_search(images, 20, text_field.vocab.stoi['<eos>'], 5, out_size=1)
  File "/home/computer/xtc/tsmbase/models/captioning_model.py", line 81, in beam_search
    return bs.apply(visual, out_size, return_probs, **kwargs)
  File "/home/computer/xtc/tsmbase/models/beam_search/beam_search.py", line 117, in apply
    visual, outputs = self.iter(t, visual, outputs, return_probs, **kwargs)
  File "/usr/local/anaconda3/envs/Base_caption/lib/python3.8/contextlib.py", line 131, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/computer/xtc/tsmbase/models/containers.py", line 73, in statefulness
    self.disable_statefulness()
  File "/home/computer/xtc/tsmbase/models/containers.py", line 63, in disable_statefulness
    m.disable_statefulness()
  File "/home/computer/xtc/tsmbase/models/containers.py", line 64, in disable_statefulness
    self._reset_states()
  File "/home/computer/xtc/tsmbase/models/containers.py", line 50, in _reset_states
    self._buffers[name] = self._state_defaults[name].clone().detach().to(self._buffers[name].device)
RuntimeError: CUDA error: device-side assert triggered
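The ScatterGatherKernel assertion ('index out of bounds') is the real failure; the later 'device-side assert triggered' errors are just subsequent kernels seeing the already-poisoned CUDA context. During beam search this usually means a token index (for example a special-token or vocabulary id used at evaluation) falls outside the range of an embedding or one-hot scatter. A small debugging sketch, assuming text_field is the torchtext field used in train_lzc.py (token names other than '<eos>' are guesses); CUDA_LAUNCH_BLOCKING has to be set before the process initializes CUDA:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"   # surface the failing kernel at its true call site

vocab_size = len(text_field.vocab)
for tok in ('<bos>', '<eos>', '<pad>', '<unk>'):
    idx = text_field.vocab.stoi.get(tok)
    print(tok, idx)
    assert idx is not None and 0 <= idx < vocab_size, f"{tok} maps outside the {vocab_size}-word vocab"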
    with tqdm(desc=f'Epoch {epoch} - train', unit='it', total=len(dataloaders['train'])) as pbar:
        for it, batch in enumerate(dataloaders['train']):
            out = model(batch['samples'], batch['captions'])
            optimizers['model'].zero_grad()
            optimizers['backbone'].zero_grad()

            captions_gt = batch['captions'][:, 1:].contiguous()
            out = out[:, :-1].contiguous()
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions_gt.view(-1))
            loss.backward()

            optimizers['model'].step()
            optimizers['backbone'].step()

            loss = gather_result(loss)
            running_loss += loss.item()

            pbar.set_postfix(loss=running_loss / (it + 1))
            pbar.update()

            if scheduler is not None:
                lr = scheduler.step()
                assert optimizers['model'].param_groups[0]['lr'] == lr, "LR scheduler doesn't work properly."

            if rank == 0:
                writer.add_scalar(
                    'backbone_lr',
                    optimizers['backbone'].param_groups[0]['lr'],
                    epoch * len(dataloaders['train']) + it,
                )
                writer.add_scalar(
                    'model_lr',
                    optimizers['model'].param_groups[0]['lr'],
                    epoch * len(dataloaders['train']) + it,
                )
                lr = optimizers['model'].param_groups[0]['lr']
            # break
    val_loss = evaluate_loss(model, dataloaders['valid'], loss_fn, text_field, epoch, writer)

    if rank == 0:
        save_checkpoint(
            model=model,
            optimizers=optimizers,
            epoch=epoch,
            scores=[],
            best_ciders=(0, 0),
            config=config,
            filename='checkpoint_last.pth',
            scheduler=scheduler,
        )
    torch.distributed.barrier()

    return {
        'loss': running_loss / len(dataloaders['train']),
        'reward': 0,
        'reward_baseline': 0,
        'val_loss': val_loss,
    }
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat

from models.common.swin_model import *
from utils.misc import nested_tensor_from_tensor_list, NestedTensor
from models.detection.det_module import build_det_module_with_config
from models.attention import MultiHeadAttention



class Detector(nn.Module):

    def __init__(
        self,
        backbone,
        det_module=None,
        use_gri_feat=True,
        use_reg_feat=True,
        hidden_dim=256,
    ):
        super().__init__()
        self.backbone = backbone
        self.use_gri_feat = use_gri_feat
        self.use_reg_feat = use_reg_feat
        self.self_att = MultiHeadAttention(1024, 64, 64, 8, 0.1, can_be_stateful=False)
        self.self_att2 = MultiHeadAttention(512, 64, 64, 8, 0.1, can_be_stateful=False)
        # self.window_size=[6,6]
        # self.relative_position_bias_table = nn.Parameter(
        #     torch.zeros(3*self.window_size[0]*self.window_size[1]))
        # get pair-wise relative position index for each token inside the window
        # coords_h = torch.arange(self.window_size[0])
        # coords_w = torch.arange(self.window_size[1])
        # coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        # coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        # relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        # relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        # relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        # relative_coords[:, :, 1] += self.window_size[1] - 1
        # relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        # self.position_embeding = nn.Linear(1024,1024)
        # self.position_embeding = nn.LayerNorm(1,6,18,1024)
        if self.use_reg_feat:
            self.det_module = det_module
            self.input_proj = nn.ModuleList([
                nn.Sequential(
                    nn.Conv2d(backbone.num_channels[i], hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                ) for i in range(len(backbone.num_channels))
            ])


    # def positional_encoding_2d(self, height, width, d_model, m):
    #     position_encoding = np.zeros((height, width, d_model))
      
    #     # fill the even and odd channels of the positional encoding
    #     for pos in range(d_model // 2):
    #         for i in range(height):
    #             for j in range(width):
    #                 angle = pos / np.power(10000, 2 * (pos // 2) / d_model)
    #                 position_encoding[i, j, pos * 2] = np.sin(angle * i)
    #                 position_encoding[i, j, pos * 2 + 1] = np.cos(angle * i)

        # if 1/10 < m < 1/4:
        #     position_encoding[:, 6, :] = position_encoding[:, 7, :]
        #     position_encoding[:, 12, :],position_encoding[:, 13, :] = position_encoding[:, 13, :],position_encoding[:, 12, :]
        # elif 1/4 < m < 5/12:
        #     position_encoding[:, 5, :] = position_encoding[:, 7, :]
        #     position_encoding[:, 6, :] = position_encoding[:, 8, :]
        #     position_encoding[:, 11, :],position_encoding[:, 13, :] = position_encoding[:, 13, :],position_encoding[:, 11, :]
        #     position_encoding[:, 12, :],position_encoding[:, 14, :] = position_encoding[:, 14, :],position_encoding[:, 12, :]
                    
        # return position_encoding



    def forward(self, images: NestedTensor):
        # - images.tensor: batched images, of shape [batch_size x 3 x H x W]
        # - images.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        device = torch.device(f"cuda:{0}")
        torch.cuda.set_device(0)

        if isinstance(images, (list, torch.Tensor)):
            # convert raw tensors into a NestedTensor so the padding mask below is available
            images = nested_tensor_from_tensor_list([img for img in images])

        x = images.tensors  
        B, _, _, _ = x.shape
        x = x.reshape([-1,3,384,384])# (B,9,384,384)->(3B,3,384,384)
        mask = images.mask  # padding mask [B, H, W]
        mask = repeat(mask, 'b h w -> (repeat b) h w',repeat=3)
        features = self.backbone(x)

        # relative_position_bias = self.relative_position_bias_table.view(
        #     6,18, -1)  # Wh*Ww,Wh*Ww,nH

        masks = [
            F.interpolate(mask[None].float(), size=f.shape[-2:]).to(torch.bool)[0] for l, f in enumerate(features)
        ]  # masks [[B, Hi, Wi]]

        outputs = {}
        # b, c, h, w = features[-1].shape
        # relative_position_bias = relative_position_bias.unsqueeze(0).repeat(int(b/3),1,1,c) # nH, Wh*Ww, Wh*Ww

        # features[-1] = features[-1].permute(0,2,3,1).reshape(int(b/3),h,3*w,c)+relative_position_bias

        # features[-1] = features[-1].permute(0,3,1,2).reshape(b,c,h,w)
    
        outputs['gri_feat'] = rearrange(features[-1], 'b c h w -> b (h w) c').reshape([B,-1,1024]) #(3B,1024,6,6)->(3B,36,1024)->(B,108,1024)
        # gri_feat= gri_feat.reshape(B, h, 3*w, c) #(B, h, 3w, c)
        # position_feature =self.positional_encoding_2d(h, 3*w, 1024, m)
        # position_feature = torch.from_numpy(position_feature).unsqueeze(0).expand(B, h, 3*w, c)
        # position_feature = position_feature.to(gri_feat.dtype)
        # position_feature = position_feature.to(device)
        # position_feature = position_feature.reshape([B,-1,1024])
        # position_feature = self.position_embeding(position_feature)
        # position_feature = position_feature.reshape(B, h, 3*w, c)
        # outputs['gri_feat'] = (gri_feat + position_feature).reshape([B,-1,1024])
        

        # outputs['gri_feat'] = self.self_att(gri_feat,gri_feat,gri_feat)

        outputs['gri_mask'] = repeat(masks[-1], 'b h w -> b 1 1 (h w)').reshape(B,1,1,-1) 

        if self.use_reg_feat:
            srcs = [self.input_proj[l](src) for l, src in enumerate(features)]
            hs, _, _ = self.det_module(srcs, masks)
            reg_feat = hs[-1].reshape(B,-1,512)
            outputs['reg_feat'] = self.self_att2(reg_feat,reg_feat,reg_feat)

            outputs['reg_mask'] = hs[-1].data.new_full((hs[-1].shape[0], 1, 1, hs[-1].shape[1]), 0).bool().reshape(B,1,1,-1)
        return outputs


def build_detector(config):
    pos_dim = getattr(config.model.detector, 'pos_dim', None)
    backbone, _ = swin_base_win7_384(
        frozen_stages=config.model.frozen_stages,
        pos_dim=pos_dim,
    )
    det_cfg = config.model.detector
    det_module = build_det_module_with_config(det_cfg) if config.model.use_reg_feat else None #detector
    detector = Detector(
        backbone,
        det_module=det_module,
        hidden_dim=config.model.d_model,
        use_gri_feat=config.model.use_gri_feat,
        use_reg_feat=config.model.use_reg_feat,
    )
    if os.path.exists(config.model.detector.checkpoint):
        checkpoint = torch.load(config.model.detector.checkpoint, map_location='cpu')
        missing, unexpected = detector.load_state_dict(checkpoint['model'], strict=False)
        print(f"Loading weights for detector: missing: {len(missing)}, unexpected: {len(unexpected)}.")
    return detector
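The reshuffling from per-view features back to per-panorama tokens is the easiest place to get shapes wrong; a toy check of the gri_feat path above, assuming the backbone returns a (3B, 1024, 6, 6) map for the three 384x384 views:

import torch
from einops import rearrange

B = 2
feat = torch.randn(3 * B, 1024, 6, 6)                             # (3B, C, H, W) from the Swin backbone
gri_feat = rearrange(feat, 'b c h w -> b (h w) c').reshape(B, -1, 1024)
print(gri_feat.shape)                                             # torch.Size([2, 108, 1024]) -> 3 views x 36 tokens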
# ------------------------------------------------------------------------
# GRIT: Faster and Better Image captioning Transformer
# Licensed under the Creative Commons Attribution.
# ------------------------------------------------------------------------
# Modified from Meshed Memory Transformer
# https://github.com/aimagelab/meshed-memory-transformer
# ------------------------------------------------------------------------
import numpy as np
import torch
from torch import nn
from models.caption.containers import Module
from einops import rearrange, repeat


def init_params(module):
    for name, param in module.named_parameters():
        if 'weight' in name:
            nn.init.xavier_uniform_(param)
        elif 'bias' in name:
            nn.init.constant_(param, 0)
        elif 'm_' in name:  # for memory
            nn.init.normal_(param, mean=0, std=0.01)


class Attention(nn.Module):
    '''
    Scaled dot-product attention
    '''

    def __init__(self, d_model, n_heads, dropout=0.2, n_memories=0):
        super().__init__()
        self.fc_q = nn.Linear(d_model, d_model)
        self.fc_k = nn.Linear(d_model, d_model)
        self.fc_v = nn.Linear(d_model, d_model)
        self.fc_o = nn.Linear(d_model, d_model)

        # * adapted from Meshed-Memory Transformers; n_memories: # mem slots
        if n_memories > 0:
            self.m_k = nn.Parameter(torch.FloatTensor(1, n_memories, d_model))
            self.m_v = nn.Parameter(torch.FloatTensor(1, n_memories, d_model))

        self.dropout = nn.Dropout(p=dropout)
        self.window_size = [6,18]
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), n_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        first_column = coords[1][:, 0]
        other_columns = coords[1][:, 1:]
        coords[1] = torch.cat((other_columns,first_column.view(-1,1)),dim=1)
        # m = 0
        # if 1/10 < m < 1/4:
        #     coords[1][:, 5] = coords[1][:, 6]
        #     coords[1][:, 11],coords[1][:, 12] = coords[1][:, 12],coords[1][:, 11].clone()
        # elif 1/4 < m < 5/12:
        #     coords[1][:, 4] = coords[1][:, 6]
        #     coords[1][:, 5] = coords[1][:, 7]
        #     coords[1][:, 10],coords[1][:, 12] = coords[1][:, 12],coords[1][:, 10].clone()
        #     coords[1][:, 11],coords[1][:, 13] = coords[1][:, 13],coords[1][:, 11].clone()
        coords = torch.stack((coords[0],coords[1]), dim=0)
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_memories = n_memories
        self.d_k = d_model // n_heads

        self.apply(init_params)

    def forward(self, q, k, v, attention_mask=None):
        # q, k, v: (b, n, d_model), mask: (b, n, n)
        nq, nk = q.shape[1], k.shape[1]

        if self.n_memories > 0:
            m_k = repeat(self.m_k, '() m d_model -> b m d_model', b=q.shape[0]) * np.sqrt(self.d_k)
            m_v = repeat(self.m_v, '() m d_model -> b m d_model', b=q.shape[0]) * np.sqrt(self.n_memories)
            q = rearrange(self.fc_q(q), 'b nq (head d) -> b head nq d', head=self.n_heads)

            k = torch.cat([self.fc_k(k), m_k], 1)
            v = torch.cat([self.fc_v(v), m_v], 1)
            k = rearrange(k, 'b nk (head d) -> b head d nk', head=self.n_heads)
            v = rearrange(v, 'b nv (head d) -> b head nv d', head=self.n_heads)

            scores = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
            # if attention_weights is not None:
            # scores = torch.cat([scores[:, :, :, :nk] * attention_weights, scores[:, :, :, nk:]], dim=-1)
            if attention_mask is not None:
                scores[:, :, :, :nk] = scores[:, :, :, :nk].masked_fill(attention_mask.bool(), -np.inf)
        else:
            q = rearrange(self.fc_q(q), 'b nq (head d) -> b head nq d', head=self.n_heads)
            k = rearrange(self.fc_k(k), 'b nk (head d) -> b head d nk', head=self.n_heads)
            v = rearrange(self.fc_v(v), 'b nv (head d) -> b head nv d', head=self.n_heads)

            scores = torch.matmul(q, k) / np.sqrt(self.d_k)  # [b h nq nk]
            # if attention_weights is not None:
            # scores = scores * attention_weights
            if attention_mask is not None:
                scores = scores.masked_fill(attention_mask.bool(), -np.inf)

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        relative_position_bias = relative_position_bias.unsqueeze(0)
        p_attn = torch.softmax(scores, -1)
        p_attn = self.dropout(p_attn)
        if p_attn.size(2) == 108:
            # only the 6x18 grid of visual tokens (108 queries) receives the relative position bias
            p_attn = p_attn + relative_position_bias
        # [b h nq nk] * [b h nk dv] = [b h nq dv] -> [b nq h dv] -> [b nq h*dv]
        out = rearrange(torch.matmul(p_attn, v), 'b h nq dv -> b nq (h dv)')

        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out


class MemoryAttention(nn.Module):

    def __init__(self, d_model, n_heads, n_memories, dropout=0.0):
        # * adapted from Meshed-Memory Transformers; n_memories: # mem slots
        super().__init__()
        self.fc_q = nn.Linear(d_model, d_model)
        self.fc_k = nn.Linear(d_model, d_model)
        self.fc_v = nn.Linear(d_model, d_model)
        self.fc_o = nn.Linear(d_model, d_model)
        if n_memories > 0:
            self.m_k = nn.Parameter(torch.FloatTensor(1, n_memories, d_model))
            self.m_v = nn.Parameter(torch.FloatTensor(1, n_memories, d_model))
        self.dropout = nn.Dropout(p=dropout)

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_memories = n_memories
        self.d_k = d_model // n_heads

        self.apply(init_params)

    def forward(self, q, k, v, attention_mask=None, attention_weights=None):
        # q, k, v: (b, n, d_model), mask: (b, n, n) - True indicates masking

        b_s, nq = q.shape[:2]
        nk = k.shape[1]
        if self.n_memories > 0:
            m_k = repeat(self.m_k, '() m d_model -> b m d_model', b=q.shape[0]) * np.sqrt(self.d_k)
            m_v = repeat(self.m_v, '() m d_model -> b m d_model', b=q.shape[0]) * np.sqrt(self.n_memories)
            q = rearrange(self.fc_q(q), 'b nq (head d) -> b head nq d', head=self.n_heads)

            k = torch.cat([self.fc_k(k), m_k], 1)
            v = torch.cat([self.fc_v(v), m_v], 1)
            k = rearrange(k, 'b nk (head d) -> b head d nk', head=self.n_heads)
            v = rearrange(v, 'b nv (head d) -> b head nv d', head=self.n_heads)

            scores = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
            if attention_weights is not None:
                scores = torch.cat([scores[:, :, :, :nk] * attention_weights, scores[:, :, :, nk:]], dim=-1)
            if attention_mask is not None:
                scores[:, :, :, :nk] = scores[:, :, :, :nk].masked_fill(attention_mask.bool(), -np.inf)
        else:
            q = rearrange(self.fc_q(q), 'b nq (head d) -> b head nq d', head=self.n_heads)
            k = rearrange(self.fc_k(k), 'b nk (head d) -> b head d nk', head=self.n_heads)
            v = rearrange(self.fc_v(v), 'b nv (head d) -> b head nv d', head=self.n_heads)

            scores = torch.matmul(q, k) / np.sqrt(self.d_k)  # [b h nq nk]
            if attention_weights is not None:
                scores = scores * attention_weights
            if attention_mask is not None:
                scores = scores.masked_fill(attention_mask.bool(), -np.inf)

        p_attn = torch.softmax(scores, dim=-1)
        p_attn = self.dropout(p_attn)

        # [b h nq nk] * [b h nk dv] = [b h nq dv] -> [b nq h dv] -> [b nq h*dv]
        out = rearrange(torch.matmul(p_attn, v), 'b h nq dv -> b nq (h dv)')
        out = self.fc_o(out)  # (b_s, nq, d_model)
        return out


class MultiHeadAttention(Module):

    def __init__(self, d_model, n_heads, dropout=.1, n_memories=0, can_be_stateful=False):
        super().__init__()

        self.attention = Attention(d_model=d_model, n_heads=n_heads, dropout=dropout, n_memories=n_memories)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_model)

        self.can_be_stateful = can_be_stateful
        if self.can_be_stateful:  # store prev computed K & V for fast inference
            self.register_state('running_keys', torch.zeros((1, d_model)))
            self.register_state('running_values', torch.zeros((1, d_model)))

    def forward(self, queries, keys, values, attention_mask=None):
        if self.can_be_stateful and self._is_stateful:
            # keys, values:             from the current input token: [B, 1, D]
            # running_keys, values:     from prev tokens: [B, t-1, D]
            self.running_keys = torch.cat([self.running_keys, keys], 1)
            self.running_values = torch.cat([self.running_values, values], 1)
            if self.timestep == 0:
                keys = self.running_keys = self.running_keys[:, 1:]  # [B t D]
                values = self.running_values = self.running_values[:, 1:]  # [B t D]
            else:
                keys = self.running_keys  # [B t D]
                values = self.running_values  # [B t D]

            self.timestep += 1

        out = self.attention(queries, keys, values, attention_mask)
        out = self.dropout(out)
        out = self.layer_norm(queries + out)
        return out
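A small forward pass makes the behaviour of the bias concrete; a sketch assuming d_model=1024 with 8 heads (any query length other than 108 skips the bias branch):

import torch

mha = MultiHeadAttention(d_model=1024, n_heads=8, dropout=0.1)
x = torch.randn(2, 108, 1024)        # (B, 6*18 grid tokens, d_model)
out = mha(x, x, x)                   # relative position bias is applied because nq == 108
print(out.shape)                     # torch.Size([2, 108, 1024])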
