NUS-WIDE数据集划分


NUS-WIDE[1]是多标签数据集,看到几篇都是类似 [1] 的划分方式:每个类随机选 100 个造成 query set。感觉有些谜,问 DCMH 作者,见 [3]。
现在的策略是:按类来抽,保证每个类的样本数目,而且不放回,保证不重复。(莫非本来就是这个意思?)
我还根据每个类的样本数,从少到多地选,虽然现在看来似乎没必要。
用的数据也是 DCMH 作者提供的,见 [3] 所在 repo。或见 [4]。

Code

splitting

对于 semi-supervised:training set = labeled part,labeled + unlabeled = retrieval set。
test set 跟 query set 同义。

import numpy as np
import scipy.io as sio
import os
from os.path import join
import time
import matplotlib.pyplot as plt

# NOTE(review): time-based seed makes every run produce a different split —
# confirm non-reproducibility is intended.
np.random.seed(int(time.time()))

# Load the multi-label matrix: one row per sample, one column per class.
NUSWIDE = "/usr/local/dataset/nuswide-tc21"
labels = sio.loadmat(join(NUSWIDE, "nus-wide-tc21-lall.mat"))["LAll"]
print(labels.shape)  # (195834, 21)


N_CLASS = labels.shape[1]
N_SAMPLE = labels.shape[0]
TEST_PER = 100  # samples per class in the test (query) set
TRAIN_PER = 500  # samples per class in the training set
N_TEST = TEST_PER * N_CLASS
N_TRAIN = TRAIN_PER * N_CLASS


"""1. 先保证 test set 的每类至少 100"""
indices = list(range(N_SAMPLE))  # 全部索引
np.random.shuffle(indices)

cls_sum = np.sum(labels[indices], axis=0)  # 统计每个类样本数
#print(cls_sum)
classes = np.argsort(cls_sum)  # 从少到多
#print(classes)

id_test = []
cnt = np.zeros_like(labels[0], dtype=np.int32)  # 默认 int8,会爆
for cls in classes:
    print("--- {} ---".format(cls))
    for i in indices:
        if cnt[cls] >= TEST_PER:  # 此类已抽够
            break
        if labels[i][cls] == 1:
            id_test.append(i)
            cnt += labels[i]
    #print(cnt)
    assert cnt[cls] >= TEST_PER  # 讲道理一趟下来是肯定够的
    indices = list(set(indices) - set(id_test))  # 去掉已抽部分的 id
    np.random.shuffle(indices)
    #print("left:", len(indices))

assert len(set(id_test)) == len(id_test)  # 验证没有重复
#print("cnt:", cnt)
print("#test:", len(id_test))


"""2. 类似地,保证 training set 的每类至少 500"""
indices = list(set(indices) - set(id_test))  # 去掉刚才选过的那些 test id
np.random.shuffle(indices)
print(len(indices))

cls_sum = np.sum(labels[indices], axis=0)
#print(cls_sum)
classes = np.argsort(cls_sum)
#print(classes)

id_train = []
cnt = np.zeros_like(labels[0], dtype=np.int32)
for cls in classes:
    print("--- {} ---".format(cls))
    for i in indices:
        if cnt[cls] >= TRAIN_PER:
            break
        if labels[i][cls] == 1:
            id_train.append(i)
            cnt += labels[i]
    #print(cnt)
    assert cnt[cls] >= TRAIN_PER
    indices = list(set(indices) - set(id_train))
    np.random.shuffle(indices)
    #print("left:", len(indices))

assert len(set(id_train)) == len(id_train)
#print("cnt:", cnt)
print("#train:", len(id_train))


"""3. 补足 test 和 training set 剩余的部分"""
indices = list(set(indices) - set(id_train))  # 再去掉刚才选过的 train id
np.random.shuffle(indices)
#print(len(indices))

lack_test = N_TEST - len(id_test)
lack_train = N_TRAIN - len(id_train)
print("lack:", lack_test, ",", lack_train)

id_test.extend(indices[:lack_test])
id_train.extend(indices[lack_test: lack_test + lack_train])

print("#total test:", len(id_test))
print("#total train:", len(id_train))


"""4. unlabeled 部分"""
# unlabeled = all - labeled(training) - query(test)
id_unlabeled = list(set(indices) - set(id_train) - set(id_test))
print("#unlabeled:", len(id_unlabeled))


"""5. retrieval set"""
id_ret = id_train + id_unlabeled
print("#retrieval:", len(id_ret))


"""保存"""
_info = "nuswide-tc21.{}pc.{}pc".format(TEST_PER, TRAIN_PER)
SAV_P = join(NUSWIDE, _info)
if not os.path.exists(SAV_P):
    os.makedirs(SAV_P)

test_id = np.asarray(id_test)
labeled_id = np.asarray(id_train)
unlabeled_id = np.asarray(id_unlabeled)
ret_id = np.asarray(id_ret)

np.save(join(SAV_P, "idx_test.npy"), test_id)
np.save(join(SAV_P, "idx_labeled.npy"), labeled_id)
np.save(join(SAV_P, "idx_unlabeled.npy"), unlabeled_id)
np.save(join(SAV_P, "idx_ret.npy"), ret_id)

image mean

  • 计算两种图像均值:按 pixel 平均、按 channel 平均。
  • 图像来自 nus-wide-tc21-iall.mat,前期将其按 id 分开每幅一个 .npy 文件,放在 images.npy/ 里。
"""计算图像均值:按 pixel、按 channel 两种"""

IMAGE_P = join(NUSWIDE, "images.npy")
_img = np.load(join(IMAGE_P, "1.npy"))
mean_pix = np.zeros_like(_img).astype(np.float32)  # [224, 224, 0]
mean_channel = np.zeros([3]).astype(np.float32)

for i, idx in enumerate(ret_id):
    img = np.load(join(IMAGE_P, "{}.npy".format(idx)))
    mean_pix += img
    mean_channel += np.mean(img, (0, 1))
    if i % 1000 == 0 or i == ret_id.shape[0] - 1:
        print(i)

mean_pix /= ret_id.shape[0]
mean_channel /= ret_id.shape[0]
print("mean channel:", mean_channel)  # [111.84164 107.72994  99.7127 ]

np.save(join(SAV_P, "avgpix.{}.npy".format(_info)), mean_pix)
np.save(join(SAV_P, "avgc.{}.npy".format(_info)), mean_channel)

References

  1. NUS-WIDE
  2. Simultaneous Feature Learning and Hash Coding with Deep Neural Networks
  3. details of partition of NUS-WIDE #8
  4. NUS-WIDE数据集预处理
以下是在 Windows 系统中使用 PyTorch 实现使用 ResNet 提取 NUS-WIDE 数据集特征并将其保存到图像和标签文件中的步骤: 1. 下载 NUS-WIDE 数据集并解压缩到指定文件夹。 2. 安装 PyTorch 和 torchvision 库。 3. 创建一个 Python 脚本,并导入必要的库: ```python import os import torch import torchvision.models as models import torchvision.transforms as transforms from PIL import Image import numpy as np import pandas as pd ``` 4. 加载预训练的 ResNet 模型: ```python model = models.resnet50(pretrained=True) ``` 5. 我们需要使用 ResNet 中的最后一个卷积层的输出作为特征向量。因此,我们需要创建一个新的模型,该模型只包含 ResNet 的前面的层和最后一个卷积层: ```python model = torch.nn.Sequential(*(list(model.children())[:-1])) ``` 6. 加载 NUS-WIDE 数据集的图像,并将其转换为模型所需的格式: ```python transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) def load_image(image_path): image = Image.open(image_path) image = transform(image) image = image.unsqueeze(0) return image ``` 7. 加载数据集的标签: ```python labels = pd.read_csv("NUS-WIDE-urls/NUS-WIDE-urls.txt", sep="\t", header=None, usecols=[1]) ``` 8. 遍历数据集中的所有图像,并使用 ResNet 提取其特征向量: ```python features = [] for i, filename in enumerate(os.listdir("NUS-WIDE-urls/images")): image_path = os.path.join("NUS-WIDE-urls/images", filename) image = load_image(image_path) output = model(image) feature = output.detach().numpy().squeeze() features.append(feature) ``` 9. 将特征向量保存到 numpy 数组中: ```python features = np.array(features) np.save("features.npy", features) ``` 10. 将标签保存到 CSV 文件中: ```python labels.to_csv("labels.csv", index=False, header=False) ``` 这样,就可以在 Windows 条件下使用 PyTorch 和 ResNet 提取 NUS-WIDE 数据集的特征,并将它们保存到图像和标签文件中。
评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值