Doc2Vec text features for NUS-WIDE, plus a 925-tag subset

The NUS-WIDE data itself was prepared in [1]; here a Doc2Vec feature is computed for its texts ([2] does the same), reusing some files from [1]. See [3] for the Doc2Vec environment setup.

According to [4], 75 of NUS-WIDE's 1000 tags also appear among the 81 concept classes, i.e. they are duplicated. The doc2vec features are therefore made in two versions:

  1. doc2vec features from the full 1000 tags;
  2. doc2vec features with those 75 duplicates removed, i.e. from the remaining 925 tags, along with a re-filtered clean id and a new dataset split.

Code

  • Following [3], the environment here is Python 2.7.
from __future__ import print_function
import os
import os.path as osp
import io
import pprint
import gensim
from gensim.models import Doc2Vec
import numpy as np
import scipy.io as sio


# some global variables
P = "/home/dataset/nuswide"
TAG_F = osp.join(P, "NUS_WID_Tags", "TagList1k.txt")
CLASS_F = osp.join(P, "Concepts81.txt")
LABEL_F = osp.join(P, "labels.npy")
TEXT_F = osp.join(P, "texts.AllTags1k.npy")
CLEAN_ID_F = osp.join(P, "clean_id.81.AllTags1k.npy")

# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01  # inference learning rate for infer_vector
infer_epoch = 1000  # number of inference steps for infer_vector
DIM = 300  # dimension of the doc2vec feature

# running inside a docker container, so chown generated files back to the host user
USER_ID = 1000
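
Before anything runs, a quick check (my own addition, not in the original run) that all the input files are in place:

# optional sanity check: all input files should exist before proceeding
for _f in (TAG_F, CLASS_F, LABEL_F, TEXT_F, CLEAN_ID_F, MODEL):
    assert osp.exists(_f), "missing file: " + _f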

previous files

  • NUS-WIDE's original tag list and class list, plus the data processed earlier in [1]
print("完整的 1000 个 tags")
tag_id = {}
with io.open(TAG_F, "r", encoding='utf-8') as f:
    for tid, line in enumerate(f):
        tn = line.strip()
        if tn:
            tag_id[tn] = tid
print("#tags:", len(tag_id))  # 1000
id_tag = {tid: t for t, tid in tag_id.items()}

print("81 个类")
cls_id = {}
with open(CLASS_F, "r") as f:
    for cid, line in enumerate(f):
        cn = line.strip()
        if cn:
            cls_id[cn] = cid
print("#class:", len(cls_id))  # 81

print("之前处理的 texts、labels、clean_id")
texts = np.load(TEXT_F).astype(np.int32)
print("texts:", texts.shape, texts.dtype)
labels = np.load(LABEL_F).astype(np.int32)
print("labels:", labels.shape, labels.dtype)
clean_id = np.load(CLEAN_ID_F)
print("original clean_id:", clean_id.shape)  # (203598,)

925-tags subset

print("tags 和 class 重合的部分")
overlap = set(tag_id.keys()) & set(cls_id.keys())
print("overlap:", len(overlap), overlap)
overlap_tid = np.asarray([tag_id[t] for t in overlap])
print("overlap tag id:", overlap_tid.shape)  # 75

print("925 tags 子集")
print("cleaned tags that have NO overlap with class labels")
texts_no = texts.copy()
for t in overlap_tid:
    texts_no[:, t] = 0
print("original text cardinality:", texts.sum())  # 1559464
print("cleaned text cardinality:", texts_no.sum())  # 1328870

new clean id

  • Starting from the old clean_id.81.AllTags1k.npy, filter out the samples whose text becomes empty on the 925-tag subset
  • 2620 samples fewer than before
print("new clean id derived from the old one and `texts_no`")
clean_id_no = []
for sid in clean_id:
    if texts_no[sid].sum() > 0:
        clean_id_no.append(sid)

clean_id_no = np.asarray(clean_id_no)
print("new clean_id:", clean_id_no.shape)  # (200978,)
print(len(clean_id) - len(clean_id_no), "data less")  # 2620
_f_new_clean_id = osp.join(P, "clean_id.81.925tags.mat")
sio.savemat(_f_new_clean_id, {"clean_id": clean_id_no})
os.system("chown {0}:{0} {1}".format(USER_ID, _f_new_clean_id))

new splitting

print("new splitting based on the new clean id")
N_TEST = 2000
N_TRAIN = 10000

# use new clean id: `clean_id_no`
indices = np.random.permutation(clean_id_no.shape[0]).astype(np.int32)
idx_test = indices[:N_TEST]
idx_ret = indices[N_TEST:]
idx_labeled = idx_ret[:N_TRAIN]
idx_unlabeled = idx_ret[N_TRAIN:]
# (2000,) (198978,) (10000,) (188978,)
print(idx_test.shape, idx_ret.shape, idx_labeled.shape, idx_unlabeled.shape)
_f_split = osp.join(P, "split.nuswide.2k.10k.925tags.mat")
sio.savemat(_f_split, {
    "idx_test": idx_test,
    "idx_labeled": idx_labeled,
    "idx_unlabeled": idx_unlabeled,
    "idx_ret": idx_ret,
})
os.system("chown {0}:{0} {1}".format(USER_ID, _f_split))

doc2vec features

  • Two versions are made: one from the full 1000 tags, one from the 925 tags
# pre-trained Doc2Vec model
model = Doc2Vec.load(MODEL)
zero_emb = np.zeros([DIM], dtype=np.float32)  # fallback embedding for empty texts


def infer_vec(text):
    # map the multi-hot tag vector back to its tag words
    sentence = [id_tag[i] for i in range(len(text)) if (text[i] > 0)]
    assert len(sentence) == text.sum()
    if len(sentence) > 0:
        # doc = gensim.utils.simple_preprocess(" ".join(sentence))
        doc = sentence
        # use the inference hyper-parameters defined above
        vec = model.infer_vector(doc, alpha=start_alpha, steps=infer_epoch)
    else:
        # empty text falls back to the all-zero embedding
        vec = zero_emb
    return vec
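
Note that gensim's infer_vector is stochastic (it starts from a random initialization), so re-running this script yields slightly different feature values for the same inputs.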


d2v_origin, d2v_no = [], []
for i, (tx_origin, tx_no) in enumerate(zip(texts, texts_no)):
    # print(tx_origin.shape, tx_origin.sum())

    # 1. original text
    vec_origin = infer_vec(tx_origin)
    d2v_origin.append(vec_origin[np.newaxis, :])

    # 2. text with no overlap
    vec_no = infer_vec(tx_no)
    d2v_no.append(vec_no[np.newaxis, :])

    if i % 1000 == 0:
        print(i)
    # break

d2v_origin = np.vstack(d2v_origin)
print("original d2v:", d2v_origin.shape, d2v_origin.dtype)  # (269648, 300) float32
d2v_no = np.vstack(d2v_no)
print("d2v with NO overlap:", d2v_no.shape, d2v_no.dtype)  # (269648, 300) float32

_f_origin = osp.join(P, "texts.AllTags1k.doc2vec.{}.mat".format(DIM))
sio.savemat(_f_origin, {"texts": d2v_origin})
os.system("chown {0}:{0} {1}".format(USER_ID, _f_origin))

# ref: Fast zero-shot image tagging
# with the 75 duplicated tags removed, 925 tags remain
_f_no = osp.join(P, "texts.925tags.doc2vec.{}.mat".format(DIM))
sio.savemat(_f_no, {"texts": d2v_no})
os.system("chown {0}:{0} {1}".format(USER_ID, _f_no))

References

  1. NUS-WIDE dataset preprocessing
  2. Correlated Features Synthesis and Alignment for Zero-shot Cross-modal Retrieval
  3. MS COCO 2017 dataset preprocessing
  4. Fast zero-shot image tagging