之前 NUS-WIDE 的数据见 [1],现给其 texts 处理一份 Doc2Vec 特征([2] 也有这么做),会用到 [1] 中的一些文件。Doc2Vec 环境准备见 [3]。
按 [4],NUS-WIDE 的 1000 个 tags 中,有 75 个是出现在那 81 个类中的,即重复了。此处 doc2vec 分做两份:
- 用完整的 1000 个 tags 做 doc2vec features;
- 去掉那 75 个重复的,用剩下的 925 个 tags 做,同时重新筛一份 clean id 和划分一次数据集。
Code
- 按 [3],此处环境是 python 2.7
from __future__ import print_function
import os
import os.path as osp
import io
import pprint
# from pycocotools.coco import COCO
import gensim
from gensim.models import Doc2Vec
import numpy as np
import scipy.io as sio
# ---- global configuration ----
P = "/home/dataset/nuswide"  # NUS-WIDE dataset root
TAG_F = osp.join(P, "NUS_WID_Tags", "TagList1k.txt")  # 1000-tag vocabulary, one tag per line
CLASS_F = osp.join(P, "Concepts81.txt")  # the 81 concept (class) names, one per line
LABEL_F = osp.join(P, "labels.npy")  # per-sample multi-hot class labels (from earlier preprocessing)
TEXT_F = osp.join(P, "texts.AllTags1k.npy")  # per-sample tag matrix; columns indexed by tag id
CLEAN_ID_F = osp.join(P, "clean_id.81.AllTags1k.npy")  # ids of samples kept by the earlier cleaning pass
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"  # pre-trained enwiki DBOW Doc2Vec model
start_alpha = 0.01  # NOTE(review): defined but not passed to infer_vector below — confirm intended
infer_epoch = 1000  # NOTE(review): defined but not passed to infer_vector below — confirm intended
DIM = 300 # dimension of the doc2vec feature
# Running inside a docker container, so generated files must be chown'ed
# back to the host user.
USER_ID = 1000
previous files
- NUS-WIDE 的原始 tag list、class list,和之前 [1] 中处理的数据
# Build tag-name <-> tag-id maps from the 1000-tag list.
# The tag id is the line number; blank lines keep their index but are skipped.
print("完整的 1000 个 tags")
with io.open(TAG_F, "r", encoding='utf-8') as f:
    _tag_lines = [ln.strip() for ln in f]
tag_id = {name: tid for tid, name in enumerate(_tag_lines) if name}
print("#tags:", len(tag_id)) # 1000
id_tag = {tid: name for name, tid in tag_id.items()}
# Build class-name -> class-id map from the 81-concept list,
# mirroring the tag-list parsing above.
print("81 个类")
with open(CLASS_F, "r") as f:
    _cls_lines = [ln.strip() for ln in f]
cls_id = {name: cid for cid, name in enumerate(_cls_lines) if name}
print("#class:", len(cls_id)) # 81
# Load the previously generated arrays (see note [1] in the header):
# texts (tag matrix), labels (class matrix) and the earlier clean-id list.
print("之前处理的 texts、labels、clean_id")
texts = np.load(TEXT_F).astype(np.int32)
print("texts:", texts.shape, texts.dtype)
labels = np.load(LABEL_F).astype(np.int32)
print("labels:", labels.shape, labels.dtype)
# clean_id holds sample indices into `texts` / `labels` rows
clean_id = np.load(CLEAN_ID_F)
print("original clean_id:", clean_id.shape) # (203598,)
925-tags subset
# Find the tags that also appear among the 81 class names; those columns are
# zeroed out to build the 925-tag text variant.
print("tags 和 class 重合的部分")
# set(d) iterates dict keys directly -- no need for set(list(d.keys())).
overlap = set(tag_id).intersection(cls_id)
print("overlap:", len(overlap), overlap)
overlap_tid = np.asarray([tag_id[t] for t in overlap])
print("overlap tag id:", overlap_tid.shape) # 75
print("925 tags 子集")
print("cleaned tags that have NO overlap with class labels")
texts_no = texts.copy()
# Zero all overlapping tag columns in one vectorized fancy-index assignment
# instead of a Python loop over the 75 column ids.
texts_no[:, overlap_tid] = 0
print("original text cardinality:", texts.sum()) # 1559464
print("cleaned text cardinality:", texts_no.sum()) # 1328870
new clean id
- 在原来 clean_id.81.AllTags1k.npy 的基础上,筛掉 925-tags 子集上 text 为空的部分
- 比原来少了 2620 个
# Re-derive the clean-id list: keep only the old clean ids whose 925-tag text
# is still non-empty after zeroing the overlapping columns.
print("new clean id derived from the old one and `texts_no`")
# Vectorized: one row-sum pass + boolean mask instead of a Python loop
# over ~200k sample ids. Result order and dtype match the loop version.
_row_card = texts_no.sum(axis=1)
clean_id_no = clean_id[_row_card[clean_id] > 0]
print("new clean_id:", clean_id_no.shape) # (200978,)
print(len(clean_id) - len(clean_id_no), "data less") # 2620
_f_new_clean_id = osp.join(P, "clean_id.81.925tags.mat")
sio.savemat(_f_new_clean_id, {"clean_id": clean_id_no})
# Hand the file back to the host user (script runs as root in docker).
os.system("chown {0}:{0} {1}".format(USER_ID, _f_new_clean_id))
new splitting
# Random split over the new clean-id pool: 2k test, 10k labeled (train),
# the rest unlabeled; everything outside the test set is the retrieval set.
print("new splitting based on the new clean id")
N_TEST = 2000
N_TRAIN = 10000
# use new clean id: `clean_id_no`
perm = np.random.permutation(clean_id_no.shape[0]).astype(np.int32)
idx_test, idx_ret = perm[:N_TEST], perm[N_TEST:]
idx_labeled, idx_unlabeled = idx_ret[:N_TRAIN], idx_ret[N_TRAIN:]
# (2000,) (198978,) (10000,) (188978,)
print(idx_test.shape, idx_ret.shape, idx_labeled.shape, idx_unlabeled.shape)
_f_split = osp.join(P, "split.nuswide.2k.10k.925tags.mat")
split_dict = {
    "idx_test": idx_test,
    "idx_labeled": idx_labeled,
    "idx_unlabeled": idx_unlabeled,
    "idx_ret": idx_ret,
}
sio.savemat(_f_split, split_dict)
os.system("chown {0}:{0} {1}".format(USER_ID, _f_split))
doc2vec features
- 做了两份:完整 1000 tags 的、925 tags 的
# pre-trained Doc2Vec model
model = Doc2Vec.load(MODEL)
# Fallback embedding for samples with an empty text. Use the DIM constant
# instead of a hard-coded 300 so the feature dimension lives in one place.
zero_emb = np.zeros([DIM], dtype=np.float32)
def infer_vec(text):
    """Map one tag row (counts indexed by tag id) to a Doc2Vec vector.

    The document is the list of tag names whose count is positive; empty
    rows get the shared all-zero embedding instead of querying the model.
    """
    words = [id_tag[tid] for tid, cnt in enumerate(text) if cnt > 0]
    # Sanity check: assumes the row is 0/1-valued -- TODO confirm upstream.
    assert len(words) == text.sum()
    if not words:
        return zero_emb
    return model.infer_vector(words)
# Infer Doc2Vec features for both text variants in a single pass, then
# stack into (N, DIM) matrices and save each as a .mat file.
d2v_origin = []
d2v_no = []
for i, (row_full, row_clean) in enumerate(zip(texts, texts_no)):
    # 1. original 1000-tag text
    d2v_origin.append(infer_vec(row_full)[np.newaxis, :])
    # 2. 925-tag text with class-overlapping tags removed
    d2v_no.append(infer_vec(row_clean)[np.newaxis, :])
    if i % 1000 == 0:
        print(i)

d2v_origin = np.vstack(d2v_origin)
print("original d2v:", d2v_origin.shape, d2v_origin.dtype) # (269648, 300) float32
d2v_no = np.vstack(d2v_no)
print("d2v with NO overlap:", d2v_no.shape, d2v_no.dtype) # (269648, 300) float32

_f_origin = osp.join(P, "texts.AllTags1k.doc2vec.{}.mat".format(DIM))
sio.savemat(_f_origin, {"texts": d2v_origin})
os.system("chown {0}:{0} {1}".format(USER_ID, _f_origin))

# ref: Fast zero-shot image tagging
# with the `75` duplicated tags removed, there're `925` tags left
_f_no = osp.join(P, "texts.925tags.doc2vec.{}.mat".format(DIM))
sio.savemat(_f_no, {"texts": d2v_no})
os.system("chown {0}:{0} {1}".format(USER_ID, _f_no))