Pascal Sentences [1] contains 1000 image-text pairs from VOC 2008 [2]. Most images (but not all) have 5 descriptive sentences. The preprocessing here follows [3].
Data
The data is at [1]. Each image's label is hidden in the link path of the image (right-click an image and open it in a new tab to see the URL, or press F12 to inspect the element in the browser's developer tools).
Saving the web page directly downloads a `pascal-sentences.htm` file and a `pascal-sentences_files/` directory: the images are stored in the directory, while the sentences are embedded in the html file.
Originally (when inspecting in the browser) each image's class name is also embedded in its image path in the html, but in the downloaded html file the image paths are rewritten (they simply point into `pascal-sentences_files/`). To recover the class names, view the page source in the browser (right-click → View Page Source) and manually save a copy of the html, stored here as `page-source.html`; see the sketch below for the assumed path pattern.
All files are downloaded into `pascal-sentences/`.
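For reference, all the parsers below assume that the `src` attribute of each `<img>` tag in `page-source.html` follows a `<class name>/<image file name>` pattern; a minimal sketch with a hypothetical example value:
# sketch: the `<class>/<file>` path pattern assumed by the parsers below
src = "aeroplane/2008_000032.jpg"  # hypothetical example value
_cls, _img_f = src.split("/")
print(_cls, _img_f)  # -> aeroplane 2008_000032.jpg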
Sample Order
- Samples are sorted in ascending order of image ID.
- Image file names look like `2008_000032.jpg`: the `2008` prefix presumably corresponds to VOC 2008 and is ignored; `000032` is taken as the image ID.
- The mapping is written to `id-map.pascal-sentences.txt`.
# make.id-map.py
import os
import os.path as osp

P = "G:/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")

# file name format: `2008_000032.jpg` -> image ID 32
id_key = lambda x: int(x.split(".jpg")[0].split("2008_")[1])

img_files = os.listdir(IMAGE_P)
img_files = sorted(img_files, key=id_key)
print(img_files[:10])

# each line: `<sample ID> <image file name>`
with open(osp.join(P, "id-map.pascal-sentences.txt"), "w") as f:
    for sid, img_f in enumerate(img_files):
        f.write("{} {}\n".format(sid, img_f))
Class Order and Labels
- Parse the html file with `HTMLParser`, see [4,5].
- Classes are sorted by class name in ascending order and written to `class-name.pascal-sentences.txt`, in preparation for the class embeddings later.
- Labels follow the same order as `id-map.pascal-sentences.txt`.
# make.label.py
import os
import os.path as osp
from html.parser import HTMLParser
import numpy as np
import scipy.io as sio

P = "G:/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")

with open(HTML_F, "r") as f:
    html_txt = f.readlines()
# print(html_txt)
html_txt = "".join(html_txt)

class ParserLabel(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.img_cls = {}       # image file name -> class name
        self.class_set = set()

    def handle_starttag(self, tag, attrs):
        if "img" != tag:
            return
        assert len(attrs) == 1
        # src has the form `<class name>/<image file name>`
        _cls, _img_f = attrs[0][1].split("/")
        # print(_cls, _img_f)
        self.class_set.add(_cls)
        self.img_cls[_img_f] = _cls

print("parse annotations")
parser = ParserLabel()
parser.feed(html_txt)

print("class order")
class_set = sorted(list(parser.class_set))
cls_id = {}
with open(osp.join(P, "class-name.pascal-sentences.txt"), "w") as f:
    for cid, c in enumerate(class_set):
        f.write("{} {}\n".format(cid, c))
        cls_id[c] = cid

print("read sample order")
img_id = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for line in f:
        line = line.strip()
        if line:
            sid, img_f = line.split()
            img_id[img_f] = int(sid)

print("label")
assert len(parser.img_cls) == len(img_id)
N_DATA = len(parser.img_cls)
print("#data:", N_DATA)  # 1000
labels = np.zeros([N_DATA], dtype=np.int32)
for img, c in parser.img_cls.items():
    sid = img_id[img]
    cid = cls_id[c]
    labels[sid] = cid
# (1000,) 0 19 9.5 9500
print("labels:", labels.shape, labels.min(), labels.max(), labels.mean(), labels.sum())
sio.savemat(osp.join(P, "labels.pascal-sentences.mat"),
            {"labels": labels}, do_compression=True)
Class Embeddings
- Word2Vec features of the class names, in the same order as `class-name.pascal-sentences.txt`.
- Uses `GoogleNews-vectors-negative300.bin`; see [6,7] for the download.
- A few class names (or phrases) absent from this Word2Vec model are substituted, see the code below; a phrase is split into its component words, whose vectors are averaged.
# make.w2v.py
import os
import os.path as osp
import numpy as np
import scipy.io as sio
# from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

"""class Word2Vec embeddings
run `make.label.py` first
"""

P = "/home/dataset/pascal-sentences"
CLASS_NAME_F = osp.join(P, "class-name.pascal-sentences.txt")
# word2vec
MODEL = "/home/dataset/word2vec/GoogleNews-vectors-negative300.bin"
IN_DOCKER = True
USER_ID = 1000

cls_set = []
with open(CLASS_NAME_F, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            cid, cn = line.split()
            # substitute class names that the Word2Vec model lacks
            if "aeroplane" == cn:
                cn = "airplane"
            elif "diningtable" == cn:
                cn = "dining_table"
            elif "pottedplant" == cn:
                cn = "potted_plant"
            elif "tvmonitor" == cn:
                cn = "TV_monitor"
            cls_set.append(cn)
print("classes:", len(cls_set), cls_set)

w2v = KeyedVectors.load_word2vec_format(MODEL, binary=True)

print("find out the absent class names")
_file_name = osp.join(P, "absent-class-name.txt")
with open(_file_name, "w") as f:
    for c in cls_set:
        if c not in w2v:
            print(c)
            f.write("{}\n".format(c))
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))
"""absent class names
aeroplane -> airplane
diningtable -> dining_table
pottedplant -> potted_plant
tvmonitor -> TV_monitor
"""

print("class embedding")
class_emb = []
for c in cls_set:
    if c in w2v:
        class_emb.append(w2v[c])
    else:
        # phrase: split into words & average their vectors
        assert "_" in c, "absent single word: {}".format(c)
        c_list = c.split("_")
        tmp = 0
        for _c in c_list:
            tmp = tmp + w2v[_c]
        tmp /= len(c_list)
        class_emb.append(tmp)
class_emb = np.vstack(class_emb).astype(np.float32)
# (20, 300) -0.62109375 0.62890625 -0.010865917 -65.1955
print("class emb:", class_emb.shape, class_emb.min(), class_emb.max(), class_emb.mean(), class_emb.sum())

_file_name = osp.join(P, "class_emb.pascal-sentences.Gnews-300d.mat")
sio.savemat(_file_name, {"class_emb": class_emb})
# because I run this script in a docker container,
# I shall change the ownership & group of this file for convenience
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))
Images
- VGG 19 features of the images.
- Same order as `id-map.pascal-sentences.txt`.
# make.image.py
import os
import os.path as osp
import numpy as np
import scipy.io as sio
# import h5py
import cv2
from PIL import Image
import torch
import torchvision.models as models
import torchvision.transforms as transforms

P = "/home/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")
BATCH_SIZE = 100
IN_DOCKER = True
USER_ID = 1000

print("read sample order")
id_img = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for line in f:
        line = line.strip()
        if line:
            sid, img_f = line.split()
            id_img[int(sid)] = img_f
N = len(id_img)
print("#image:", N)

# VGG 19 truncated after fc7: drop the last Dropout & the 1000-way fc layer,
# so the model outputs 4096-d features
model = models.vgg19(pretrained=True)
model.classifier = model.classifier[:-2]
model = model.cuda().eval()  # eval mode: freeze dropout for deterministic features

trsf = transforms.Compose([
    # transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

fea_list = []
with torch.no_grad():
    for i in range(0, N, BATCH_SIZE):
        image_batch = []
        for sid in range(i, min(i + BATCH_SIZE, N)):
            img_p = osp.join(IMAGE_P, id_img[sid])
            img = cv2.imread(img_p)  # BGR, or None if OpenCV cannot decode it
            if img is None:
                # fall back to PIL; convert("RGB") also unifies
                # grey-scale / RGBA images into 3-channel RGB
                img_f = Image.open(img_p)
                img = np.asarray(img_f.convert("RGB"))
                img_f.close()
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
            img = trsf(Image.fromarray(img))
            # print(img.size())
            image_batch.append(img.unsqueeze(0))
        image_batch = torch.cat(image_batch, 0)
        # print(image_batch.size())
        image_batch = image_batch.cuda()
        fea = model(image_batch).cpu().numpy()
        # print(fea.shape)
        fea_list.append(fea)
        print(i)
        # if i > 0: break

Fea = np.vstack(fea_list).astype(np.float32)
print("image features:", Fea.shape)
_f_name = osp.join(P, "images.pascal-sentences.vgg19.{}d.mat".format(Fea.shape[1]))
sio.savemat(_f_name, {"images": Fea}, do_compression=True)
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))
Texts
- (Note, 2022.7.11) The text preprocessing in [8] has since been changed to Stanford CoreNLP tokenisation + lowercasing, but this post still uses the old `gensim.utils.simple_preprocess` method and the code below is not updated (flag: will update when I find time); consider redoing it with the new method in [8].
- Following [8], the 5 sentences of each image are concatenated and one Doc2Vec feature is extracted.
- The environment for this section is the same as the corresponding section of [8]: a python 2.7 virtual environment created in the container from [11], plus the pre-trained Doc2Vec model from [9] and the old gensim from [10].
- The `HTMLParser` package name differs between python 2 and python 3, see [12] and the code below.
# make.text.py
from __future__ import print_function
import os
import os.path as osp
from HTMLParser import HTMLParser  # python 2
# from html.parser import HTMLParser  # python 3
import numpy as np
import scipy.io as sio
import gensim
from gensim.models import Doc2Vec

"""text Doc2Vec feature
run `make.id-map.py` before this file
"""

P = "/home/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
DIM = 300  # dimension of the doc2vec feature
IN_DOCKER = True
USER_ID = 1000

with open(HTML_F, "r") as f:
    html_txt = f.readlines()
# print(html_txt)
html_txt = "".join(html_txt)

class ParserText(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.tr_layer = 0  # nesting depth of <tr>
        self.current_img = None
        self.current_txt = []
        self.current_tag = None
        self.img_txt = {}  # image file name -> concatenated sentences

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        if "img" == tag:
            assert len(attrs) == 1
            # src has the form `<class name>/<image file name>`
            _cls, _img_f = attrs[0][1].split("/")
            # print(_cls, _img_f)
            self.current_img = _img_f
        elif "tr" == tag:
            self.tr_layer += 1

    def handle_endtag(self, tag):
        if "tr" == tag:
            self.tr_layer -= 1
            if 0 == self.tr_layer:
                # one outermost <tr> per image: flush its sentences
                # assert 5 == len(self.current_txt)
                self.current_txt = "".join(self.current_txt)
                self.img_txt[self.current_img] = self.current_txt
                # print('\t', self.current_img, '\n', self.current_txt)
                self.current_img = None
                self.current_txt = []
        self.current_tag = None

    def handle_data(self, data):
        # sentences sit in <td> cells inside the 2nd-level <tr>
        if (2 == self.tr_layer) and ("td" == self.current_tag):
            # print(data)
            self.current_txt.append(data)

print("parse sentences")
parser = ParserText()
parser.feed(html_txt)

print("read sample order")
img_id = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for line in f:
        line = line.strip()
        if line:
            sid, img_f = line.split()
            img_id[img_f] = int(sid)

print("text")
assert len(parser.img_txt) == len(img_id)
N_DATA = len(parser.img_txt)
print("#data:", N_DATA)  # 1000
texts = np.zeros([N_DATA, DIM], dtype=np.float32)
model = Doc2Vec.load(MODEL)
for img, txt in parser.img_txt.items():
    sid = img_id[img]
    doc = gensim.utils.simple_preprocess(txt)  # tokenise & lowercase
    vec = model.infer_vector(doc)
    texts[sid] = vec
# (1000, 300) -0.6507467 0.6664893 -0.0071584913 -2147.5474
print("texts:", texts.shape, texts.min(), texts.max(), texts.mean(), texts.sum())
_f_name = osp.join(P, "texts.pascal-sentences.doc2vec.{}.mat".format(DIM))
sio.savemat(_f_name, {"texts": texts})
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))
Cloud Drive
Baidu net disk: https://pan.baidu.com/s/1QfyhxPLjPfQS5JdHWh4HTQ, extraction code: lwbd.
References
- Pascal Sentences
- Visual Object Classes Challenge 2008 (VOC2008)
- TCSVT 2020 | Zero-Shot Cross-Media Embedding Learning With Dual Adversarial Distribution Network
- html.parser — Simple HTML and XHTML parser <- python 3
- Python HTML manipulation (HTMLParser)
- GoogleNews-vectors-negative300.bin.gz
- nishankmahore/word2vec-flask-api
- MS COCO 2017 dataset preprocessing
- jhlau/doc2vec
- jhlau/gensim
- pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
- HTMLParser — Simple HTML and XHTML parser <- python 2