Pascal Sentences [1] contains 1000 image-text pairs from VOC 2008 [2]. Most images (but not all) have 5 descriptive sentences. The preprocessing here follows [3].
Data
The data is at [1]. Each image's label is hidden in the link path of the image (right-click an image and open it in a new tab to see the URL, or press F12 to inspect the element in the browser's developer tools).
Saving the web page directly downloads a `pascal-sentences.htm` file and a `pascal-sentences_files/` directory: the images are stored in the directory, while the sentences are embedded in the html file.
Originally (when inspecting in the browser) each image's class name is also embedded in its image path in the html, but in the downloaded html file the image paths are rewritten (they simply point into `pascal-sentences_files/`). To recover the class names, view the page source in the browser (right-click → View Page Source) and manually save a copy of the html, stored here as `page-source.html`; see the sketch below for the assumed path pattern.
All files are downloaded into `pascal-sentences/`.
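For reference, all the parsers below assume that the `src` attribute of each `<img>` tag in `page-source.html` follows a `<class name>/<image file name>` pattern; a minimal sketch with a hypothetical example value:
# sketch: the `<class>/<file>` path pattern assumed by the parsers below
src = "aeroplane/2008_000032.jpg"  # hypothetical example value
_cls, _img_f = src.split("/")
print(_cls, _img_f)  # -> aeroplane 2008_000032.jpg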
Sample Order
- Samples are sorted in ascending order of image ID.
- Image file names look like `2008_000032.jpg`: the `2008` prefix presumably corresponds to VOC 2008 and is ignored; `000032` is taken as the image ID.
- The mapping is written to `id-map.pascal-sentences.txt`.
# make.id-map.py
import os
import os.path as osp

P = "G:/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")

# file name format: `2008_000032.jpg` -> image ID 32
id_key = lambda x: int(x.split(".jpg")[0].split("2008_")[1])

img_files = os.listdir(IMAGE_P)
img_files = sorted(img_files, key=id_key)
print(img_files[:10])

# each line: `<sample ID> <image file name>`
with open(osp.join(P, "id-map.pascal-sentences.txt"), "w") as f:
    for sid, img_f in enumerate(img_files):
        f.write("{} {}\n".format(sid, img_f))
Class Order and Labels
- Parse the html file with `HTMLParser`, see [4,5].
- Classes are sorted by class name in ascending order and written to `class-name.pascal-sentences.txt`, in preparation for the class embeddings later.
- Labels follow the same order as `id-map.pascal-sentences.txt`.
# make.label.py
import os
import os.path as osp
from html.parser import HTMLParser
import numpy as np
import scipy.io as sio

P = "G:/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")

with open(HTML_F, "r") as f:
    html_txt = f.readlines()
# print(html_txt)
html_txt = "".join(html_txt)

class ParserLabel(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.img_cls = {}       # image file name -> class name
        self.class_set = set()

    def handle_starttag(self, tag, attrs):
        if "img" != tag:
            return
        assert len(attrs) == 1
        # src has the form `<class name>/<image file name>`
        _cls, _img_f = attrs[0][1].split("/")
        # print(_cls, _img_f)
        self.class_set.add(_cls)
        self.img_cls[_img_f] = _cls

print("parse annotations")
parser = ParserLabel()
parser.feed(html_txt)

print("class order")
class_set = sorted(list(parser.class_set))
cls_id = {}
with open(osp.join(P, "class-name.pascal-sentences.txt"), "w") as f:
    for cid, c in enumerate(class_set):
        f.write("{} {}\n".format(cid, c))
        cls_id[c] = cid

print("read sample order")
img_id = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for line in f:
        line = line.strip()
        if line:
            sid, img_f = line.split()
            img_id[img_f] = int(sid)

print("label")
assert len(parser.img_cls) == len(img_id)
N_DATA = len(parser.img_cls)
print("#data:", N_DATA)  # 1000
labels = np.zeros([N_DATA], dtype=np.int32)
for img, c in parser.img_cls.items():
    sid = img_id[img]
    cid = cls_id[c]
    labels[sid] = cid
# (1000,) 0 19 9.5 9500
print("labels:", labels.shape, labels.min(), labels.max(), labels.mean(), labels.sum())
sio.savemat(osp.join(P, "labels.pascal-sentences.mat"),
            {"labels": labels}, do_compression=True)
Class Embeddings
- Word2Vec features of the class names, in the same order as `class-name.pascal-sentences.txt`.
- Uses `GoogleNews-vectors-negative300.bin`; see [6,7] for the download.
- A few class names (or phrases) absent from this Word2Vec model are substituted, see the code below; a phrase is split into its component words, whose vectors are averaged.
# make.w2v.py
import os
import os.path as osp
import numpy as np
import scipy.io as sio
# from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

"""class Word2Vec embeddings
run `make.label.py` first
"""

P = "/home/dataset/pascal-sentences"
CLASS_NAME_F = osp.join(P, "class-name.pascal-sentences.txt")
# word2vec
MODEL = "/home/dataset/word2vec/GoogleNews-vectors-negative300.bin"
IN_DOCKER = True
USER_ID = 1000

cls_set = []
with open(CLASS_NAME_F, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            cid, cn = line.split()
            # substitute class names that the Word2Vec model lacks
            if "aeroplane" == cn:
                cn = "airplane"
            elif "diningtable" == cn:
                cn = "dining_table"
            elif "pottedplant" == cn:
                cn = "potted_plant"
            elif "tvmonitor" == cn:
                cn = "TV_monitor"
            cls_set.append(cn)
print("classes:", len(cls_set), cls_set)

w2v = KeyedVectors.load_word2vec_format(MODEL, binary=True)

print("find out the absent class names")
_file_name = osp.join(P, "absent-class-name.txt")
with open(_file_name, "w") as f:
    for c in cls_set:
        if c not in w2v:
            print(c)
            f.write("{}\n".format(c))
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))
"""absent class names
aeroplane -> airplane
diningtable -> dining_table
pottedplant -> potted_plant
tvmonitor -> TV_monitor
"""

print("class embedding")
class_emb = []
for c in cls_set:
    if c in w2v:
        class_emb.append(w2v[c])
    else:
        # phrase: split into words & average their vectors
        assert "_" in c, "absent single word: {}".format(c)
        c_list = c.split("_")
        tmp = 0
        for _c in c_list:
            tmp = tmp + w2v[_c]
        tmp /= len(c_list)
        class_emb.append(tmp)
class_emb = np.vstack(class_emb).astype(np.float32)
# (20, 300) -0.62109375 0.62890625 -0.010865917 -65.1955
print("class emb:", class_emb.shape, class_emb.min(), class_emb.max(), class_emb.mean(), class_emb.sum())

_file_name = osp.join(P, "class_emb.pascal-sentences.Gnews-300d.mat")
sio.savemat(_file_name, {"class_emb": class_emb})
# because I run this script in a docker container,
# I shall change the ownership & group of this file for convenience
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))
Images
- VGG 19 features of the images.
- Same order as `id-map.pascal-sentences.txt`.
# make.image.py
import os
import os.path as osp
import numpy as np
import scipy.io as sio
# import h5py
import cv2
from PIL import Image
import torch
import torchvision.models as models
import torchvision.transforms as transforms

P = "/home/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")
BATCH_SIZE = 100
IN_DOCKER = True
USER_ID = 1000

print("read sample order")
id_img = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for line in f:
        line = line.strip()
        if line:
            sid, img_f = line.split()
            id_img[int(sid)] = img_f
N = len(id_img)
print("#image:", N)

# VGG 19 truncated after fc7: drop the last Dropout & the 1000-way fc layer,
# so the model outputs 4096-d features
model = models.vgg19(pretrained=True)
model.classifier = model.classifier[:-2]
model = model.cuda().eval()  # eval mode: freeze dropout for deterministic features

trsf = transforms.Compose([
    # transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

fea_list = []
with torch.no_grad():
    for i in range(0, N, BATCH_SIZE):
        image_batch = []
        for sid in range(i, min(i + BATCH_SIZE, N)):
            img_p = osp.join(IMAGE_P, id_img[sid])
            img = cv2.imread(img_p)  # BGR, or None if OpenCV cannot decode it
            if img is None:
                # fall back to PIL; convert("RGB") also unifies
                # grey-scale / RGBA images into 3-channel RGB
                img_f = Image.open(img_p)
                img = np.asarray(img_f.convert("RGB"))
                img_f.close()
            else:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
            img = trsf(Image.fromarray(img))
            # print(img.size())
            image_batch.append(img.unsqueeze(0))
        image_batch = torch.cat(image_batch, 0)
        # print(image_batch.size())
        image_batch = image_batch.cuda()
        fea = model(image_batch).cpu().numpy()
        # print(fea.shape)
        fea_list.append(fea)
        print(i)
        # if i > 0: break

Fea = np.vstack(fea_list).astype(np.float32)
print("image features:", Fea.shape)
_f_name = osp.join(P, "images.pascal-sentences.vgg19.{}d.mat".format(Fea.shape[1]))
sio.savemat(_f_name, {"images": Fea}, do_compression=True)
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))
Texts
- (Note, 2022.7.11) The text preprocessing in [8] has since been changed to Stanford CoreNLP tokenisation + lowercasing, but this post still uses the old `gensim.utils.simple_preprocess` method and the code below is not updated (flag: will update when I find time); consider redoing it with the new method in [8].
- Following [8], the 5 sentences of each image are concatenated and one Doc2Vec feature is extracted.
- The environment for this section is the same as the corresponding section of [8]: a python 2.7 virtual environment created in the container from [11], plus the pre-trained Doc2Vec model from [9] and the old gensim from [10].
- The `HTMLParser` package name differs between python 2 and python 3, see [12] and the code below.
# make.text.py
from __future__ import print_function
import os
import os.path as osp
from HTMLParser import HTMLParser  # python 2
# from html.parser import HTMLParser  # python 3
import numpy as np
import scipy.io as sio
import gensim
from gensim.models import Doc2Vec

"""text Doc2Vec feature
run `make.id-map.py` before this file
"""

P = "/home/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
DIM = 300  # dimension of the doc2vec feature
IN_DOCKER = True
USER_ID = 1000

with open(HTML_F, "r") as f:
    html_txt = f.readlines()
# print(html_txt)
html_txt = "".join(html_txt)

class ParserText(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.tr_layer = 0  # nesting depth of <tr>
        self.current_img = None
        self.current_txt = []
        self.current_tag = None
        self.img_txt = {}  # image file name -> concatenated sentences

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag
        if "img" == tag:
            assert len(attrs) == 1
            # src has the form `<class name>/<image file name>`
            _cls, _img_f = attrs[0][1].split("/")
            # print(_cls, _img_f)
            self.current_img = _img_f
        elif "tr" == tag:
            self.tr_layer += 1

    def handle_endtag(self, tag):
        if "tr" == tag:
            self.tr_layer -= 1
            if 0 == self.tr_layer:
                # one outermost <tr> per image: flush its sentences
                # assert 5 == len(self.current_txt)
                self.current_txt = "".join(self.current_txt)
                self.img_txt[self.current_img] = self.current_txt
                # print('\t', self.current_img, '\n', self.current_txt)
                self.current_img = None
                self.current_txt = []
        self.current_tag = None

    def handle_data(self, data):
        # sentences sit in <td> cells inside the 2nd-level <tr>
        if (2 == self.tr_layer) and ("td" == self.current_tag):
            # print(data)
            self.current_txt.append(data)

print("parse sentences")
parser = ParserText()
parser.feed(html_txt)

print("read sample order")
img_id = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for line in f:
        line = line.strip()
        if line:
            sid, img_f = line.split()
            img_id[img_f] = int(sid)

print("text")
assert len(parser.img_txt) == len(img_id)
N_DATA = len(parser.img_txt)
print("#data:", N_DATA)  # 1000
texts = np.zeros([N_DATA, DIM], dtype=np.float32)
model = Doc2Vec.load(MODEL)
for img, txt in parser.img_txt.items():
    sid = img_id[img]
    doc = gensim.utils.simple_preprocess(txt)  # tokenise & lowercase
    vec = model.infer_vector(doc)
    texts[sid] = vec
# (1000, 300) -0.6507467 0.6664893 -0.0071584913 -2147.5474
print("texts:", texts.shape, texts.min(), texts.max(), texts.mean(), texts.sum())
_f_name = osp.join(P, "texts.pascal-sentences.doc2vec.{}.mat".format(DIM))
sio.savemat(_f_name, {"texts": texts})
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))
Cloud Drive
Baidu net disk: https://pan.baidu.com/s/1QfyhxPLjPfQS5JdHWh4HTQ, extraction code: lwbd.
References
- Pascal Sentences
- Visual Object Classes Challenge 2008 (VOC2008)
- TCSVT 2020 | Zero-Shot Cross-Media Embedding Learning With Dual Adversarial Distribution Network
- html.parser — Simple HTML and XHTML parser <- python 3
- Python HTML manipulation (HTMLParser)
- GoogleNews-vectors-negative300.bin.gz
- nishankmahore/word2vec-flask-api
- MS COCO 2017 dataset preprocessing
- jhlau/doc2vec
- jhlau/gensim
- pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
- HTMLParser — Simple HTML and XHTML parser <- python 2