Pascal Sentences数据集预处理

Pascal Sentences[1] 包含来自 VOC 2008[2] 的 1000 对图文对,多数图有 5 句描述(但不是全部),现参照 [3] 处理。

Data

数据在 [1],其中 label 是藏在 image 的链接路径中(可右键图片在新窗口打开看网址,或在浏览器 F12 打开调试查看元素)。

直接保存网页,会下载得到 pascal-sentences.htm 文件、pascal-sentences_files/ 目录。其中,images 装在目录里,sentences 嵌在 html 文件中。

原本(在浏览器中调试)各 images 对应的 class name 也是嵌在 html 文件里对应的 image 路径中,但下载的 html 文件里 image 路径却变了(简单指向 pascal-sentences_files/)。为得到 class name,需要在浏览器的页面中查看网页源码(右键 view page source),然后手动复制一份 html 文件,这里存为 page-source.html

所有文件都下在 pascal-sentences/ 里。

Sample Order

  • 按 image ID 的升序排列数据
  • image 文件名形如:2008_000032.jpg,其中 2008 应该是对应 VOC 2008,忽略;000032 就当成 image ID。
  • 写入 id-map.pascal-sentences.txt
# make.id-map.py
import os
import os.path as osp


# Root folder holding the saved web page and its `pascal-sentences_files/`
# image directory.
P = "G:/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")


def id_key(x):
    """Return the integer image ID encoded in an image file name.

    File names look like `2008_000032.jpg`: the `2008_` prefix and the
    `.jpg` suffix are dropped and the remaining digits are parsed as the ID.
    """
    return int(x.split(".jpg")[0].split("2008_")[1])


# Keep only the image files: any other entry in the folder would make
# `id_key` raise while sorting.
img_files = [
    name for name in os.listdir(IMAGE_P)
    if name.startswith("2008_") and name.endswith(".jpg")
]
img_files = sorted(img_files, key=id_key)
print(img_files[:10])

# One `<sample id> <image file>` pair per line; sample IDs follow the
# ascending image-ID order established above.
with open(osp.join(P, "id-map.pascal-sentences.txt"), "w") as f:
    for sid, img_f in enumerate(img_files):
        f.write("{} {}\n".format(sid, img_f))

Class Order and Labels

  • HTMLParser 解析 html 文件,参考 [4,5]
  • class 顺序按 class name 字典序排,写入 class-name.pascal-sentences.txt,为后面 class embedding 准备。
  • labels 顺序同 id-map.pascal-sentences.txt
# make.label.py
import os
import os.path as osp
from html.parser import HTMLParser
import numpy as np
import scipy.io as sio


# Dataset folder and the hand-copied page source inside it.
P = "G:/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")


# Load the whole page source as one string for the HTML parser.
with open(HTML_F, "r") as f:
    html_txt = f.read()


class ParserLabel(HTMLParser):
    """Collect the class name of every image referenced by an `<img>` tag.

    After feeding a page, `img_cls` maps image file name -> class name and
    `class_set` holds every class name that was seen.
    """

    def __init__(self):
        super().__init__()
        self.img_cls = {}
        self.class_set = set()

    def handle_starttag(self, tag, attrs):
        """Record `<class>/<file>` taken from the single attribute of `<img>`.

        Tags other than `img` are ignored. The tag must carry exactly one
        attribute whose value splits on "/" into class name and file name.
        """
        if tag == "img":
            assert len(attrs) == 1
            _attr_name, path = attrs[0]
            class_name, file_name = path.split("/")
            self.img_cls[file_name] = class_name
            self.class_set.add(class_name)


print("parse annotations")
parser = ParserLabel()
parser.feed(html_txt)

# Class IDs are the positions of the class names in sorted order; the same
# `<class id> <class name>` pairs are written out for later steps.
print("class order")
class_set = sorted(parser.class_set)
cls_id = {}
with open(osp.join(P, "class-name.pascal-sentences.txt"), "w") as f:
    for cid, c in enumerate(class_set):
        cls_id[c] = cid
        f.write("{} {}\n".format(cid, c))


# Rebuild the image-file -> sample-ID mapping from the id-map file, whose
# lines have the form `<sample id> <image file>`.
print("read sample order")
img_id = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for raw in f:
        record = raw.strip()
        if not record:
            continue  # skip blank lines
        sid, img_f = record.split()
        img_id[img_f] = int(sid)


print("label")
N_DATA = len(parser.img_cls)
# Every parsed image must have an entry in the id-map (and vice versa).
assert N_DATA == len(img_id)
print("#data:", N_DATA)  # 1000

# labels[sample id] = class id, so rows follow the id-map order.
labels = np.zeros([N_DATA], dtype=np.int32)
for img, c in parser.img_cls.items():
    labels[img_id[img]] = cls_id[c]

# (1000,) 0 19 9.5 9500
print("labels:", labels.shape, labels.min(), labels.max(), labels.mean(), labels.sum())
label_file = osp.join(P, "labels.pascal-sentences.mat")
sio.savemat(label_file, {"labels": labels}, do_compression=True)

Class Embeddings

  • class name 的 Word2Vec feature,顺序同 class-name.pascal-sentences.txt
  • 用到 GoogleNews-vectors-negative300.bin,下载见 [6,7]。
  • 替换了一些该 Word2Vec 模型没有的词(组),见下面代码;词组拆成多个词取平均。
# make.w2v.py
import os
import os.path as osp
import numpy as np
import scipy.io as sio
# from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors


"""class Word2Vec embeddings
run `make.label.py` first
"""


P = "/home/dataset/pascal-sentences"
CLASS_NAME_F = osp.join(P, "class-name.pascal-sentences.txt")

# word2vec
MODEL = "/home/dataset/word2vec/GoogleNews-vectors-negative300.bin"

IN_DOCKER = True
USER_ID = 1000

# Replacement spellings for the class names that get substituted before the
# Word2Vec lookup; every other name is used unchanged.
_RENAMED = {
    "aeroplane": "airplane",
    "diningtable": "dining_table",
    "pottedplant": "potted_plant",
    "tvmonitor": "TV_monitor",
}

# Read `<class id> <class name>` lines, keeping the file order.
cls_set = []
with open(CLASS_NAME_F, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        cid, cn = line.split()
        cls_set.append(_RENAMED.get(cn, cn))
print("classes:", len(cls_set), cls_set)


w2v = KeyedVectors.load_word2vec_format(MODEL, binary=True)


# Log (to stdout and to a text file) every class name that the model does
# not contain as a single vocabulary entry.
print("find out the absent class names")
_file_name = osp.join(P, "absent-class-name.txt")
with open(_file_name, "w") as f:
    for c in (name for name in cls_set if name not in w2v):
        print(c)
        f.write("{}\n".format(c))
if IN_DOCKER:
    # hand the file over to the configured user ID
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))


"""absent class names
aeroplane -> airplane
diningtable -> dining_table
pottedplant -> potted_plant
tvmonitor -> TV_monitor
"""


# Build one embedding per class, in the order of `cls_set`.
print("class embedding")
class_emb = []
for c in cls_set:
    if c in w2v:
        class_emb.append(w2v[c])
    else:
        # The phrase itself is not in the vocabulary: split it on "_" and
        # average the vectors of its words.
        assert "_" in c, "absent single word: {}".format(c)
        c_list = c.split("_")
        tmp = 0
        for _c in c_list:
            tmp = tmp + w2v[_c]
        tmp /= len(c_list)
        class_emb.append(tmp)

class_emb = np.vstack(class_emb).astype(np.float32)
# (20, 300) -0.62109375 0.62890625 -0.010865917 -65.1955
print("class emb:", class_emb.shape, class_emb.min(), class_emb.max(), class_emb.mean(), class_emb.sum())

_file_name = osp.join(P, "class_emb.pascal-sentences.Gnews-300d.mat")
sio.savemat(_file_name, {"class_emb": class_emb})
# This script is run inside a docker container, so the ownership & group of
# the output file are changed for convenience. Only do so when IN_DOCKER is
# set, as for the other output file of this script.
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _file_name))

Images

  • images 的 VGG 19 feature
  • 顺序同 id-map.pascal-sentences.txt
# make.image.py
import os
import os.path as osp
import numpy as np
import scipy.io as sio
# import h5py
import cv2
from PIL import Image
import torch
import torchvision.models as models
import torchvision.transforms as transforms


P = "/home/dataset/pascal-sentences"
IMAGE_P = osp.join(P, "pascal-sentences_files")

# number of images pushed through the network at once
BATCH_SIZE = 100

IN_DOCKER = True
USER_ID = 1000


# Sample-ID -> image-file mapping, read from `<sample id> <image file>` lines.
print("read sample order")
id_img = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for raw in f:
        record = raw.strip()
        if not record:
            continue  # skip blank lines
        sid, img_f = record.split()
        id_img[int(sid)] = img_f
N = len(id_img)
print("#image:", N)


# Pretrained VGG-19 with the last two classifier layers dropped, so the
# forward pass returns the output of the remaining classifier layers.
model = models.vgg19(pretrained=True)
model.classifier = model.classifier[:-2]
# A freshly constructed module is in training mode; switch to eval mode so
# the Dropout layer left in the classifier is disabled and the extracted
# features are deterministic.
model = model.cuda().eval()

# Convert to a float tensor and normalise each channel with the given
# mean / std (the values torchvision documents for its pretrained models).
trsf = transforms.Compose([
    # transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])


# Extract features batch by batch, visiting samples in sample-ID order so
# that row `sid` of the result belongs to sample `sid`.
fea_list = []
with torch.no_grad():
    for i in range(0, N, BATCH_SIZE):
        image_batch = []
        for sid in range(i, min(i + BATCH_SIZE, N)):
            img_p = osp.join(IMAGE_P, id_img[sid])
            img = cv2.imread(img_p)
            if img is None:
                # OpenCV could not decode the file: fall back to PIL.
                # `convert("RGB")` yields a 3-channel image whatever the
                # stored mode is (grayscale, palette, RGBA, ...).
                with Image.open(img_p) as img_f:
                    img = np.asarray(img_f.convert("RGB"))
            else:
                # OpenCV decodes to BGR; the rest of the pipeline uses RGB.
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
            img = trsf(Image.fromarray(img))
            image_batch.append(img.unsqueeze(0))

        image_batch = torch.cat(image_batch, 0)
        image_batch = image_batch.cuda()
        fea = model(image_batch).cpu().numpy()
        fea_list.append(fea)

        print(i)  # progress: index of the first sample of the batch

Fea = np.vstack(fea_list).astype(np.float32)
print("image features:", Fea.shape)
# the feature dimension is part of the output file name
_f_name = osp.join(P, "images.pascal-sentences.vgg19.{}d.mat".format(Fea.shape[1]))
sio.savemat(_f_name, {"images": Fea}, do_compression=True)
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))

Texts

  • (2022.7.11 Notes)[8] 中文本的预处理方法已改为 Stanford CoreNLP 分词 + lowercase,但本文的还是用 gensim.utils.simple_preprocess 方法,代码未更新(flag:以后有空更),请酌情参考 [8] 的新方法重制。
  • 参照 [8],5 句话拼在一起,提取 Doc2Vec features。
  • 本节所用环境同 [8] 的对应一节:[11] 的容器创建 python 2.7 的虚拟环境 + [9] 的预训练 Doc2Vec 模型 + [10] 的旧版 gensim。
  • python 2 的 HTMLParser 包名不同于 python 3,见 [12] 和下面代码。
# make.text.py
from __future__ import print_function
import os
import os.path as osp
from HTMLParser import HTMLParser  # python 2
# from html.parser import HTMLParser  # python 3
import numpy as np
import scipy.io as sio
import gensim
from gensim.models import Doc2Vec


"""text Doc2Vec feature
run `make.id-map.py` before this file
"""


P = "/home/dataset/pascal-sentences"
HTML_F = osp.join(P, "page-source.html")

# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
DIM = 300  # dimension of the doc2vec feature

IN_DOCKER = True
USER_ID = 1000


# Load the whole page source as one string for the HTML parser.
with open(HTML_F, "r") as f:
    html_txt = f.read()


class ParserText(HTMLParser):
    """Collect, per image, the text found in the nested table rows.

    The parser tracks how deeply it is nested inside `<tr>` elements. Text
    inside a `<td>` at nesting depth 2 is accumulated; when the outermost
    `<tr>` closes, the accumulated pieces are concatenated and stored in
    `img_txt` under the file name of the `<img>` seen inside that row.
    """

    def __init__(self):
        # explicit base-class call: works on both python 2 and python 3
        HTMLParser.__init__(self)
        self.tr_layer = 0         # current <tr> nesting depth
        self.current_img = None   # image file name of the row being parsed
        self.current_txt = []     # text pieces gathered for the current row
        self.current_tag = None   # most recently opened tag, None after a close
        self.img_txt = {}         # image file name -> concatenated text

    def handle_starttag(self, tag, attrs):
        """Track the open tag, the `<tr>` depth and the row's image name."""
        self.current_tag = tag
        if tag == "tr":
            self.tr_layer += 1
        elif tag == "img":
            # the single attribute value has the form `<class>/<file>`
            assert len(attrs) == 1
            _cls, _img_f = attrs[0][1].split("/")
            self.current_img = _img_f

    def handle_endtag(self, tag):
        """Close a tag; flush the gathered text when the outer row ends."""
        self.current_tag = None
        if tag != "tr":
            return
        self.tr_layer -= 1
        if self.tr_layer != 0:
            return
        # outermost row finished: store its text and reset the row state
        self.img_txt[self.current_img] = "".join(self.current_txt)
        self.current_img = None
        self.current_txt = []

    def handle_data(self, data):
        """Keep text that sits directly in a `<td>` of a depth-2 row."""
        if self.current_tag == "td" and self.tr_layer == 2:
            self.current_txt.append(data)


print("parse sentences")
parser = ParserText()
parser.feed(html_txt)


# Rebuild the image-file -> sample-ID mapping from the id-map file, whose
# lines have the form `<sample id> <image file>`.
print("read sample order")
img_id = {}
with open(osp.join(P, "id-map.pascal-sentences.txt"), "r") as f:
    for raw in f:
        record = raw.strip()
        if not record:
            continue  # skip blank lines
        sid, img_f = record.split()
        img_id[img_f] = int(sid)


print("text")
# Every parsed image must have an entry in the id-map (and vice versa).
assert len(parser.img_txt) == len(img_id)
N_DATA = len(parser.img_txt)
print("#data:", N_DATA)  # 1000
# texts[sample id] = Doc2Vec vector, so rows follow the id-map order.
texts = np.zeros([N_DATA, DIM], dtype=np.float32)

model = Doc2Vec.load(MODEL)

for img, txt in parser.img_txt.items():
    sid = img_id[img]
    # Tokenise the concatenated sentences of this image. The original code
    # passed the not-yet-defined name `doc` here, which raises NameError.
    doc = gensim.utils.simple_preprocess(txt)
    vec = model.infer_vector(doc)
    texts[sid] = vec
# (1000, 300) -0.6507467 0.6664893 -0.0071584913 -2147.5474
print("texts:", texts.shape, texts.min(), texts.max(), texts.mean(), texts.sum())

_f_name = osp.join(P, "texts.pascal-sentences.doc2vec.{}.mat".format(DIM))
sio.savemat(_f_name, {"texts": texts})
if IN_DOCKER:
    os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))

Cloud Drive

百度网盘:https://pan.baidu.com/s/1QfyhxPLjPfQS5JdHWh4HTQ,提取码:lwbd
pascal-sentences

References

  1. Pascal Sentences
  2. Visual Object Classes Challenge 2008 (VOC2008)
  3. TCSVT 2020 | Zero-Shot Cross-Media Embedding Learning With Dual Adversarial Distribution Network
  4. html.parser — 简单的 HTML 和 XHTML 解析器 <- python 3
  5. Python HTML操作(HTMLParser)
  6. GoogleNews-vectors-negative300.bin.gz
  7. nishankmahore/word2vec-flask-api
  8. MS COCO 2017数据集预处理
  9. jhlau/doc2vec
  10. jhlau/gensim
  11. pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
  12. HTMLParser — Simple HTML and XHTML parser <- python 2
  • 5
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
PASCAL Parts dataset是一个用于目标检测和分割的数据集,其中包含了20个物体类别和14个部位类别。在进行训练之前,需要对数据进行预处理以适应模型的输入格式。下面是一个简单的数据预处理流程: 1. 下载PASCAL Parts dataset并解压缩。 2. 将数据集中的图像和标注文件分别存放在两个文件夹中。 3. 读取图像和标注文件,将它们转换为模型所需的格式。对于图像,可以使用常见的图像处理库(如PIL或OpenCV)读取和处理图像。对于标注文件,可以使用xml解析库(如xml.etree.ElementTree)读取xml文件并提取出物体和部位的边界框坐标。将物体和部位的边界框坐标保存在一个列表中。 4. 将图像和标注文件划分为训练集、验证集和测试集。可以按照一定比例随机划分,也可以按照指定的列表划分。 5. 将图像和标注文件保存为模型所需的格式。对于常见的目标检测和分割模型,通常需要将图像和标注文件保存为图片和对应的标注文件(如COCO格式或PASCAL VOC格式)。 6. 在训练和测试时,需要使用相应的数据加载器(如PyTorch的DataLoader或TensorFlow的tf.data.Dataset)读取保存的图像和标注文件,并将它们转换为模型所需的输入格式(如Tensor或特定格式的字典)。 以上是一个简单的PASCAL Parts dataset的数据预处理流程,具体实现可根据不同的需求和模型进行调整。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值