Notes
wikipedia [1] is a dataset for cross-modal retrieval with 2866 samples, 10 classes, and two modalities: image and text.
The plan is to process the data following the setup of [2], which in turn appears to come from [3]: for images, extract 4096-d features from the fc7 layer [5] of CaffeNet [4]; for texts, extract a 100-d word2vec [6] vector for each word and average them.
For now a Keras-pretrained VGG16 [7,8] is used in place of CaffeNet, following [12]; the word2vec features are generated with the gensim [9] library, following [13,14].
Data
Download from [10]. After extraction there are two files, trainset_txt_img_cat.list and testset_txt_img_cat.list; each line is one sample with 3 columns: text file name, image file name, class id.
The text data live under texts/ as .xml files. Parsing them with minidom [11] was the first idea, but it chokes on some special characters (e.g. a bare &); lacking a better way, they are parsed by hand for now (a possible workaround is sketched at the end of this section).
The image data live under images/, grouped into one folder per class.
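On the minidom issue above: one possible workaround (not used here, just a sketch) is to escape bare ampersands before handing the file to the parser. The regex assumes the only problem is a '&' that is not part of an entity; other malformed markup would still break it.
import re
from xml.dom import minidom

def parse_with_minidom(fn):
    """sketch: escape bare '&' and then parse normally with minidom"""
    with open(fn, "r", encoding="utf-8") as f:
        raw = f.read()
    # turn '&' into '&amp;' unless it already starts an entity like '&amp;' or '&#38;'
    fixed = re.sub(r"&(?![a-zA-Z]+;|#\d+;)", "&amp;", raw)
    dom = minidom.parseString(fixed)
    return dom.getElementsByTagName("text")[0].childNodes[0].data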
Code
import os
from os.path import join
import numpy as np
import scipy.io as sio
from gensim.models import Word2Vec
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
P = "wikipedia_dataset"
IMG_P = "images"
TXT_P = "texts"
TRAIN_LIST = "trainset_txt_img_cat.list"
TEST_LIST = "testset_txt_img_cat.list"
os.chdir(P)  # change into the extracted directory
print(os.getcwd())
sample order
- Read out the sample lists so that images, texts, and labels can all be processed in the same order.
- The first 2173 samples are the original training set, the last 693 the original test set (these counts can be used to recover the split later; see the slicing sketch after the images section).
ls_img = []
ls_txt = []
ls_lab = []
with open("id-map.wiki.txt", "w") as f_out:
sid = -1
for fname in (TRAIN_LIST, TEST_LIST):
with open(fname, "r") as f_in:
for line in f_in:
sid += 1
txt_f, img_f, lab = line.strip().split()
#txt_f = join(TXT_P, txt_f, ".xml")
#img_f = join(IMG_P, img_f, ".jpg")
ls_img.append(img_f)
ls_txt.append(txt_f)
lab = int(lab) - 1 # shift to 0-base
ls_lab.append(lab)
# format: <sid> <text file> <image file> <cid>
f_out.write("{} {} {} {}\n".format(sid, txt_f, img_f, lab))
print(len(ls_img), len(ls_txt), len(ls_lab))
labels
- Save the labels. An earlier version converted them to one-hot (kept commented out below); now just the original class IDs are stored.
labels = np.asarray(ls_lab)
print(labels.shape, np.max(labels), np.min(labels)) # (2866,) 9 0
# N_CLASS = np.max(labels) + 1  # labels are already 0-based, see the reading loop above
# labels = np.eye(N_CLASS)[labels]  # to one-hot
# print(labels.shape)  # (2866, 10)
# np.save("labels.npy", labels)
sio.savemat("labels.wiki.mat", {"labels": labels}, do_compression=True)
texts
- 2021.5.30 updates: the old averaged-word2vec approach has been moved to the section mean word2vec (deprecated) below. A new section, doc2vec, records the gensim Doc2Vec based processing, see below.
mean word2vec (deprecated)
- 2021.5.30 updates: the Wikipedia corpus used to pretrain word2vec in the literature seems to be a dedicated English Wikipedia corpus, not the text data of this wikipedia dataset, see [19-22]. So the training corpus used for word2vec in this section is probably wrong; the corpus preprocessing is also better done with the gensim tooling of the next section rather than the manual handling below (a sketch of the corpus-based route is given after the code of this section).
- Manually parse the .xml files and strip some leftover symbols.
def parse(fn):
    """manually parse the xml: read the part between <text> and </text>"""
    res = ""
    flag = False
    with open(fn, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line == "</text>":
                break
            if flag:
                res += " " + line
            if line == "<text>":
                flag = True
    return res

def clean(strings, pattern):
    """strip a punctuation pattern from every token"""
    return [s.replace(pattern, "") for s in strings]
"""解析 xml"""
sentences = []
for txt_f in ls_txt:
txt_f = join(TXT_P, "{}.xml".format(txt_f))
# print(txt_f)
doc = parse(txt_f) # 手动解析
# doc = minidom.parse(txt_f).documentElement.getElementsByTagName("text")[0].childNodes[0].data
words = doc.split()
# 清除多余符号
for pat in (",", ".", "!", "?", "''", "(", ")", "\"", ":", ";", "{", "}", "[", "]"):
words = clean(words, pat)
sentences.append(words)
print(len(sentences))
"""训练 word2vec 模型"""
# [3] 说用 skip-gram
w2v = Word2Vec(sentences, size=100, min_count=5, iter=50, sg=1) # sg = skip-gram
"""提取文本特征"""
texts = np.zeros([len(sentences), 100])
for i, s in enumerate(sentences):
cnt = 0
for w in s:
if w in w2v:
cnt += 1
texts[i] += w2v[w]
# 取平均词向量
texts[i] /= cnt
# 保存
np.save("texts.w2v.100.npy", texts)
doc2vec (a new way)
- (2023.1.11) The tokenization in [18] has been switched to Stanford CoreNLP instead of gensim.utils.simple_preprocess, but this section has NOT been updated accordingly; switch to the method of [18], see [18] for details.
- 2021.5.30 update, following [18].
- Note: this snippet is independent of the rest of this post and needs a different environment, see [18] for details.
- A Doc2Vec model produces 300-d vectors, and the text is preprocessed with gensim's built-in function, so there is no need to strip the odd symbols by hand (i.e. the hand-written clean function of the previous section).
- Should stop words be removed? gensim.utils.simple_preprocess does not seem to drop them automatically (see the small sketch below).
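On the stop-word question in the last bullet: gensim ships a built-in English stop-word list that could be applied after simple_preprocess if desired; a small sketch (whether this actually helps the retrieval features is untested here):
import gensim
from gensim.parsing.preprocessing import STOPWORDS

doc = gensim.utils.simple_preprocess("The quick brown fox jumps over the lazy dog")
doc = [w for w in doc if w not in STOPWORDS]  # drop gensim's built-in English stop words
print(doc)  # stop words such as 'the' are gone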
from __future__ import print_function
import os
import os.path as osp
import re
import io
import numpy as np
import scipy.io as sio
import gensim
from gensim.models import Doc2Vec
"""process text with Doc2Vec
paser text in .xml files
ref:
- https://blog.csdn.net/HackerTom/article/details/117001560
"""
USER_ID = 1000  # user id on the host machine
# wiki
P = "/home/tom/dataset/wikipedia"
ID_MAP_F = osp.join(P, "id-map.wiki.txt")
TEXT_P = osp.join(P, "texts")
# doc2vec
MODEL = "/home/dataset/Doc2Vec/enwiki_dbow/doc2vec.bin"
start_alpha = 0.01
infer_epoch = 1000
DIM = 300 # dimension of the doc2vec feature
text_files = []
with open(ID_MAP_F, "r") as f:
for line in f:
_, txt_f, _, _ = line.strip().split()
text_files.append(txt_f)
print("#data:", len(text_files))
def parse(fn):
    res = ""
    flag = False
    with io.open(fn, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line == "</text>":
                break
            if flag:
                res += " " + line
            if line == "<text>":
                flag = True
    return res
model = Doc2Vec.load(MODEL)
texts = []
for txt_f in text_files:
    txt_f = osp.join(TEXT_P, "{}.xml".format(txt_f))
    # print(txt_f)
    doc = parse(txt_f)
    # print(doc)
    # (2023.1.11) the tokenization in [18] has been switched to Stanford CoreNLP,
    # but it is NOT updated here !!!
    # see [18] for the new text preprocessing
    doc = gensim.utils.simple_preprocess(doc)
    # print(doc)
    vec = model.infer_vector(doc)
    # print(vec.shape)  # (300,)
    texts.append(vec[np.newaxis, :])
    # break
texts = np.vstack(texts).astype(np.float32)
print("texts:", texts.shape, texts.dtype)  # (2866, 300) float32
_f_name = "texts.wiki.doc2vec.{}.mat".format(DIM)
sio.savemat(_f_name, {"texts": texts})
# runs inside docker (see [18]), so hand file ownership back to the host account
os.system("chown {0}:{0} {1}".format(USER_ID, _f_name))
self-made LDA (deprecated)
- (2021.12.12) The code of [26] has been released, and its experiments use LDA features of wikipedia. Verified with that code, the LDA features made in this section cannot reproduce the paper's results, so they are probably wrong; do not use them! The data [10] provided by [1] actually already contain LDA features that do reproduce the results of [26], see the next section.
- Following [23], process the texts into 10-D LDA (Latent Dirichlet Allocation) features, see [24].
- Note: no reference data was found to check against (otherwise there would be no need to make the features here), so there is no guarantee that this recipe is correct.
- The stop words list is downloaded from [25], file name stop_words_english.txt.
import io
import os
import os.path as osp
import numpy as np
import scipy.io as sio
import gensim
from gensim import corpora, models, similarities
P = "G:/wiki_top10cats"
ID_MAP_F = osp.join(P, "id-map.wiki.txt")
STOP_WORD_F = osp.join(P, "stop_words_english.txt")
TEXT_P = osp.join(P, "texts")
N_TOPIC = 10
print("stop words")
with open(STOP_WORD_F, "r", encoding='utf-8') as f:
stop_words = [line.strip() for line in f]
print("#stop word:", len(stop_words))
print("text 文件顺序:按前述 id map")
text_files = []
with open(ID_MAP_F, "r") as f:
for line in f:
_, txt_f, _, _ = line.strip().split()
text_files.append(txt_f)
print("#data:", len(text_files)) # 2866
def parse(fn):
    res = ""
    flag = False
    with io.open(fn, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line == "</text>":
                break
            if flag:
                res += " " + line
            if line == "<text>":
                flag = True
    return res
print("按顺序读 texts")
corpus = []
for txt_f in text_files:
    txt_f = osp.join(TEXT_P, "{}.xml".format(txt_f))
    # print(txt_f)
    doc = parse(txt_f)
    # print(doc)
    doc = gensim.utils.simple_preprocess(doc)
    doc = [w for w in doc if w not in stop_words]  # remove stop words
    # print(doc)
    corpus.append(doc)
print("corpus:", len(corpus)) # 2866
dictionary = corpora.Dictionary(corpus)
print("vocab:", len(dictionary)) # 63218
print("转 BoW")
bow_list = [dictionary.doc2bow(doc) for doc in corpus]
print(len(bow_list), type(bow_list[0])) # 2866 <class 'list'>
print("算 TF-IDF")
corpus_tfidf = models.TfidfModel(bow_list)[bow_list]
lda = models.LdaModel(corpus_tfidf, num_topics=N_TOPIC, id2word=dictionary,
alpha=0.01, eta=0.01, minimum_probability=0.001,
update_every=1, chunksize=100, passes=1)
print("应该是对每个主题的从属度?")
doc_topics = lda.get_document_topics(corpus_tfidf)
# 格式:[(topic_id, membership)]
print(doc_topics[0])
texts_lda = np.asarray(doc_topics)
print(texts_lda.shape)
texts_lda = texts_lda[:, :, 1]
# print(texts_lda[0])
# save
assert texts_lda.shape[1] == N_TOPIC
sio.savemat(osp.join(P, "texts.wiki.lda.{}.mat".format(N_TOPIC)), {"texts": texts_lda})
provided LDA
- [1] says it provides LDA features; they are in raw_features.mat from the download [10], already split into training & test set.
- The sample order is the same as in the sample order section above, i.e. following trainset_txt_img_cat.list and then testset_txt_img_cat.list.
- Verified with the code of [26], these features reproduce the paper's results, so they should be correct.
import os.path as osp
import numpy as np
import scipy.io as sio
P = "G:/wiki_top10cats"
DATA_F = osp.join(P, "raw_features.mat")
data = sio.loadmat(DATA_F)
print(list(data.keys())) # 'I_tr', 'I_te', 'T_tr', 'T_te'
print("先 train 后 test,同前面 sample order")
texts_lda = np.vstack([data["T_tr"], data["T_te"]])
print(texts_lda.shape) # (2866, 10)
sio.savemat(osp.join(P, "texts.wiki.lda.{}.mat".format(texts_lda.shape[1])), {"texts": texts_lda})
images
Copy all images into a single directory for convenience, then extract features with VGG16.
- Instead of copying, symbolic links also work, see [15,16,17] (a sketch follows after the copy loop below).
ALL_IMG_P = "images_all"
if not os.path.exists(ALL_IMG_P):
    os.makedirs(ALL_IMG_P)
"""copy everything into ALL_IMG_P"""
for cls in os.listdir(IMG_P):
    cls_d = join(IMG_P, cls)
    # print(os.listdir(cls_d))
    for img in os.listdir(cls_d):
        # os.system("cp {} {}".format(join(cls_d, img), ALL_IMG_P))  # linux
        os.system("copy {} {}".format(join(cls_d, img), ALL_IMG_P))  # windows
print(len(os.listdir(ALL_IMG_P)))
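A sketch of the symlink alternative mentioned above ([15-17]), assuming a Linux host (on Windows, symlinks usually need mklink or administrator rights); it reuses IMG_P and ALL_IMG_P from the copy loop:
# link instead of copy: each entry in ALL_IMG_P points back at the original file
for cls in os.listdir(IMG_P):
    cls_d = join(IMG_P, cls)
    for img in os.listdir(cls_d):
        dst = join(ALL_IMG_P, img)
        if not os.path.exists(dst):
            os.symlink(os.path.abspath(join(cls_d, img)), dst)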
"""提特征"""
base_model = VGG16(weights='imagenet')
# print(base_model.summary())
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)
# print(model.summary())
images = []
for i_name in ls_img:
img_f = join(ALL_IMG_P, "{}.jpg".format(i_name))
img = image.load_img(img_f, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
images.append(model.predict(x))
images = np.vstack(images)
print(images.shape)
# 保存
np.save("images.vgg16.npy", images)
Cloud Drive
The data (both the original files and the processed features) are shared on Baidu net-disk.
Link: https://pan.baidu.com/s/19pjYO5Uxsq2aiGFqofp-CQ, extraction code: gr9m.
References
- [1] A new approach to cross-modal multimedia retrieval
- [2] Semi-Supervised Cross-Modal Retrieval with Label Prediction
- [3] Generalized Semi-supervised and Structured Subspace Learning for Cross-Modal Retrieval
- [4] Caffe: Convolutional Architecture for Fast Feature Embedding
- [5] caffe/models/bvlc_reference_caffenet/train_val.prototxt
- [6] Distributed representations of words and phrases and their compositionality
- [7] Very Deep Convolutional Networks for Large-Scale Image Recognition
- [8] VGG16
- [9] gensim
- [10] Cross-Modal Multimedia Retrieval
- [11] xml.dom.minidom
- [12] keras预训练模型应用(3):VGG19提取任意层特征
- [13] 基于 Gensim 的 Word2Vec 实践
- [14] 用gensim学习word2vec
- [15] windows软链接
- [16] dos bat批量创建软链接
- [17] linux创建、删除文件夹的软链接
- [18] MS COCO 2017数据集预处理
- [19] Train Wiki Corpus by gensim Word2vec
- [20] TypeError: sequence item 0: expected a bytes-like object, str found
- [21] Gensim train word2vec on wikipedia - preprocessing and parameters
- [22] 利用Gensim在英文Wikipedia训练词向量
- [23] Deep Graph-neighbor Coherence Preserving Network for Unsupervised Cross-modal Hashing
- [24] 利用Python进行LDA特征提取
- [25] The list of stop words
- [26] Deep Graph-neighbor Coherence Preserving Network for Unsupervised Cross-modal Hashing