def predict(
    dim_proj=128,  # dimensionality of the word embeddings and of the LSTM hidden state
    patience=10,  # early stopping: stop if validation error has not improved for this many checks
    max_epochs=4000,  # maximum number of passes over the full training set
    dispFreq=10,  # print train/valid/test progress every `dispFreq` parameter updates
    decay_c=0.,  # L2 regularization weight applied to U (the hidden-to-output weights)
    lrate=0.0001,  # learning rate (only used by the sgd optimizer)
    n_words=10000,  # vocabulary size: word ids >= n_words are mapped to 1 during preprocessing
    optimizer=adadelta,  # update rule; sgd, adadelta and rmsprop are provided elsewhere in this module
    encoder='lstm',  # encoder tag; must be 'lstm' for this model
    saveto='model/lstm_model.npz',  # file where the best model (and its train/valid/test errors) is saved
    validFreq=370,  # compute the validation error after this number of updates
    saveFreq=1110,  # save the parameters after every `saveFreq` updates
    maxlen=100,  # sequences longer than this are discarded during preprocessing
    batch_size=16,  # minibatch size used for training
    valid_batch_size=64,  # minibatch size used for validation
    dataset='imdb',  # key into the global `datasets` dict selecting (load_data, prepare_data)
    noise_std=0.,  # NOTE(review): appears unused downstream
    use_dropout=True,  # enable dropout (slower, but usually generalizes better)
    reload_model=None,  # path of a saved parameter file to resume from (None = start fresh)
    test_size=-1,  # if positive, evaluate on only this many test samples
):
    """Load the IMDB data, build the LSTM model and print per-sample class
    probabilities for each training minibatch.

    All keyword arguments are captured into ``model_options`` via
    ``locals()`` and passed on to the model-building helpers.  The function
    only prints its results and returns ``None``.
    """
    # Snapshot every argument into a dict.  This MUST remain the first
    # statement: any local defined before it would leak into the options.
    model_options = locals().copy()
    print("model options", model_options)

    # get_dataset returns the two dataset helpers defined in imdb.py:
    # - load_data: unpickles imdb.pkl, drops sequences longer than `maxlen`
    #   and splits off a validation set according to `valid_portion`;
    # - prepare_data: transposes a batch to (time, samples) and pads it to a
    #   common length, returning (x, mask, y).
    load_data, prepare_data = get_dataset(dataset)

    print('Loading data')
    # train format: (list of word-index sequences, list of labels)
    train, valid = load_data(n_words=n_words, valid_portion=0.05, maxlen=maxlen)

    # Number of target classes: labels start at 0, hence the +1.
    ydim = numpy.max(train[1]) + 1
    model_options['ydim'] = ydim

    print('Building model')
    # Initialize all model parameters (init_params is defined at module scope).
    params = init_params(model_options)
    if reload_model:
        # Resume from a previously saved parameter file.  Was hard-coded to
        # 'lstm_model.npz', which ignored the `reload_model` argument; it now
        # honors the path the caller supplied.
        load_params(reload_model, params)
    tparams = init_tparams(params)

    # Build the symbolic computation graph.
    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
    for _, train_index in kf:
        # Select the random examples for this minibatch.
        y = [train[1][t] for t in train_index]
        x = [train[0][t] for t in train_index]
        # prepare_data pads and transposes: x becomes (maxlen_in_batch, n_samples).
        x, mask, y = prepare_data(x, y)
        print(x.shape)
        # NOTE(review): pred_probs scores the WHOLE training set on every
        # minibatch iteration, so the same probability table is printed once
        # per minibatch (visible in the sample output below the function).
        res = pred_probs(f_pred_prob, prepare_data, train, kf, True)
        print("【RES】", res.shape)
        for r in res:
            # The two class probabilities of one sample; they sum to 1.
            print("%s + %s = %f" % (r[0], r[1], float(r[0]) + float(r[1])))
运行结果
(82, 16)
16/38 samples classified
32/38 samples classified
38/38 samples classified
【RES】 (38, 2)
0.500000280282 + 0.499999719718 = 1.000000
0.500022365161 + 0.499977634839 = 1.000000
0.499994092235 + 0.500005907765 = 1.000000
0.500028769156 + 0.499971230844 = 1.000000
0.500006347231 + 0.499993652769 = 1.000000
0.49999745113 + 0.50000254887 = 1.000000
0.500004658485 + 0.499995341515 = 1.000000
0.500028498057 + 0.499971501943 = 1.000000
0.500004125404 + 0.499995874596 = 1.000000
0.500034840627 + 0.499965159373 = 1.000000
0.500010874254 + 0.499989125746 = 1.000000
0.500021216939 + 0.499978783061 = 1.000000
0.500021058867 + 0.499978941133 = 1.000000
0.50002712303 + 0.49997287697 = 1.000000
0.500010389075 + 0.499989610925 = 1.000000
0.500021262485 + 0.499978737515 = 1.000000
0.50002967502 + 0.49997032498 = 1.000000
0.500016653088 + 0.499983346912 = 1.000000
0.500026446793 + 0.499973553207 = 1.000000
0.50002089169 + 0.49997910831 = 1.000000
0.5000127288 + 0.4999872712 = 1.000000
0.500019416336 + 0.499980583664 = 1.000000
0.500029114262 + 0.499970885738 = 1.000000
0.500021365958 + 0.499978634042 = 1.000000
0.500013568593 + 0.499986431407 = 1.000000
0.500016869466 + 0.499983130534 = 1.000000
0.500021213203 + 0.499978786797 = 1.000000
0.500023013772 + 0.499976986228 = 1.000000
0.500015465529 + 0.499984534471 = 1.000000
0.500027792589 + 0.499972207411 = 1.000000
0.500019467854 + 0.499980532146 = 1.000000
0.500032826809 + 0.499967173191 = 1.000000
0.500023604551 + 0.499976395449 = 1.000000
0.500019634245 + 0.499980365755 = 1.000000
0.500024585975 + 0.499975414025 = 1.000000
0.500033565917 + 0.499966434083 = 1.000000
0.500028160005 + 0.499971839995 = 1.000000
0.500022054487 + 0.499977945513 = 1.000000
(83, 16)
16/38 samples classified
32/38 samples classified
38/38 samples classified
【RES】 (38, 2)
0.500000280282 + 0.499999719718 = 1.000000
0.500022365161 + 0.499977634839 = 1.000000
0.499994092235 + 0.500005907765 = 1.000000
0.500028769156 + 0.499971230844 = 1.000000
0.500006347231 + 0.499993652769 = 1.000000
0.49999745113 + 0.50000254887 = 1.000000
0.500004658485 + 0.499995341515 = 1.000000
0.500028498057 + 0.499971501943 = 1.000000
0.500004125404 + 0.499995874596 = 1.000000
0.500034840627 + 0.499965159373 = 1.000000
0.500010874254 + 0.499989125746 = 1.000000
0.500021216939 + 0.499978783061 = 1.000000
0.500021058867 + 0.499978941133 = 1.000000
0.50002712303 + 0.49997287697 = 1.000000
0.500010389075 + 0.499989610925 = 1.000000
0.500021262485 + 0.499978737515 = 1.000000
0.50002967502 + 0.49997032498 = 1.000000
0.500016653088 + 0.499983346912 = 1.000000
0.500026446793 + 0.499973553207 = 1.000000
0.50002089169 + 0.49997910831 = 1.000000
0.5000127288 + 0.4999872712 = 1.000000
0.500019416336 + 0.499980583664 = 1.000000
0.500029114262 + 0.499970885738 = 1.000000
0.500021365958 + 0.499978634042 = 1.000000
0.500013568593 + 0.499986431407 = 1.000000
0.500016869466 + 0.499983130534 = 1.000000
0.500021213203 + 0.499978786797 = 1.000000
0.500023013772 + 0.499976986228 = 1.000000
0.500015465529 + 0.499984534471 = 1.000000
0.500027792589 + 0.499972207411 = 1.000000
0.500019467854 + 0.499980532146 = 1.000000
0.500032826809 + 0.499967173191 = 1.000000
0.500023604551 + 0.499976395449 = 1.000000
0.500019634245 + 0.499980365755 = 1.000000
0.500024585975 + 0.499975414025 = 1.000000
0.500033565917 + 0.499966434083 = 1.000000
0.500028160005 + 0.499971839995 = 1.000000
0.500022054487 + 0.499977945513 = 1.000000
(33, 6)
16/38 samples classified
32/38 samples classified
38/38 samples classified
【RES】 (38, 2)
0.500000280282 + 0.499999719718 = 1.000000
0.500022365161 + 0.499977634839 = 1.000000
0.499994092235 + 0.500005907765 = 1.000000
0.500028769156 + 0.499971230844 = 1.000000
0.500006347231 + 0.499993652769 = 1.000000
0.49999745113 + 0.50000254887 = 1.000000
0.500004658485 + 0.499995341515 = 1.000000
0.500028498057 + 0.499971501943 = 1.000000
0.500004125404 + 0.499995874596 = 1.000000
0.500034840627 + 0.499965159373 = 1.000000
0.500010874254 + 0.499989125746 = 1.000000
0.500021216939 + 0.499978783061 = 1.000000
0.500021058867 + 0.499978941133 = 1.000000
0.50002712303 + 0.49997287697 = 1.000000
0.500010389075 + 0.499989610925 = 1.000000
0.500021262485 + 0.499978737515 = 1.000000
0.50002967502 + 0.49997032498 = 1.000000
0.500016653088 + 0.499983346912 = 1.000000
0.500026446793 + 0.499973553207 = 1.000000
0.50002089169 + 0.49997910831 = 1.000000
0.5000127288 + 0.4999872712 = 1.000000
0.500019416336 + 0.499980583664 = 1.000000
0.500029114262 + 0.499970885738 = 1.000000
0.500021365958 + 0.499978634042 = 1.000000
0.500013568593 + 0.499986431407 = 1.000000
0.500016869466 + 0.499983130534 = 1.000000
0.500021213203 + 0.499978786797 = 1.000000
0.500023013772 + 0.499976986228 = 1.000000
0.500015465529 + 0.499984534471 = 1.000000
0.500027792589 + 0.499972207411 = 1.000000
0.500019467854 + 0.499980532146 = 1.000000
0.500032826809 + 0.499967173191 = 1.000000
0.500023604551 + 0.499976395449 = 1.000000
0.500019634245 + 0.499980365755 = 1.000000
0.500024585975 + 0.499975414025 = 1.000000
0.500033565917 + 0.499966434083 = 1.000000
0.500028160005 + 0.499971839995 = 1.000000
0.500022054487 + 0.499977945513 = 1.000000
说明:测试样本数为 38,预处理方式与训练时相同,输入格式为
[(样本一,[词索引]),(样本二,[词索引]),(样本三,[词索引]),…]
mini_batch = 16,因此共迭代 3 次(16 + 16 + 6 = 38)。返回结果是每个样本属于两个类别的概率,二者之和为 1.0。从上面的结果可以看出,两类概率都非常接近 0.5,说明模型几乎没有判别能力,效果非常差。