## Hyperparameter tuning of a fasttext model with Keras Tuner
# pip install keras-tuner==1.0.1
from kerastuner.tuners import RandomSearch
from kerastuner.tuners import BayesianOptimization
### Uses keras from tensorflow; the tensorflow version paired with keras tuner is still 1.14.0
# joblib is used for saving/loading Python objects
# Tokenizer is keras' vocabulary mapper
# get_data_labels loads the sample csv into memory
import os
from sklearn.externals import joblib
from keras.preprocessing.text import Tokenizer
from data_analysis import get_data_labels
def word_map(csv_path, tokenizer_path, cut_num):
    """Fit a vocabulary mapper on the training texts and map them to id sequences.

    :param csv_path: path of the training-sample csv
    :param tokenizer_path: path where the fitted Tokenizer is persisted
    :param cut_num: number of trailing samples dropped to balance pos/neg 1:1
    :return: (x_train, y_train) — integer-id sequences and their labels
    """
    # Load the lightly preprocessed texts and labels from the csv.
    texts, labels = get_data_labels(csv_path)
    # Drop the trailing cut_num samples so positives/negatives are 1:1.
    texts = texts[:-cut_num]
    labels = labels[:-cut_num]
    # Fit a word-level vocabulary mapper on the corpus.
    tokenizer = Tokenizer(num_words=None, char_level=False)
    tokenizer.fit_on_texts(texts)
    # Persist the fitted mapper so prediction can reuse the same vocabulary.
    joblib.dump(tokenizer, tokenizer_path)
    # Convert every text into its sequence of vocabulary ids.
    x_train = tokenizer.texts_to_sequences(texts)
    y_train = labels
    return x_train, y_train
from keras.preprocessing import sequence
# cutlen: from the sentence-length analysis of the corpus, the shortest
# length that still covers ~90% of the sentences.
cutlen = 60
def padding(x_train, cutlen):
    """Pad/truncate every sequence in x_train to exactly cutlen entries."""
    padded = sequence.pad_sequences(x_train, cutlen)
    return padded
import numpy as np
# Upper bound on the base vocabulary ids; must be larger than the corpus'
# largest word index so n-gram ids (allocated above it) never collide.
max_features = 25000
# Highest n-gram order to extract as features; 2 (bi-grams) is the usual choice.
ngram_range = 2
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract the set of n-gram tuples from a sequence.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    """
    # Build ngram_value staggered views of the list and zip them together;
    # each zipped tuple is one n-gram.
    shifted_views = [input_list[offset:] for offset in range(ngram_value)]
    return set(zip(*shifted_views))
def get_ti_and_nmf(x_train, ti_path, ngram_range, max_features=25000):
    """Build the n-gram -> feature-id mapping and the enlarged feature count.

    Example mapping shape: token_indice = {(1, 3): 25001, (9, 2): 25002, ...}

    :param x_train: iterable of integer-id sequences (already tokenized)
    :param ti_path: file the mapping is written to (``str(dict)`` text) so the
                    prediction side can reload the exact same mapping
    :param ngram_range: highest n-gram order to extract (>= 2)
    :param max_features: size of the base vocabulary; n-gram ids start strictly
                         above it to avoid collisions.  Default 25000 matches the
                         module-level constant, so existing 3-arg calls behave
                         exactly as before (previously this was a hidden global).
    :return: (token_indice, new_max_features)
    """
    # Collect every n-gram of order 2..ngram_range occurring in the corpus.
    ngram_set = set()
    for input_list in x_train:
        for n in range(2, ngram_range + 1):
            # n staggered views zipped together yield all order-n n-grams.
            ngram_set.update(set(zip(*[input_list[i:] for i in range(n)])))
    # Drop the all-padding n-gram so padding never becomes a feature.
    ngram_set.discard(tuple([0] * ngram_range))
    # Map each n-gram to an integer id starting after the base vocabulary.
    start_index = max_features + 1
    token_indice = {ngram: idx + start_index
                    for idx, ngram in enumerate(ngram_set)}
    # Persist the mapping as plain text for the prediction side.
    with open(ti_path, "w") as f:
        f.write(str(token_indice))
    # New feature count = one past the largest id actually assigned
    # (builtin max replaces the previous np.max-over-list detour).
    new_max_features = max(token_indice.values()) + 1
    return token_indice, new_max_features
def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Append the feature ids of known n-grams to each sequence.

    Example: adding bi-grams
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    :param sequences: iterable of id sequences (plain lists or numpy rows)
    :param token_indice: n-gram tuple -> feature-id mapping
    :param ngram_range: highest n-gram order to look up
    :return: list of per-sample lists with the n-gram ids appended

    Fixes vs. the original: ``input_list[:].tolist()`` crashed on plain
    Python lists (including the doctest input above); ``list(...)`` copies
    both numpy rows and lists.  The result is returned as a plain list of
    lists — the rows have differing lengths after appending, so wrapping
    them in ``np.array`` fails on modern numpy; downstream ``align``/
    ``padding`` only need len() and iteration and accept lists.
    """
    new_sequences = []
    # Each input_list represents one sentence as a sequence of ids.
    for input_list in sequences:
        # Copy first; appended ids must not mutate the caller's data.
        new_list = list(input_list)
        # Look up every n-gram order from 2 up to ngram_range.
        for ngram_value in range(2, ngram_range + 1):
            # Slide a window of ngram_value over the sequence.
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i: i + ngram_value])
                # Known n-grams contribute their feature id at the end.
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences
def align(x_train):
    """Pad every sequence up to the length of the longest one.

    :param x_train: iterable of id sequences
    :return: (padded x_train, maxlen actually used)
    """
    # The longest sequence in the batch sets the padding target.
    maxlen = max(len(seq) for seq in x_train)
    return padding(x_train, maxlen), maxlen
import tensorflow as tf
import tensorflow.keras as keras
from kerastuner import HyperModel
# Embedding dimension was fixed at 50 before tuning:
# embedding_dims = 50
class FasttextModel(HyperModel):
    """Keras-Tuner HyperModel that builds a tunable fasttext-style classifier."""

    def __init__(self, maxlen, new_max_features):
        # Initialise the HyperModel base class; the original skipped this,
        # which subclasses of HyperModel are expected to do.
        super().__init__()
        # Input sequence length after padding/alignment.
        self.maxlen = maxlen
        # Vocabulary size including the n-gram feature ids.
        self.new_max_features = new_max_features

    def _model_build(self, hp):
        """Build and compile one candidate model for a hyperparameter sample.

        :param hp: keras-tuner HyperParameters object used to sample choices
        :return: a compiled keras Sequential model
        """
        # Start from an empty sequential model.
        model = keras.models.Sequential()
        # First layer maps word ids to vectors; the embedding width itself
        # is a tuned hyperparameter (50 or 100).
        model.add(keras.layers.Embedding(self.new_max_features,
                                         hp.Choice('embedding_dims', values=[50, 100]),
                                         input_length=self.maxlen))
        # Global average pooling keeps the parameter count small and helps
        # against overfitting (the fasttext approach).
        model.add(keras.layers.GlobalAveragePooling1D())
        # Single sigmoid unit for binary classification.
        model.add(keras.layers.Dense(1, activation='sigmoid'))
        # The optimizer is also tuned, between Adam and Adagrad.
        model.compile(loss='binary_crossentropy',
                      optimizer=hp.Choice('optimizer', values=['Adam', 'Adagrad']),
                      metrics=['accuracy'])
        return model
### Checkpointing and TensorBoard (callbacks below are left disabled)
log_dir = "./log_dir"
# tb_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
# profile_batch='10, 15')
### Directory the checkpoints would be written to
# checkpoint_dir = './training_checkpoints'
# Checkpoint file-name pattern (one file per epoch)
# checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
# filepath=checkpoint_prefix,
# save_weights_only=True)
# batch_size: number of samples per parameter update
batch_size = 32
# epochs: number of full passes over the training data
epochs = 10
def model_fit(x_train, y_train, maxlen, new_max_features):
    """Run a Bayesian hyperparameter search and return the best model found.

    :param x_train: padded training sequences
    :param y_train: training labels
    :param maxlen: input length fed to the Embedding layer
    :param new_max_features: vocabulary size including n-gram ids
    :return: the single best model produced by the tuner
    """
    build_fn = FasttextModel(maxlen, new_max_features)._model_build
    # A RandomSearch tuner with the same settings was tried first:
    # tuner = RandomSearch(
    #     build_fn,
    #     objective='val_acc',
    #     max_trials=8,
    #     executions_per_trial=2,
    #     directory='log_dir',
    #     project_name='RStune')
    tuner = BayesianOptimization(
        build_fn,
        # metric the tuner optimises (TF 1.x name; newer keras uses 'val_accuracy')
        objective='val_acc',
        # total number of hyperparameter combinations to try
        max_trials=8,
        # trainings per combination; their scores are averaged
        executions_per_trial=2,
        directory='log_dir',
        project_name='BOtune')
    # Launch the search; 20% of the data is held out for validation.
    tuner.search(x_train, y_train,
                 batch_size=batch_size,
                 epochs=epochs,
                 validation_split=0.2)
    # callbacks=[tb_callback, checkpoint_callback])
    print("******************************")
    tuner.results_summary()
    # Best-performing model of the whole search.
    return tuner.get_best_models(num_models=1)[0]
"""
[Trial summary]
|-Trial ID: c6893c813541e06b645eb138e7107f3a
# 每次尝试(N epoch)的最优结果的均值为Score
|-Score: 0.0016163793625310063
|-Best step: 0
> Hyperparameters:
|-learning_rate: 0.0001
"""
def model_save(save_path, model):
    """Persist a trained model by delegating to its own ``save`` method.

    :param save_path: destination path for the serialized model
    :param model: object exposing ``save(path)`` (e.g. a keras model)
    :return: None
    """
    model.save(save_path)
def model_load(save_path, sample):
    """Load a previously saved model and run prediction on the given sample.

    :param save_path: path of a model written by ``model_save``
    :param sample: input batch to score
    :return: the model's prediction result
    """
    # Rebuild the full model (architecture + weights) from disk,
    # then score the sample with it.
    loaded = keras.models.load_model(save_path)
    return loaded.predict(sample)
if __name__ == "__main__":
csv_path = "./movie/sample.csv"
tokenizer_path = "./movie/Tokenizer"
cut_num = 2525
x_train, y_train = word_map(csv_path, tokenizer_path, cut_num)
# print(x_train)
# print(y_train)
x_train = padding(x_train, cutlen)
# print(x_train)
input_list = [1, 4, 9, 4, 1, 4]
ns = create_ngram_set(input_list)
# print(ns)
ti_path = "./movie/token_indice"
token_indice, new_max_features = get_ti_and_nmf(x_train, ti_path, ngram_range)
# print(token_indice, new_max_features)
x_train = add_ngram(x_train, token_indice, ngram_range)
x_train, maxlen = align(x_train)
model = model_fit(x_train, y_train, maxlen, new_max_features)
# print("打开tensorboard命令:" + "tensorboard --logdir ./log_dir/ --host 0.0.0.0 --port 8789")
save_path = "./movie/model.h5"
model_save(save_path, model)
# (Blog-post residue, commented out so the file parses)
# Hyperparameter tuning of a fasttext model with Keras Tuner
# First published 2020-06-20 22:04:05