由于NLP相关数据处理和训练过程中经常用到相同代码,现将常用模块记录如下,部分内容需要根据自己的需求进行更改。
1、导入模块
import os
import random
from pathlib import Path

import gensim
import keras
import keras.backend.tensorflow_backend as KTF
import numpy as np
import tensorflow as tf
from keras.layers import *
from keras.optimizers import SGD, Adam
2、显存使用限制
print(tf.__version__)
# 对于 tensorflow1.x系列用这个
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.02
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
KTF.set_session(sess)
# # tensorflow2.x系列可以使用这个
# gpus = tf.config.experimental.list_physical_devices('GPU')
# for gpu in gpus:
# tf.config.experimental.set_memory_growth(gpu,True)
3、数据迭代器(训练和预测都可以使用的迭代器)
# 数据处理
class DataProcess(object):
def __init__(self, data=None, batch_size=None, label2id=None, max_len=None, if_shuff=True, if_training=True):
"""
max_len:模型允许最大长度
if_train:是否训练阶段,预测阶段只需返回X
"""
self.data = data
self.batch_size = batch_size
self.label2id = label2id
self.max_len = max_len
self.shuffle_flag = if_shuff
self.data_len = len(self.data) if data else 1
self.training_flag=if_training
self.steps = self.data_len // self.batch_size
if self.data_len % self.batch_size != 0:
self.steps += 1
def __len__(self):
"""
在fit_generator中 返回迭代步数
"""
return self.steps
def sequence_padding(self, X, padding=0):
"""
用来统一批次最大长度
"""
# 计算批次最大长度
ML = max([len(x) for x in X])
# 如果ML超出最大长度
if ML > self.max_len and self.max_len:
ML = self.max_len
outputs = np.array([np.concatenate([x, [padding] * (ML - len(x))])
if len(x) < ML else x[:ML] for x in X])
return outputs
def trans(self, line):
"""
数据预处理过程
"""
if self.training_flag:
l, line = line
else:
line = line
......
if self.training_flag:
return np.array(x_5), np.array(l, dtype='int32')
else:
return np.array(x_5)
def __iter__(self, input_line=None):
"""
数据迭代器:
input_line:用来做预测推理的时候输入
if_training=True,即训练阶段返回 x,y
if_training=False,即预测阶段只返回 x
"""
while True:
if input_line:
self.data = input_line
self.data_len = len(input_line)
self.shuffle_flag = False
iter_data = self.data
# 是否打乱,预测时不打乱更方便
if self.shuffle_flag:
random.shuffle(iter_data)
X, Y = [], []
for line_num, line in enumerate(iter_data):
if self.training_flag:
x, label = self.trans(line)
else:
x = self.trans(line)
X.append(x)
if self.training_flag:
Y.append(label)
else:
pass
if len(X) == self.batch_size or line_num == self.data_len - 1:
X = np.array(X)
# print('X', X.shape)
if self.training_flag:
Y = np.array(Y)
# print('Y', Y.shape)
yield X, Y
X, Y = [], []
else:
yield X
X, Y = [], []
查看数据
train_iter = DataProcess(train_db,batch_size=10,label2id=labels, max_len=30)
dev_iter = DataProcess(dev_db,batch_size=10,label2id=labels,max_len=30)
for i in train_iter:
print(i[0][0].shape)
print(i[0][1].shape)
break
next(train_iter.__iter__())[1][0]
4、构建模型
def build_model():
x = Input(shape=(1,), name='position_vec')
......
tn_model = keras.models.Model(input=[x], output=[out])
return tn_model
tn_model = build_model()
adam = Adam(0.001)
tn_model.compile(loss=keras.losses.sparse_categorical_crossentropy,optimizer=adam,metrics=['accuracy'])
tn_model.summary()
5、回调函数
checkpoint = keras.callbacks.ModelCheckpoint('./gensim_keras_model.ckpt', monitor='val_acc',verbose=1,
save_best_only=True,save_weights_only=True,mode='max')
tn_model.fit_generator(train_iter.__iter__(),
steps_per_epoch=len(train_iter),
epochs=10,
validation_data=dev_iter.__iter__(),
validation_steps=len(dev_iter),
callbacks=[checkpoint])
6、预测函数
test_data = [l[1] for l in train_db]
# 预测模块
test_iter = DataProcess(test_data,batch_size=1,label2id=labels, max_len=30, if_training=False)
l = tn_model.predict_generator(test_iter.__iter__(), steps=len(test_iter),verbose=1)
l
单条调用,测试速度
import time
steps = test_iter.__len__()
for n, test_line in enumerate(test_iter.__iter__()):
if n >= steps:
break
print(n)
a = time.time()
result = tn_model.predict(test_line)
print(time.time() - a)
# print(result)