题目:
数据包含2个csv文件:
train_set.csv:此数据集用于训练模型,每一行对应一篇文章。
文章分别在“字”和“词”的级别上做了脱敏处理。共有四列:
第一列是文章的索引(id);
第二列是文章正文在“字”级别上的表示,即字符相隔正文(article);
第三列是在“词”级别上的表示,即词语相隔正文(word_seg);
第四列是这篇文章的标注(class)。
注:每一个数字对应一个“字”,或“词”,或“标点符号”。“字”的编号与“词”的编号是独立的!
test_set.csv:此数据用于测试。数据格式同train_set.csv,但不包含class。 注:test_set与train_set中文章id的编号是独立的。
1. 数据处理
数据说明中显示:'article'是字级别上的,'word_seg'是词级别上的。
比赛举办方已经把单词给我们切好了,不需要自己手动分词(如用“结巴分词”等工具),而且他已经把单词数字化(脱敏),这其实也省了我们一些工作。
一般的比赛我们是要自己分词,而且分词的效果对模型和结果影响较大
1.1 导入数据
import pandas as pd

# Load the two competition csv files; both share the column layout
# (id, article, word_seg [, class]).  sep=',' and header=0 are the
# pandas defaults, so they are omitted here.
train = pd.read_csv('new_data/train_set.csv')
test = pd.read_csv('new_data/test_set.csv')
train.info()
test.info()

# The raw labels are 1-based (they are restored with +1 before
# submission), so shift them to a 0-based range for training.
y_train = train['class'] - 1
df_train = train.drop('class', axis=1)
df_test = test
训练集的类别标签减1(有解释说减1方便计算):原始标签是1~19,减1后变为0~18。很多库约定类别编号从0开始(例如lightgbm的multiclass、keras的to_categorical),所以先减1训练,预测后再加1还原即可。
2. 特征向量化
2.1 特征向量化 tfidf
采用sklearn中的TFIDF进行向量化,tfidf的介绍见上一篇blog
将数据集中的字符文本转换成数字向量,以便计算机能够进行处理(一段文字 —> 一个向量)
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the anonymised, pre-tokenised word ids.
# ngram_range=(1, 2): unigrams + bigrams; min_df/max_df prune very rare
# and near-ubiquitous terms; sublinear_tf applies 1 + log(tf) damping.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    use_idf=1,
    smooth_idf=0,
    sublinear_tf=1,
)

# Fit the vocabulary/idf on the training split only, then project both
# splits into the same feature space.  (An alternative is to fit on the
# concatenation of train and test, as discussed below.)
vectorizer.fit(df_train['word_seg'])
X_train = vectorizer.transform(df_train['word_seg'])
X_test = vectorizer.transform(df_test['word_seg'])
使用了df_all = pd.concat(objs=[df_train, df_test], axis=0),然后再用df_all进行fit,我觉得后面这种效果可能好一点,因为覆盖了全部的样本
#保存至本地
# Persist the TF-IDF features so a kernel crash doesn't cost the
# vectorisation work; later steps simply reload this pickle.
import pickle

data = (X_train, y_train, X_test)
# BUG FIX: the original ended with `fp.close` (no parentheses), so the
# method was never called and the file was never explicitly closed;
# a `with` block guarantees flush + close even on error.
with open('mid_files/data_tfidf.pkl', 'wb') as fp:
    pickle.dump(data, fp)
关键步骤后将数据保存到本地,防止kernel 崩掉后一无所有,下次直接导入已处理好的文件即可。
2.2 将tfidf特征降维为lsa特征, 并保存
print("2.1 将tfidf特征降维为lsa特征")
from sklearn.decomposition import TruncatedSVD

print("0 读取tfidf特征")
# Reload the TF-IDF features saved in the previous step.
with open('mid_files/data_tfidf.pkl', 'rb') as f_tfidf:
    X_train, y_train, X_test = pickle.load(f_tfidf)

print("1 将tfidf特征降维为lsa特征")
# LSA = TruncatedSVD applied to the TF-IDF matrix; keep 200 components.
# Fit on train only, then project the test set into the same space.
lsa = TruncatedSVD(n_components=200)
X_train = lsa.fit_transform(X_train)
X_test = lsa.transform(X_test)

print("2 将lsa特征保存至本地")
# BUG FIX: the original wrote `data = (x_train, y_train, x_test)` —
# lower-case x_train/x_test are undefined here and raise NameError; the
# reduced matrices are X_train/X_test.
data = (X_train, y_train, X_test)
with open('mid_files/data_tfidf_lsa.pkl', 'wb') as f_data:
    pickle.dump(data, f_data)
2.3 将tfidf特征降维为lda特征, 并保存
print("2.2 将tfidf特征降维为lda特征")
from sklearn.decomposition import LatentDirichletAllocation

print("0 读取tfidf特征")
# Reload the TF-IDF features saved earlier.
with open('mid_files/data_tfidf.pkl', 'rb') as f_tfidf:
    X_train, y_train, X_test = pickle.load(f_tfidf)

print("1 特征降维:lda")
# LDA topic model with 200 topics; documents are represented by their
# topic distributions.  Fit on train, transform both splits.
lda = LatentDirichletAllocation(n_components=200)
X_train = lda.fit_transform(X_train)
X_test = lda.transform(X_test)

print("2 将lda特征保存至本地")
# BUG FIX: the original saved undefined lower-case x_train/x_test
# (NameError); the transformed matrices are X_train/X_test.
data = (X_train, y_train, X_test)
with open('mid_files/data_tfidf_lda.pkl', 'wb') as f_data:
    pickle.dump(data, f_data)
3. 训练分类器
3.1. 线性支持向量分类LinearSVC()
from sklearn.svm import LinearSVC

print("模型训练")
# Linear SVM on the sparse TF-IDF features; default hyper-parameters.
# The name `classifier` is reused by the prediction step below.
classifier = LinearSVC()
classifier.fit(X_train, y_train)
4. 预测并保存文本
# Predict a (0-based) class for every test article with the trained
# classifier, shift back to the original 1-based labels, and write the
# submission file (columns: id, class).
print("测试结果")
y_test = classifier.predict(X_test)

print("结果保存")
df_test['class'] = [label + 1 for label in y_test.tolist()]
df_result = df_test.loc[:, ['id', 'class']]
df_result.to_csv('submission_LinearSVC.csv', index=False)
5. 改进:K 折交叉验证
针对每一个模型都进行5折交叉验证,目前可以用逻辑回归、LinearSVC、lightgbm、贝叶斯模型进行分析。并对K折预测的结果进行融合,融合策略采用投票机制。
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,recall_score,precision_score
import lightgbm as lgb
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
import time
def train_model(X, X_test, y, folds, params=None, model_type='LSVC', plot_feature_importance=False):
    """K-fold training/evaluation wrapper for several classifier families.

    Parameters
    ----------
    X, y : training features / labels (indexable by the fold index arrays).
    X_test : test features; predicted once per fold.
    folds : an sklearn splitter (KFold / StratifiedKFold / RepeatedKFold...).
    params : lightgbm parameter dict (used only when model_type == 'lgb').
    model_type : one of 'LSVC', 'lr', 'mnb', 'gnb', 'bnb', 'lgb'.
    plot_feature_importance : kept for interface compatibility; unused.

    Returns
    -------
    (prediction, y_val_pred_stack, y_val_stack) where `prediction` holds
    one column of test-set predictions per fold.
    """
    iteration = 300
    # BUG FIX: size the prediction matrix from the splitter itself.  The
    # original hard-coded n_fold=5, nrepeats=1, so a caller passing
    # RepeatedKFold(n_repeats=2) overflowed the column index.
    n_splits_total = folds.get_n_splits(X, y)
    prediction = np.zeros((X_test.shape[0], n_splits_total))
    # BUG FIX: f1_scores / accuracy_scores were appended to but never
    # initialised (NameError on the first fold).
    f1_scores = []
    accuracy_scores = []
    y_val_pred_stack = []
    y_val_stack = []
    fold_n = 0
    # BUG FIX: the original loop bound train_index/valid_index but the
    # body indexed with trn_index/val_index; the names are unified here.
    for trn_index, val_index in folds.split(X, y):
        print('Fold', fold_n, 'started at', time.ctime())
        X_trn, X_val = X[trn_index], X[val_index]
        y_trn, y_val = y[trn_index], y[val_index]
        if model_type == 'LSVC':
            model = LinearSVC()
            # BUG FIX: fit on the fold's training part (the original fit
            # on the full X_train/y_train and predicted an undefined
            # X_valid variable).
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        # BUG FIX: the naive-Bayes branches below tested the misspelled
        # name `mode_type`, which is undefined (NameError).
        elif model_type == 'lr':
            model = LogisticRegression()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'mnb':
            model = MultinomialNB()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'gnb':
            model = GaussianNB()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'bnb':
            model = BernoulliNB()
            model.fit(X_trn, y_trn)
            y_val_pred = model.predict(X_val)
            y_pred = model.predict(X_test)
        elif model_type == 'lgb':
            trn_data = lgb.Dataset(X_trn, label=y_trn)
            val_data = lgb.Dataset(X_val, label=y_val)
            # NOTE(review): the feval callback f1_score_valid is not
            # defined anywhere in this file — it must be supplied by the
            # caller's environment.
            model = lgb.train(params, trn_data, iteration,
                              valid_sets=[trn_data, val_data],
                              feval=f1_score_valid, verbose_eval=5,
                              early_stopping_rounds=10)
            # BUG FIX: for a multiclass objective, lightgbm's predict()
            # returns an (n_samples, num_class) probability matrix; take
            # the argmax so outputs line up with the sklearn branches.
            y_val_pred = np.argmax(
                model.predict(X_val, num_iteration=model.best_iteration), axis=1)
            y_pred = np.argmax(
                model.predict(X_test, num_iteration=model.best_iteration), axis=1)
        y_val_pred_stack.append(y_val_pred)
        y_val_stack.append(y_val)
        # BUG FIX: average='mocro' -> 'macro'; accuracy_score takes no
        # `average` keyword, so it is dropped.
        f1_scores.append(f1_score(np.array(y_val), y_val_pred, average='macro'))
        accuracy_scores.append(accuracy_score(np.array(y_val), y_val_pred))
        prediction[:, fold_n] = y_pred
        fold_n += 1
    print('CV mean f1_scores: {0:.4f}, std: {1:.4f}.'.format(np.mean(f1_scores), np.std(f1_scores)))
    print('CV mean accuracy_scores: {0:.4f}, std: {1:.4f}.'.format(np.mean(accuracy_scores), np.std(accuracy_scores)))
    return prediction, y_val_pred_stack, y_val_stack
from sklearn.model_selection import RepeatedKFold

# 5-fold CV repeated twice -> 10 train/valid splits in total.
nfolds = 5
nrepeats = 2
folds = RepeatedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=11)
# BUG FIX: train_model returns a 3-tuple; the original bound the whole
# tuple to `prediction`, which breaks the later per-column voting step.
# (The original also set an unused iteration = 3000 here; removed.)
prediction, y_val_pred_stack, y_val_stack = train_model(
    X_train, X_test, y_train, folds,
    params=None, model_type='lr', plot_feature_importance=False)
5.1 LinearSVC 模型交叉验证后的f1_score是0.7222,如下图:
5.2 LR 模型交叉验证后的f1_score是0.6998,如下图:
5.3 lightgbm 模型交叉验证后的f1_score是0.7255,如下图:
在lightgbm模型中,针对多分类问题, ‘objective’应是’multiclass’; 也要写清楚 ‘num_class’:19, (有多少类就写多少,这里标签减1后共有19个类,即0~18)
import lightgbm as lgb

# Multiclass objective: the labels were shifted to 0..18, so there are
# 19 classes in total.
params = {
    'objective': 'multiclass',
    'num_class': 19,
    'boosting': 'gbdt',
    'num_threads': 1,
    'learning_rate': 0.3,
    'num_leaves': 31,
    'max_depth': 8,
    'max_bin': 200,
    'lambda_l1': 0,
    'lambda_l2': 0,
}
# BUG FIX: unpack the 3-tuple returned by train_model (the original kept
# the whole tuple in `prediction`, breaking the later voting step).
prediction, y_val_pred_stack, y_val_stack = train_model(
    X_train, X_test, y_train, folds,
    params, model_type='lgb', plot_feature_importance=False)
5.5 RCNN:
# 分别对应文中的公式(1)-(7)
print('model')
print('model')
# RCNN; each line corresponds to one numbered equation in the paper.
# NOTE(review): l_embedding / r_embedding / doc_embedding / document /
# left_context / right_context and the padded sequence arrays must be
# defined by earlier (not shown) preprocessing code.
forward = LSTM(256, return_sequences=True)(l_embedding)                       # Eq. (1)
backward = LSTM(256, return_sequences=True, go_backwards=True)(r_embedding)   # Eq. (2)
# BUG FIX: in the original this statement was fused onto the end of the
# `backward = ...` line (copy/paste artifact), a syntax error.
together = concatenate([forward, doc_embedding, backward], axis=2)            # Eq. (3)
semantic = TimeDistributed(Dense(128, activation="tanh"))(together)           # Eq. (4)
pool_rnn = Lambda(lambda x: backend.max(x, axis=1), output_shape=(128,))(semantic)  # Eq. (5)
# BUG FIX: likewise, this statement was fused onto the `pool_rnn = ...` line.
output = Dense(19, activation="softmax")(pool_rnn)                            # Eqs. (6)-(7)
model = Model(inputs=[document, left_context, right_context], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit([X_train_padded_seqs, left_train_padded_seqs, right_train_padded_seqs],
          y_train, batch_size=32, epochs=1,
          validation_data=([X_test_padded_seqs, left_test_padded_seqs, right_test_padded_seqs], y_test))
model.save(model_path + 'rcnn.h5')
还在跑中,,,
5.6 textCNN:
TextCNN的详细过程原理图见下:
TextCNN详细过程:第一层是图中最左边的7乘5的句子矩阵,每行是词向量,维度=5,这个可以类比为图像中的原始像素点了。然后经过有 filter_size=(2,3,4) 的一维卷积层,每个filter_size 有两个输出 channel。第三层是一个1-max pooling层,这样不同长度句子经过pooling层之后都能变成定长的表示了,最后接一层全连接的 softmax 层,输出每个类别的概率。
import tensorflow as tf
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
class TextCNN(object):
    """TextCNN text classifier (TensorFlow 1.x graph style).

    Embedding layer -> parallel 1-D convolutions with several filter
    widths -> 1-max pooling over time -> three fully connected layers
    producing logits.
    """

    def __init__(self):
        self.text_length = 600            # input sequence length
        # NOTE(review): attribute keeps the original's misspelled name
        # ("num_classer") because train()/predict() elsewhere read it.
        self.num_classer = 10             # number of output classes
        self.vocab_size = 5000            # vocabulary size
        self.word_vec_dim = 64            # embedding dimension
        self.filter_width = 2             # single width (unused by model_1)
        self.filter_width_list = [2, 3, 4]  # conv widths used by model_1
        self.num_filters = 5              # filters per width
        self.dropout_prob = 0.5           # dropout keep probability
        self.learning_rate = 0.005
        self.iter_num = 10                # training epochs
        self.batch_size = 64              # mini-batch size
        self.model_save_path = './model/'
        self.model_name = 'textcnn_model'
        # NOTE(review): creating the variable in __init__ means a second
        # TextCNN() in the same default graph would hit a variable-reuse
        # error — acceptable for this one-shot script.
        self.embedding = tf.get_variable('embedding', [self.vocab_size, self.word_vec_dim])
        self.fc1_size = 32                # neurons in FC layer 1
        self.fc2_size = 64                # neurons in FC layer 2
        self.fc3_size = 10                # output layer; must equal num_classer

    def get_weight(self, shape, regularizer):
        """Create a truncated-normal weight tensor and register its L2
        penalty in the 'losses' collection."""
        w = tf.Variable(tf.truncated_normal(shape, stddev=0.1))
        tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(regularizer)(w))
        return w

    def get_bias(self, shape):
        """Create a bias tensor initialised to ones."""
        return tf.Variable(tf.ones(shape))

    def batch_iter(self, x, y):
        """Yield shuffled (x, y) mini-batches of size self.batch_size;
        the last batch may be smaller."""
        data_len = len(x)
        num_batch = int((data_len - 1) / self.batch_size) + 1
        indices = np.random.permutation(np.arange(data_len))  # shuffle once per epoch
        x_shuffle = x[indices]
        y_shuffle = y[indices]
        for i in range(num_batch):
            start = i * self.batch_size
            end = min((i + 1) * self.batch_size, data_len)
            yield x_shuffle[start:end], y_shuffle[start:end]

    def model_1(self, x, is_train):
        """Forward pass using multiple convolution widths.

        Returns unscaled logits; dropout is applied only when is_train
        is True.
        """
        embedding_res = tf.nn.embedding_lookup(self.embedding, x)
        pool_list = []
        for filter_width in self.filter_width_list:
            # convolution over the time axis with this filter width
            conv_w = self.get_weight([filter_width, self.word_vec_dim, self.num_filters], 0.01)
            conv_b = self.get_bias([self.num_filters])
            conv = tf.nn.conv1d(embedding_res, conv_w, stride=1, padding='VALID')
            conv_res = tf.nn.relu(tf.nn.bias_add(conv, conv_b))
            # 1-max pooling: keep the strongest activation per filter
            pool_list.append(tf.reduce_max(conv_res, reduction_indices=[1]))
        pool_res = tf.concat(pool_list, 1)
        # first fully connected layer
        fc1_w = self.get_weight([self.num_filters * len(self.filter_width_list), self.fc1_size], 0.01)
        fc1_b = self.get_bias([self.fc1_size])
        fc1_res = tf.nn.relu(tf.matmul(pool_res, fc1_w) + fc1_b)
        if is_train:
            # CONSISTENCY FIX: use the configured dropout probability
            # instead of the hard-coded 0.5 (same value today).
            fc1_res = tf.nn.dropout(fc1_res, self.dropout_prob)
        # second fully connected layer
        fc2_w = self.get_weight([self.fc1_size, self.fc2_size], 0.01)
        fc2_b = self.get_bias([self.fc2_size])
        fc2_res = tf.nn.relu(tf.matmul(fc1_res, fc2_w) + fc2_b)
        if is_train:
            fc2_res = tf.nn.dropout(fc2_res, self.dropout_prob)
        # output layer (logits)
        fc3_w = self.get_weight([self.fc2_size, self.fc3_size], 0.01)
        fc3_b = self.get_bias([self.fc3_size])
        return tf.matmul(fc2_res, fc3_w) + fc3_b
# 训练
def train(cnn, X_train, y_train):
    """Build the training graph, resume from the latest checkpoint if any,
    and run mini-batch Adam optimisation for cnn.iter_num epochs.

    X_train / y_train must already be shaped to match the placeholders:
    int32 [None, cnn.text_length] and float32 [None, cnn.num_classer].
    """
    x = tf.placeholder(tf.int32, [None, cnn.text_length])
    y = tf.placeholder(tf.float32, [None, cnn.num_classer])
    y_pred = cnn.model_1(x, True)
    # global counter of optimisation steps taken so far
    global_step = tf.Variable(0, trainable=False)
    # softmax cross-entropy loss averaged over the batch
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y)
    loss = tf.reduce_mean(cross_entropy)
    train_step = tf.train.AdamOptimizer(learning_rate=cnn.learning_rate).minimize(loss, global_step=global_step)
    saver = tf.train.Saver()  # for saving/restoring variables
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # resume from the latest checkpoint when one is available
        ckpt = tf.train.get_checkpoint_state(cnn.model_save_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        for i in range(cnn.iter_num):
            batch_train = cnn.batch_iter(X_train, y_train)
            for x_batch, y_batch in batch_train:
                # BUG FIX: the original ran [loss, train_step] and unpacked
                # the result as (loss_value, step) — but a minimize() op
                # evaluates to None, so `step` was always None.  Run the op
                # together with the tensors we actually want back.
                _, loss_value, step = sess.run(
                    [train_step, loss, global_step],
                    feed_dict={x: x_batch, y: y_batch})
                print('After %d training step(s), loss on training batch is %g.' % (i, loss_value))
        saver.save(sess, os.path.join(cnn.model_save_path, cnn.model_name), global_step=global_step)
# 预测
def predict(cnn, X_test, y_test):
    """Restore the latest checkpoint and print test-set accuracy.

    X_test / y_test must match the placeholder shapes:
    int32 [None, cnn.text_length] and float32 [None, cnn.num_classer].
    """
    x = tf.placeholder(tf.int32, [None, cnn.text_length])
    y = tf.placeholder(tf.float32, [None, cnn.num_classer])
    y_pred = cnn.model_1(x, False)
    saver = tf.train.Saver()  # for restoring trained variables
    # fraction of samples whose argmax prediction matches the label
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_pred, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(cnn.model_save_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # BUG FIX: checkpoints saved with a global_step are named
            # "<model_name>-<step>", so the step number comes after the
            # last '-', not after a space as the original assumed.
            global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
            accuracy_score = sess.run(accuracy, feed_dict={x: X_test, y: y_test})
            print('After %s training step(s), test accuracy = %g' % (global_step, accuracy_score))
        else:
            print('No checkpoint file found')
            return
if __name__ == '__main__':
    df_train = pd.read_csv('new_data/train_set.csv', sep=',', header=0)
    df_test = pd.read_csv('new_data/test_set.csv', sep=',', header=0)
    word_seg = df_train['word_seg']
    label = df_train['class'] - 1  # shift 1-based labels to 0-based
    # hold out 10% of the labelled data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(word_seg, label, test_size=0.1, random_state=42)
    # NOTE(review): X_* still hold raw space-separated id strings and y_*
    # integer labels, but train()/predict() expect padded integer
    # sequences (int32 [None, text_length]) and one-hot labels
    # (float32 [None, num_classer]).  A vocabulary/padding + one-hot
    # encoding step is still missing before this pipeline will run.
    # BUG FIX: removed the original's dead local `text_length = 2000`,
    # which was never used and contradicted TextCNN.text_length (600).
    is_train = True
    cnn = TextCNN()
    if is_train:
        train(cnn, X_train, y_train)
    else:
        predict(cnn, X_test, y_test)
还没跑出结果,,,
6.Ensemble Learning-模型融合
通过对多个单模型融合以提升整体性能。
6.1 voting
投票制即为,投票多者为最终的结果。例如一个分类问题,多个模型投票(当然可以设置权重)。最终投票数最多的类为最终被预测的类。
目前单个模型的交叉验证后,的输出结果是采用这种投票的方式
将5折交叉验证输出的结果,采用投票策略计算出最后的预测值,并将结果保存:
# Majority vote across the per-fold test predictions: for each test
# sample, take the most frequent predicted class over all folds.
prediction = prediction.astype(np.int64)  # bincount requires integers
prediction_sub = [np.argmax(np.bincount(row)) for row in prediction]
# NOTE(review): `model_type` is only a parameter of train_model and is
# undefined at module level — set it (e.g. model_type = 'lr') before
# running this snippet, or the filename expression raises NameError.
pd.DataFrame(prediction_sub).to_csv(model_type + 'sub.csv', index=False)
6.2 Stacking
目前最好的结果是:(排行榜上的皇冠图标表示该队伍当前排名第一)
更多模型融合的方法见以下链接:
https://blog.csdn.net/shine19930820/article/details/75209021#1-ensemble-learning-模型融合
参考链接
https://blog.csdn.net/GreatXiang888/article/details/82873435#_17
https://www.cnblogs.com/pythoner6833/p/9585168.html
https://zhuanlan.zhihu.com/p/25928551
https://blog.csdn.net/zh11403070219/article/details/88388936