1.模型选取
模型选择上,我选取了Lightgbm、XGboost、CNN、RNN、LSTM模型,目前搭好了模型可以跑通。机器学习模型的准确率可以达到0.96;由于神经网络我只训练了一次,准确率没有机器学习高,但也达到了0.92。后期我会再研究模型,也非常乐意和大家一起讨论。先和大家分享下:我在github上更新了源码,大家可以关注下;如果要在本地跑通,需要按照readme修改数据集的位置。
1.1 XGboost
def build_xgb(train_X,train_y,valid_X,valid_y=None,subsample=0.75):
    """Wrap the arrays into XGBoost DMatrix objects and assemble booster params.

    Parameters
    ----------
    train_X, train_y : training features and binary labels.
    valid_X, valid_y : optional validation set; when valid_y is None no
        validation DMatrix is built.
    subsample : row AND column subsample ratio used per tree.

    Returns
    -------
    (xgtrain, xgvalid, model_params) where model_params is a list of
    (key, value) pairs ready for xgb.train; xgvalid is None when no
    validation labels were supplied.
    """
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    # Only materialise a validation matrix when labels are available.
    xgvalid = xgb.DMatrix(valid_X, label=valid_y) if valid_y is not None else None

    model_params = {
        'objective': 'binary:logistic',  # binary classification, probability output
        'eta': 0.3,                      # learning rate [default=0.3]
        'max_depth': 6,                  # deeper trees are more complex
        'silent': 1,                     # 1 = silent mode, 0 prints running messages
        'eval_metric': 'auc',
        'subsample': subsample,          # fraction of rows sampled per tree
        'colsample_bytree': subsample,   # fraction of columns sampled per tree
        'seed': 2019,                    # fixed seed for reproducibility
    }
    return xgtrain, xgvalid, list(model_params.items())
def train_xgboost(xgtrain,xgvalid,model_params,num_rounds=500,patience=20):
    """Train an XGBoost booster, with early stopping when a validation set exists.

    Parameters
    ----------
    xgtrain, xgvalid : DMatrix objects from build_xgb (xgvalid may be None).
    model_params : parameter list from build_xgb.
    num_rounds : maximum boosting rounds.
    patience : early-stopping rounds, used only when xgvalid is given.

    Returns
    -------
    The trained booster.
    """
    if xgvalid is None:
        # No validation data: train for the full round budget.
        return xgb.train(model_params, xgtrain, num_rounds)
    # Evaluation sets printed every round; the last entry ('test')
    # drives early stopping.
    watchlist = [(xgtrain, 'train'), (xgvalid, 'test')]
    return xgb.train(model_params, xgtrain, num_rounds, watchlist,
                     early_stopping_rounds=patience)
1.2 lightgbm
def build_lgb(train_X,train_y,valid_X,valid_y):
    """Wrap the arrays into LightGBM Datasets and assemble training params.

    Parameters
    ----------
    train_X, train_y : training features and binary labels.
    valid_X, valid_y : validation features and labels; the eval Dataset
        references the training Dataset so bin mappings stay consistent.

    Returns
    -------
    (lgb_train, lgb_eval, model_params)
    """
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(valid_X, valid_y, reference=lgb_train)
    # specify your configurations as a dict
    model_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metrics': {'binary_logloss', 'auc'},  # binary log loss + AUC
        'num_leaves': 5,
        'max_depth': 6,
        'min_data_in_leaf': 450,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'lambda_l1': 1,
        # NOTE: a LARGER lambda_l2 means STRONGER L2 regularization —
        # the original comment stated the opposite. 0.001 here is a
        # near-zero L2 penalty.
        'lambda_l2': 0.001,
        'min_gain_to_split': 0.2,
        'verbose': 5,
        'is_unbalance': True  # re-weight classes for the imbalanced labels
    }
    return lgb_train, lgb_eval, model_params
def train_lgb(lgb_train, lgb_eval, model_params):
    """Train a LightGBM booster with early stopping on the eval set.

    Runs up to 10k boosting rounds, stopping once the eval metric has
    not improved for 500 consecutive rounds. Returns the trained booster.
    """
    gbm = lgb.train(
        model_params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=lgb_eval,
        early_stopping_rounds=500,
    )
    return gbm
1.3 Attention
"""Attention Layer"""
class Attention(Layer):
    """Weighted-sum attention pooling over the time axis.

    Collapses a (batch, step_dim, features) sequence into a
    (batch, features) vector: each timestep receives a learned scalar
    score, scores are softmax-normalised across timesteps (respecting an
    optional mask), and the timestep vectors are summed with those weights.

    NOTE(review): add_weight is called with the shape as the first
    positional argument, which matches older Keras APIs; newer Keras
    requires shape= as a keyword — confirm against the installed version.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # step_dim: number of timesteps (the padded sequence length).
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input feature size is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # Expect (batch, steps, features).
        assert len(input_shape) == 3
        # One scoring weight per feature dimension.
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            # One bias term per timestep.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The time axis is summed out, so no mask propagates downstream.
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        # Per-timestep scalar score e_ij = x_ij . W, reshaped to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        # Manual softmax over timesteps so masked (padded) positions can be
        # zeroed out before normalisation.
        a = K.exp(eij)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        # epsilon guards against division by zero when a row is fully masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # (batch, features): the time axis has been pooled away.
        return input_shape[0], self.features_dim
1.4 CNN
"CNN Model"
def model_cnn(embedding_matrix):
    """Multi-window 2D-convolution text classifier with a sigmoid output.

    Builds one Conv2D+MaxPool branch per filter size over the embedded
    sequence, concatenates the pooled features, and classifies with a
    single sigmoid unit. Relies on module-level maxlen / max_features /
    embed_size. Returns the compiled Keras model.
    """
    filter_sizes = [1, 2, 3, 5]
    num_filters = 36

    inp = Input(shape=(maxlen,))
    emb = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    # Add a channel axis so Conv2D can slide over (token, embedding) windows.
    emb = Reshape((maxlen, embed_size, 1))(emb)

    pooled = []
    for size in filter_sizes:
        conv = Conv2D(num_filters, kernel_size=(size, embed_size),
                      kernel_initializer='he_normal', activation='elu')(emb)
        # Max-pool over every window position -> one vector per filter size.
        pooled.append(MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv))

    features = Concatenate(axis=1)(pooled)
    features = Flatten()(features)
    features = Dropout(0.1)(features)
    outp = Dense(1, activation="sigmoid")(features)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
1.5 LSTM
"LSTM models"
def model_lstm_attn(embedding_matrix):
    """Two stacked BiLSTMs followed by attention pooling; sigmoid output.

    Uses frozen pretrained embeddings and the module-level maxlen /
    max_features / embed_size constants. Returns the compiled Keras model.
    """
    inp = Input(shape=(maxlen,))
    # Embeddings stay fixed during training (trainable=False).
    seq = Embedding(max_features, embed_size,
                    weights=[embedding_matrix], trainable=False)(inp)
    seq = Bidirectional(LSTM(128, return_sequences=True))(seq)
    seq = Bidirectional(LSTM(64, return_sequences=True))(seq)
    # Collapse the timestep axis with learned attention weights.
    pooled = Attention(maxlen)(seq)
    hidden = Dense(64, activation="relu")(pooled)
    outp = Dense(1, activation="sigmoid")(hidden)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
2. 模型融合
对CNN和LSTM两个模型,我利用LR(线性回归)得到它们的融合系数,即LR的回归系数;然后用融合后的模型在验证集上计算最佳分类阈值。
2.1 计算融合系数
# --- 2.1 Fit per-model fusion weights on the validation predictions ---
# Train each base model for one epoch; train_pred_F1 returns the
# validation predictions, test predictions and best F1 score.
outputs=[]
pred_val_y,pred_test_y,best_score=train_pred_F1(model_cnn(embedding_word2vec_matrix),epochs=1)
outputs.append([pred_val_y,pred_test_y,best_score,'2d CNN']) # 0.912718204488778 2d CNN
pred_val_y,pred_test_y,best_score=train_pred_F1(model_lstm_attn(embedding_word2vec_matrix),epochs=1)
outputs.append([pred_val_y,pred_test_y,best_score,'2 LSTM x/ attention']) # 0.9282051282051282 2 LSTM x/ attention
# Sort models by score (ascending) and derive rank-based weights.
# NOTE(review): `weights` is computed but never used below — the
# linear-regression coefficients are what actually blend the models.
outputs.sort(key=lambda x:x[2])
weights=[i for i in range(1,len(outputs)+1)]
weights=[float(i)/sum(weights) for i in weights]
for output in outputs:
    print(output[2],output[3])
# Fit a linear regression from the stacked validation predictions to the
# true validation labels; its coefficients become the fusion weights.
from sklearn.linear_model import LinearRegression
X=np.asarray([outputs[i][0] for i in range(len(outputs))])
print(X.shape) # (2, 381, 1)
X=X[...,0] # keep only element 0 of the last axis: the per-sample validation prediction
print(X.shape) # (2, 381)
reg=LinearRegression().fit(X.T,val_y)
print(reg.score(X.T,val_y),reg.coef_) # 0.7642986141890105 [0.17936741 0.8483659 ]
2.2 计算阈值
# --- 2.2 Pick the classification threshold on the validation set ---
# Blend the validation predictions with the regression coefficients,
# then sweep thresholds in [0.10, 0.50] and keep the one with best F1.
pred_val_y = np.sum([coef * out[0] for coef, out in zip(reg.coef_, outputs)],
                    axis=0)
thresholds = []
for raw in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw, 2)
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
# Highest-F1 threshold wins.
thresholds.sort(key=lambda x: x[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)
2.3 测试集预测,保存结果并对预测效果做一展示
# --- 2.3 Blend test predictions, save the submission, report scores ---
# Weight each model's test predictions by its regression coefficient,
# then binarise with the threshold tuned on the validation set.
pred_test_y=np.sum([outputs[i][1]*reg.coef_[i] for i in range(len(outputs))],axis=0)
pred_test_y=(pred_test_y>best_thresh).astype(int)
# test_df=pd.read_csv("data/test.csv",usecols=["qid"])
out_df=pd.DataFrame({"id":test_df["id"].values})
print(len(out_df))
out_df['prediction']=pred_test_y
out_df.to_csv("submission.csv",index=False)
# pred_test_y is already 0/1 at this point, so the original's second
# `(pred_test_y > best_thresh)` was a redundant no-op (1 > thresh is
# always True and 0 > thresh always False for thresh in (0, 1)).
score_f1 = metrics.f1_score(test_y, pred_test_y)
print("score_f1 : ",score_f1)
score_acc = metrics.accuracy_score(test_y, pred_test_y)
print("score_acc : ",score_acc)
"""score_f1 : 0.9172932330827067
score_acc : 0.9142857142857143"""