1.模型选取
模型选择上,我选取了Lightgbm、XGboost、CNN、RNN、LSTM模型,目前搭好了模型可以跑通。机器学习模型的准确率可以达到0.96;由于神经网络我只训练了一次,准确率没有机器学习高,但也达到了0.92。后期我会再研究模型,也非常乐意和大家一起讨论。先和大家分享下:我在github上更新了源码,大家可以关注下;如果要在本地跑通,需要按照readme修改数据集的位置。
1.1 XGboost
def build_xgb(train_X,train_y,valid_X,valid_y=None,subsample=0.75):
    """Wrap the arrays into XGBoost DMatrix objects and assemble booster params.

    Parameters
    ----------
    train_X, train_y : training features and binary labels.
    valid_X, valid_y : optional validation set; when valid_y is None no
        validation DMatrix is built.
    subsample : row AND column subsample ratio used per tree.

    Returns
    -------
    (xgtrain, xgvalid, model_params) where model_params is a list of
    (key, value) pairs ready for xgb.train; xgvalid is None when no
    validation labels were supplied.
    """
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    # Only materialise a validation matrix when labels are available.
    xgvalid = xgb.DMatrix(valid_X, label=valid_y) if valid_y is not None else None

    model_params = {
        'objective': 'binary:logistic',  # binary classification, probability output
        'eta': 0.3,                      # learning rate [default=0.3]
        'max_depth': 6,                  # deeper trees are more complex
        'silent': 1,                     # 1 = silent mode, 0 prints running messages
        'eval_metric': 'auc',
        'subsample': subsample,          # fraction of rows sampled per tree
        'colsample_bytree': subsample,   # fraction of columns sampled per tree
        'seed': 2019,                    # fixed seed for reproducibility
    }
    return xgtrain, xgvalid, list(model_params.items())
def train_xgboost(xgtrain,xgvalid,model_params,num_rounds=500,patience=20):
    """Train an XGBoost booster, with early stopping when a validation set exists.

    Parameters
    ----------
    xgtrain, xgvalid : DMatrix objects from build_xgb (xgvalid may be None).
    model_params : parameter list from build_xgb.
    num_rounds : maximum boosting rounds.
    patience : early-stopping rounds, used only when xgvalid is given.

    Returns
    -------
    The trained booster.
    """
    if xgvalid is None:
        # No validation data: train for the full round budget.
        return xgb.train(model_params, xgtrain, num_rounds)
    # Evaluation sets printed every round; the last entry ('test')
    # drives early stopping.
    watchlist = [(xgtrain, 'train'), (xgvalid, 'test')]
    return xgb.train(model_params, xgtrain, num_rounds, watchlist,
                     early_stopping_rounds=patience)
1.2 lightgbm
def build_lgb(train_X,train_y,valid_X,valid_y):
    """Wrap the arrays into LightGBM Datasets and assemble training params.

    Parameters
    ----------
    train_X, train_y : training features and binary labels.
    valid_X, valid_y : validation features and labels; the eval Dataset
        references the training Dataset so bin mappings stay consistent.

    Returns
    -------
    (lgb_train, lgb_eval, model_params)
    """
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(valid_X, valid_y, reference=lgb_train)
    # specify your configurations as a dict
    model_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metrics': {'binary_logloss', 'auc'},  # binary log loss + AUC
        'num_leaves': 5,
        'max_depth': 6,
        'min_data_in_leaf': 450,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5,
        'lambda_l1': 1,
        # NOTE: a LARGER lambda_l2 means STRONGER L2 regularization —
        # the original comment stated the opposite. 0.001 here is a
        # near-zero L2 penalty.
        'lambda_l2': 0.001,
        'min_gain_to_split': 0.2,
        'verbose': 5,
        'is_unbalance': True  # re-weight classes for the imbalanced labels
    }
    return lgb_train, lgb_eval, model_params
def train_lgb(lgb_train, lgb_eval, model_params):
    """Train a LightGBM booster with early stopping on the eval set.

    Runs up to 10k boosting rounds, stopping once the eval metric has
    not improved for 500 consecutive rounds. Returns the trained booster.
    """
    gbm = lgb.train(
        model_params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=lgb_eval,
        early_stopping_rounds=500,
    )
    return gbm
1.3 Attention
"""Attention Layer"""
class Attention(Layer):
    """Weighted-sum attention pooling over the time axis.

    Collapses a (batch, step_dim, features) sequence into a
    (batch, features) vector: each timestep receives a learned scalar
    score, scores are softmax-normalised across timesteps (respecting an
    optional mask), and the timestep vectors are summed with those weights.

    NOTE(review): add_weight is called with the shape as the first
    positional argument, which matches older Keras APIs; newer Keras
    requires shape= as a keyword — confirm against the installed version.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # step_dim: number of timesteps (the padded sequence length).
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input feature size is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # Expect (batch, steps, features).
        assert len(input_shape) == 3
        # One scoring weight per feature dimension.
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            # One bias term per timestep.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The time axis is summed out, so no mask propagates downstream.
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        # Per-timestep scalar score e_ij = x_ij . W, reshaped to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        # Manual softmax over timesteps so masked (padded) positions can be
        # zeroed out before normalisation.
        a = K.exp(eij)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        # epsilon guards against division by zero when a row is fully masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        # (batch, features): the time axis has been pooled away.
        return input_shape[0], self.features_dim
1.4 CNN
"CNN Model"
def model_cnn(embedding_matrix):
    """Multi-window 2D-convolution text classifier with a sigmoid output.

    Builds one Conv2D+MaxPool branch per filter size over the embedded
    sequence, concatenates the pooled features, and classifies with a
    single sigmoid unit. Relies on module-level maxlen / max_features /
    embed_size. Returns the compiled Keras model.
    """
    filter_sizes = [1, 2, 3, 5]
    num_filters = 36

    inp = Input(shape=(maxlen,))
    emb = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    # Add a channel axis so Conv2D can slide over (token, embedding) windows.
    emb = Reshape((maxlen, embed_size, 1))(emb)

    pooled = []
    for size in filter_sizes:
        conv = Conv2D(num_filters, kernel_size=(size, embed_size),
                      kernel_initializer='he_normal', activation='elu')(emb)
        # Max-pool over every window position -> one vector per filter size.
        pooled.append(MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv))

    features = Concatenate(axis=1)(pooled)
    features = Flatten()(features)
    features = Dropout(0.1)(features)
    outp = Dense(1, activation="sigmoid")(features)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
1.5 LSTM
"LSTM models"
def model_lstm_attn(embedding_matrix):
    """Two stacked BiLSTMs followed by attention pooling; sigmoid output.

    Uses frozen pretrained embeddings and the module-level maxlen /
    max_features / embed_size constants. Returns the compiled Keras model.
    """
    inp = Input(shape=(maxlen,))
    # Embeddings stay fixed during training (trainable=False).
    seq = Embedding(max_features, embed_size,
                    weights=[embedding_matrix], trainable=False)(inp)
    seq = Bidirectional(LSTM(128, return_sequences=True))(seq)
    seq = Bidirectional(LSTM(64, return_sequences=True))(seq)
    # Collapse the timestep axis with learned attention weights.
    pooled = Attention(maxlen)(seq)
    hidden = Dense(64, activation="relu")(pooled)
    outp = Dense(1, activation="sigmoid")(hidden)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model
2. 模型融合
对CNN和LSTM两个模型,我利用LR(线性回归)得到它们的融合系数,即LR的回归系数;然后用融合后的模型在验证集上计算最佳分类阈值。
2.1 计算融合系数
# --- 2.1 Fit per-model fusion weights on the validation predictions ---
# Train each base model for one epoch; train_pred_F1 returns the
# validation predictions, test predictions and best F1 score.
outputs=[]
pred_val_y,pred_test_y,best_score=train_pred_F1(model_cnn(embedding_word2vec_matrix),epochs=1)
outputs.append([pred_val_y,pred_test_y,best_score,'2d CNN']) # 0.912718204488778 2d CNN
pred_val_y,pred_test_y,best_score=train_pred_F1(model_lstm_attn(embedding_word2vec_matrix),epochs=1)
outputs.append([pred_val_y,pred_test_y,best_score,'2 LSTM x/ attention']) # 0.9282051282051282 2 LSTM x/ attention
# Sort models by score (ascending) and derive rank-based weights.
# NOTE(review): `weights` is computed but never used below — the
# linear-regression coefficients are what actually blend the models.
outputs.sort(key=lambda x:x[2])
weights=[i for i in range(1,len(outputs)+1)]
weights=[float(i)/sum(weights) for i in weights]
for output in outputs:
    print(output[2],output[3])
# Fit a linear regression from the stacked validation predictions to the
# true validation labels; its coefficients become the fusion weights.
from sklearn.linear_model import LinearRegression
X=np.asarray([outputs[i][0] for i in range(len(outputs))])
print(X.shape) # (2, 381, 1)
X=X[...,0] # keep only element 0 of the last axis: the per-sample validation prediction
print(X.shape) # (2, 381)
reg=LinearRegression().fit(X.T,val_y)
print(reg.score(X.T,val_y),reg.coef_) # 0.7642986141890105 [0.17936741 0.8483659 ]
2.2 计算阈值
# --- 2.2 Pick the classification threshold on the validation set ---
# Blend the validation predictions with the regression coefficients,
# then sweep thresholds in [0.10, 0.50] and keep the one with best F1.
pred_val_y = np.sum([coef * out[0] for coef, out in zip(reg.coef_, outputs)],
                    axis=0)
thresholds = []
for raw in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw, 2)
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
# Highest-F1 threshold wins.
thresholds.sort(key=lambda x: x[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)
2.3 测试集预测,保存结果并对预测效果做一展示
# --- 2.3 Blend test predictions, save the submission, report scores ---
# Weight each model's test predictions by its regression coefficient,
# then binarise with the threshold tuned on the validation set.
pred_test_y=np.sum([outputs[i][1]*reg.coef_[i] for i in range(len(outputs))],axis=0)
pred_test_y=(pred_test_y>best_thresh).astype(int)
# test_df=pd.read_csv("data/test.csv",usecols=["qid"])
out_df=pd.DataFrame({"id":test_df["id"].values})
print(len(out_df))
out_df['prediction']=pred_test_y
out_df.to_csv("submission.csv",index=False)
# pred_test_y is already 0/1 at this point, so the original's second
# `(pred_test_y > best_thresh)` was a redundant no-op (1 > thresh is
# always True and 0 > thresh always False for thresh in (0, 1)).
score_f1 = metrics.f1_score(test_y, pred_test_y)
print("score_f1 : ",score_f1)
score_acc = metrics.accuracy_score(test_y, pred_test_y)
print("score_acc : ",score_acc)
"""score_f1 : 0.9172932330827067
score_acc : 0.9142857142857143"""