1. 训练lightgbm模型
1.1 参数定义
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import datetime
import matplotlib.pyplot as plt
# Features fed to the model
feature_list = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5']
# Categorical features (must be a subset of feature_list)
category_feature_list = ['feature1', 'feature2']
# LightGBM training parameters
params = {
'boosting_type': 'gbdt',
#'boosting': 'dart',
'objective': 'binary',
'metric': {'binary_logloss', 'auc'},
# 'metric': 'auc',
'learning_rate': 0.01,
'num_leaves': 10,
'max_depth': 5,
'max_bin': 10,
'min_data_in_leaf': 8,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'lambda_l1': 0,
'lambda_l2': 75,  # strong L2 regularization
'min_split_gain': 0,
'boost_from_average': False,
'is_unbalance': True,  # reweight classes for an imbalanced label
# NOTE(review): 'num_trees' is an alias of num_boost_round and likely
# overrides the num_boost_round=100 passed to lgb.train below, capping
# training at a single tree — confirm this is intended.
'num_trees': 1,
'verbose': 0
}
1.2 数据预处理
# Load the training table; round all floats to 4 decimals
data = pd.read_csv('data.tsv', sep='\t').round(decimals=4)
# Feature matrix / label vector as plain numpy arrays
X,y = data[feature_list].values, data['label'].values
# X, y = data.loc[:, feature_list], data.loc[:, 'label']
# 90% train / 10% held out, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
# Split the held-out 10% evenly into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)
# Wrap the arrays in LightGBM Datasets (eval references train for consistent binning).
# NOTE(review): X_train is a bare numpy array, so the string names in
# categorical_feature may not resolve to columns — verify LightGBM accepts
# them here, or pass the DataFrame / column indices instead.
lgb_train = lgb.Dataset(data=X_train, label=y_train, categorical_feature=category_feature_list)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=category_feature_list)
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
1.3 模型训练
# Train the booster, early-stopping on the validation set.
# NOTE(review): the early_stopping_rounds keyword was removed in
# LightGBM >= 4.0 — there it must be callbacks=[lgb.early_stopping(50)].
gbm = lgb.train(params,
train_set=lgb_train,
valid_sets=lgb_eval,
# valid_sets=[lgb_train, lgb_eval]
# categorical_feature=category_feature_list,
num_boost_round=100,
early_stopping_rounds=50)
print(gbm.best_iteration) # best (early-stopped) iteration count
gbm.save_model('../models/gbm.model')
或者：使用 sklearn 接口（LGBMClassifier）的等价训练方式如下
def lgb_test(train_x, train_y, test_x, test_y):
    """Train an LGBMClassifier (sklearn API) and report validation AUC.

    Parameters
    ----------
    train_x, train_y : training features / labels.
    test_x, test_y   : evaluation features / labels (tracked as 'valid_1').

    Returns
    -------
    The fitted LGBMClassifier.
    """
    from multiprocessing import cpu_count
    model = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        # NOTE(review): max_features is not a LightGBM parameter (it is only
        # swallowed by **kwargs), and num_iterations is an alias of
        # n_estimators — 800 is specified twice. Confirm both are intended.
        max_depth=2, n_estimators=800, max_features=140, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=None,
        n_jobs=cpu_count() - 1,
        num_iterations=800  # number of boosting iterations
    )
    # NOTE(review): early_stopping_rounds was removed from fit() in
    # LightGBM >= 4.0; there use callbacks=[lgb.early_stopping(100)].
    model.fit(train_x, train_y,
              eval_set=[(train_x, train_y), (test_x, test_y)],
              eval_metric='auc',
              early_stopping_rounds=100)
    print(model.n_features_)
    # Bug fix: the original printed clf.best_score_, but `clf` is undefined
    # in this scope — the fitted estimator is `model`.
    print(model.best_score_['valid_1']['auc'])
    return model
1.4 其他
1. 给样本增加权重
2. 计算woe
3. 决策树可视化
2. 对测试集打分
import pandas as pd
import lightgbm as lgb
import sys
THRESHOLD = 0.28  # score cutoff: rows scoring below this are dropped

# Command-line arguments
data_input_path = sys.argv[1]   # input data path (TSV)
data_output_path = sys.argv[2]  # output path (TSV)
model_file_path = sys.argv[3]   # trained model file

# 1. Load the test data
test_data = pd.read_csv(data_input_path, sep='\t')
# 2. Load the trained booster from disk
lgb_model = lgb.Booster(model_file=model_file_path)
# 3. Feature selection: every column containing 'open_' plus 'up_cnt'
feature_names = [item for item in test_data.columns if 'open_' in item] + ['up_cnt']
# 4. Score every row with the model
test_data['score'] = lgb_model.predict(test_data.loc[:, feature_names])
# y_preds = lgb_model.predict(test_data.loc[:, feature_names], num_iteration=gbm.best_iteration)
# y_test = y_test.astype(np.float64)
# print('log_loss', metrics.log_loss(y_test, y_preds))
# 5. Columns to keep in the output
keep_cols = ['phone', 'score', 'if_333']
# 6. Keep only rows at or above the score threshold
result = test_data.loc[test_data['score'] >= THRESHOLD, keep_cols]
# 7. Shuffle the rows (fixed seed for reproducibility)
result = result.sample(frac=1, random_state=1024)
# 8. Keep only rows where if_333 == 0
result = result[result['if_333'] == 0]
# print(result.score.min())
# 9. Save scores to TSV without index or header.
# Idiom fix: index/header are boolean parameters — pass False, not 0.
result.to_csv(data_output_path, index=False, header=False, sep='\t', float_format='%.4f')