今日任务--探索新的模型LightGBM
导入第三方库
# Library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm
from sklearn.model_selection import KFold
from lightgbm import log_evaluation, early_stopping
# Silence library warnings (pandas / sklearn deprecation noise)
import warnings
warnings.filterwarnings('ignore')
# Matplotlib setup so Chinese characters render correctly in plot labels
plt.rcParams['font.sans-serif'] = ['SimSun', 'Times New Roman']
plt.rcParams['font.size'] = 10
plt.rcParams['axes.unicode_minus'] = False
数据预处理
# Load the competition train/test CSV files.
# NOTE(review): hard-coded local Windows path — adjust for your environment.
path = 'E:/编程/用户新增预测大赛'
train_data = pd.read_csv(path + '/train.csv')
test_data = pd.read_csv(path + '/test.csv')
特征工程
# Expand the udmap field into 9 numeric columns (manual one-hot-style split)
def udmap_onethot(d):
    """Expand one raw ``udmap`` string into a length-9 numeric vector.

    ``d`` is either the literal string ``'unknown'`` or the repr of a dict
    mapping some of ``'key1'``..``'key9'`` to numeric values.  Position
    ``i - 1`` of the returned array holds the value of ``'key{i}'``, or 0
    when that key is absent (or the whole field is ``'unknown'``).
    """
    import ast  # local import: keeps the file's top-level import block untouched

    v = np.zeros(9)
    if d == 'unknown':
        return v
    # ast.literal_eval only accepts Python literals — unlike eval(), it
    # cannot execute arbitrary code embedded in the data file.
    d = ast.literal_eval(d)
    for i in range(1, 10):
        key = 'key' + str(i)
        if key in d:
            v[i - 1] = d[key]
    return v
# Apply the udmap expansion row-wise and name the 9 resulting columns key1..key9
train_udmap_df = pd.DataFrame(np.vstack(train_data['udmap'].apply(udmap_onethot)))
test_udmap_df = pd.DataFrame(np.vstack(test_data['udmap'].apply(udmap_onethot)))
train_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
test_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
# Concatenate the expanded udmap features onto the original frames
train_data = pd.concat([train_data, train_udmap_df], axis=1)
test_data = pd.concat([test_data, test_udmap_df], axis=1)
# eid frequency feature — test rows are mapped with counts computed on train,
# so unseen eids in test become NaN (filled with 0 later)
train_data['eid_freq'] = train_data['eid'].map(train_data['eid'].value_counts())
test_data['eid_freq'] = test_data['eid'].map(train_data['eid'].value_counts())
# eid target-encoding features: per-eid mean/std of the train-set target
train_data['eid_mean'] = train_data['eid'].map(train_data.groupby('eid')['target'].mean())
test_data['eid_mean'] = test_data['eid'].map(train_data.groupby('eid')['target'].mean())
train_data['eid_std'] = train_data['eid'].map(train_data.groupby('eid')['target'].std())
test_data['eid_std'] = test_data['eid'].map(train_data.groupby('eid')['target'].std())
# Frequency and target-mean features for x1..x8
# (test columns are always mapped with statistics computed on the train set)
for i in range(1, 9):
    train_data['x' + str(i) + '_freq'] = train_data['x' + str(i)].map(train_data['x' + str(i)].value_counts())
    test_data['x' + str(i) + '_freq'] = test_data['x' + str(i)].map(train_data['x' + str(i)].value_counts())
    train_data['x' + str(i) + '_mean'] = train_data['x' + str(i)].map(train_data.groupby('x' + str(i))['target'].mean())
    test_data['x' + str(i) + '_mean'] = test_data['x' + str(i)].map(train_data.groupby('x' + str(i))['target'].mean())
# Frequency and target-mean features for key1..key9 (from the expanded udmap)
for i in range(1, 10):
    train_data['key'+str(i)+'_freq'] = train_data['key'+str(i)].map(train_data['key'+str(i)].value_counts())
    test_data['key'+str(i)+'_freq'] = test_data['key'+str(i)].map(train_data['key'+str(i)].value_counts())
    train_data['key'+str(i)+'_mean'] = train_data['key'+str(i)].map(train_data.groupby('key'+str(i))['target'].mean())
    test_data['key'+str(i)+'_mean'] = test_data['key'+str(i)].map(train_data.groupby('key'+str(i))['target'].mean())
# Categories unseen in train (and std of singleton groups) produce NaN — fill with 0
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)
# Time features: common_ts is an epoch timestamp in milliseconds
train_data['common_ts'] = pd.to_datetime(train_data['common_ts'], unit='ms')
test_data['common_ts'] = pd.to_datetime(test_data['common_ts'], unit='ms')
# Extract calendar components: hour, day-of-month, minute, second, day-of-week
train_data['common_ts_hour'] = train_data['common_ts'].dt.hour
test_data['common_ts_hour'] = test_data['common_ts'].dt.hour
train_data['common_ts_day'] = train_data['common_ts'].dt.day
test_data['common_ts_day'] = test_data['common_ts'].dt.day
train_data['common_ts_minute'] = train_data['common_ts'].dt.minute
test_data['common_ts_minute'] = test_data['common_ts'].dt.minute
train_data['common_ts_second'] = train_data['common_ts'].dt.second
test_data['common_ts_second'] = test_data['common_ts'].dt.second
train_data['common_ts_dayofweek'] = train_data['common_ts'].dt.dayofweek
test_data['common_ts_dayofweek'] = test_data['common_ts'].dt.dayofweek
# Binary flag: the udmap field was missing ('unknown') for this row
train_data['udmap_isunknown'] = (train_data['udmap'] == 'unknown').astype(int)
test_data['udmap_isunknown'] = (test_data['udmap'] == 'unknown').astype(int)
# Feature selection: drop weak engineered features (chosen after an earlier
# feature-importance review), the raw columns they were derived from, and
# non-feature fields (ids, raw timestamp, label).  The list is shared between
# train and test instead of being duplicated verbatim.
_drop_cols = [
    'x3_freq', 'x6_freq', 'x8_freq',
    'x3_mean', 'x6_mean', 'x8_mean',
    'key4_freq', 'key5_freq', 'key7_freq', 'key8_freq', 'key9_freq',
    'key4_mean', 'key5_mean', 'key7_mean', 'key8_mean', 'key9_mean',
    'udmap', 'common_ts', 'uuid',
    'x3', 'x6', 'x8',
    'eid',
    'key4', 'key5', 'key6', 'key7', 'key8', 'key9']
# 'target' exists only in the train frame, so it is dropped there alone.
train = train_data.drop(_drop_cols + ['target'], axis=1)
test = test_data.drop(_drop_cols, axis=1)
train_label = train_data['target']
值得一提的是,经过笔记(五)的特征重要性可视化后,发现时间特征的重要性比重较大,于是增大提取量,共得到秒、分钟、小时、天、星期五种特征。
构建LightGBM模型
# Train LightGBM with k-fold cross-validation; collect one test-set
# prediction per fold.
def select_by_lgb(train_data, train_label, test_data, random_state=2023, n_splits=5, metric='auc', num_round=9995):
    """Train LightGBM with ``n_splits``-fold CV and predict ``test_data``.

    Parameters
    ----------
    train_data : pd.DataFrame — training features (default integer index).
    train_label : pd.Series — targets aligned with ``train_data``.
    test_data : pd.DataFrame — test features to predict each fold.
    random_state : int — seed for the KFold shuffle.
    n_splits : int — number of CV folds.
    metric : str — LightGBM evaluation metric name.
    num_round : int — max boosting rounds (early stopping trims this).

    Returns
    -------
    list of ``n_splits`` numpy arrays, each a prediction for ``test_data``.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Hyperparameters are fold-invariant, so build them once outside the loop.
    params = {
        'boosting_type': 'gbdt',
        # NOTE(review): objective 'regression' paired with metric 'auc' works
        # for a 0/1 target, but 'binary' would be the conventional objective —
        # changing it alters the trained model, so confirm before touching.
        'objective': 'regression',
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'learning_rate': 0.1,
        'metric': metric,
        'seed': 2023,
        'nthread': -1,
        'verbose': -1
    }
    result = []
    for train_idx, val_idx in kfold.split(train_data):
        # .loc works here because KFold yields positional labels that match
        # the default RangeIndex of the input frames.
        train_matrix = lightgbm.Dataset(train_data.loc[train_idx], label=train_label.loc[train_idx])
        val_matrix = lightgbm.Dataset(train_data.loc[val_idx], label=train_label.loc[val_idx])
        callbacks = [log_evaluation(period=500), early_stopping(stopping_rounds=500)]
        model = lightgbm.train(params, train_matrix, num_round,
                               valid_sets=val_matrix, callbacks=callbacks)
        result.append(model.predict(test_data))
    return result
参数已调好,无须改动。
训练模型
# Run the 5-fold CV training; each element of the returned list is one
# fold's predicted probabilities for the test set.
test_data = select_by_lgb(train, train_label, test)
# One column per fold (folds as columns, test rows as rows)
pre_y = pd.DataFrame(test_data).T
# Blend by averaging the 5 fold predictions (other blends are possible)
pre_y['average'] = pre_y[[i for i in range(5)]].mean(axis=1)
# The competition requires a hard 0/1 label, so threshold the averaged
# probability at 0.5 (> 0.5 -> positive class).
pre_y['label'] = pre_y['average'].apply(lambda x: 1 if x > 0.5 else 0)
将训练集特征、标签(目标)字段、测试集特征依次输入即可。
结果写入
# Load the submission template file
result = pd.read_csv('submit.csv')
# Write the predicted labels into the target column
result['target'] = pre_y['label']
# Save the final submission (no index column)
result.to_csv('submit.csv', index=False)
视目标字段名自行更改。
结果提交
分数较决策树模型有了约0.03分的提升。