import warnings
# Silence all warnings for the rest of the script (library deprecation and
# pandas chained-assignment warnings would otherwise clutter the output).
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
# Load the training and test tables from disk.
train = pd.read_csv('data/train_set.csv')
test = pd.read_csv('data/test_set.csv')

# Stack both tables so that categorical encoding is applied to them jointly.
data = pd.concat([train, test])

# Feature columns: every training column except the identifier and the label.
feature = train.columns.tolist()
feature.remove('ID')
feature.remove('y')

# Partition the features by dtype.  Both lists are derived from `feature`
# (not from all of `train.columns`) so that 'ID' and 'y' can never be picked
# up as categorical columns, and both keep the original column order so the
# output is reproducible from run to run.
object_columns = [col for col in feature if train[col].dtype == 'object']
num_columns = [col for col in feature if col not in object_columns]
print(object_columns)
print(num_columns)
# One-hot encode every categorical column: the indicator columns are appended
# to the right of the frame and the original column is removed.
for col in object_columns:
    dummies = pd.get_dummies(data[col], prefix=col + '_')
    # Rebind instead of dropping in place; the resulting column order is the
    # same as "concat, then drop".
    data = pd.concat([data.drop(columns=col), dummies], axis=1)
# Rows with a label form the training set; rows whose 'y' is missing form the
# set to be predicted.
has_label = data['y'].notnull()
X_train = data[has_label]
X_test = data[~has_label]

# Keep the label and the test identifiers before they are removed from the
# feature matrices.
y_train = X_train['y']
result = pd.DataFrame({'ID': X_test['ID']})

# `drop` without `inplace` returns new frames, so later column assignments
# operate on independent objects rather than on filtered slices of `data`.
X_train = X_train.drop(columns=['ID', 'y'])
X_test = X_test.drop(columns=['ID', 'y'])
# Standardize the numeric columns.  The scaler is fitted on the training rows
# only and then applied unchanged to both the training and the test frame.
scaler = StandardScaler().fit(X_train[num_columns])
for frame in (X_train, X_test):
    frame[num_columns] = scaler.transform(frame[num_columns])

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
)
# Wrap the splits in LightGBM's Dataset format.
lgb_train = lgb.Dataset(X_train, y_train)
# The validation set references the training set (`reference=lgb_train`).
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)
# Training parameters, written as a dictionary.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'auc'},
    'learning_rate': 0.01,
    'is_unbalance': True,
    'random_state': 0,
    'verbose': 0,
}
print('Start training…')
# Train for at most 10000 rounds, stopping once the validation metric has not
# improved for 100 consecutive rounds, and log the metric every 100 rounds.
# NOTE: the `early_stopping_rounds` and `verbose_eval` keyword arguments of
# `lgb.train` were removed in LightGBM 4.0; the callbacks below are the
# equivalent replacement (`lgb.log_evaluation` requires LightGBM >= 3.3).
clf = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)
print('Start predicting…')
# Predict on the validation set.  Because early stopping was enabled during
# training, `best_iteration` is used so the prediction comes from the best
# iteration.
y_pred = clf.predict(X_val, num_iteration=clf.best_iteration)
# Evaluate the model: ROC AUC on the validation set.
print(roc_auc_score(y_val, y_pred))
# Score the unlabelled rows.  `num_iteration` is passed explicitly so this
# call reads the same way as the validation prediction.
y_test = clf.predict(X_test, num_iteration=clf.best_iteration)
result['pred'] = y_test
print(result.head())

# Write the (ID, pred) table without the DataFrame index.
result.to_csv('data/submission.csv', index=False)