# _*_coding:utf-8 _*_
# @project:py_project
# @name:7372
# @date:2022/2/13 11:07
# @Author:Ly
import pandas as pd
import os
import numpy as np
import xgboost as xgb
from sklearn.metrics import recall_score,accuracy_score,roc_auc_score,f1_score
from sklearn.model_selection import train_test_split
# Show wide DataFrames in full for the debugging prints below.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 500)

# Load the raw sample sheet.
# NOTE(review): hard-coded local path — move to a config/CLI argument before reuse.
df = pd.read_excel(r'C:\Users\Administrator\Desktop\7372-1.xlsx', sheet_name='Sheet1')

# One-hot encode the map id. `prefix`/`prefix_sep` produce columns named
# 'mapid_<value>', replacing the previous manual rename step.
mapid = pd.get_dummies(df['map_id'], prefix='mapid', prefix_sep='_')

# Drop identifier / non-feature columns; 'map_id' itself is replaced by its
# dummy columns. (The redundant `axis=1` that `columns=` already implies was removed.)
data = df.drop(columns=['informat_accont_id', 'fintime', 'gameseq',
                        'rankinggame', 'kartid', 'map_id'])
data_new = pd.concat([mapid, data], axis=1)

# Split into features and target. NOTE(review): 'lable' is the label column's
# spelling in the source sheet — keep as-is, it is data, not code.
x = data_new.loc[:, data_new.columns != "lable"]
fea_imp = x.columns  # feature names, reused when building the DMatrix objects
y = data_new.loc[:, 'lable']
# Hold out 10% of all rows for validation, then 10% of the remainder for the
# final test set; `stratify` keeps the class ratio identical in every split.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, random_state=10, test_size=0.1, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, random_state=10, test_size=0.1, stratify=y_train)
# DMatrix (and the sklearn metrics further down) accept 1-D label vectors
# directly — the previous `.reshape(-1, 1)` was unnecessary and makes sklearn
# emit a DataConversionWarning about column-vector labels.
feature_names = list(fea_imp)  # DMatrix expects a list, not a pandas Index
dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=feature_names)
dvalid = xgb.DMatrix(x_valid, label=y_valid, feature_names=feature_names)
dtest = xgb.DMatrix(x_test, label=y_test, feature_names=feature_names)
num_round = 100  # upper bound on boosting rounds (early stopping may cut it short)
# Early stopping monitors the LAST entry of the watchlist, i.e. the validation set.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# Booster hyper-parameters for xgb.train. NOTE: the low-level training API's
# defaults differ from the sklearn-style XGBClassifier wrapper, hence the
# explicit settings here.
param = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 3,
    'learning_rate': 0.1,
    'gamma': 0,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'random_state': 2,
}
# Boost for at most `num_round` rounds, stopping early once the validation AUC
# has failed to improve for 10 consecutive rounds.
xg = xgb.train(param, dtrain, num_round, evals=watchlist, early_stopping_rounds=10)
# y_true / y_pred are 0-1 labels.
from sklearn.metrics import precision_score

# Predicted positive-class probabilities on the held-out test set.
y_prob_pred = xg.predict(dtest)
y_true = y_test
# The original script duplicated the whole evaluation for each threshold;
# collapsed into one loop — the printed output is identical.
for threshold in (0.6, 0.65):
    # Binarize the probabilities at the current cut-off.
    y_pred = [1 if p > threshold else 0 for p in y_prob_pred]
    print(f'阈值为:{threshold}的precision_score---->:', precision_score(y_true, y_pred, average='macro'))
    print(f'阈值为:{threshold}的recall_score------->:', recall_score(y_true, y_pred, average='macro'))
    print(f'阈值为:{threshold}的f1_score----------->:', f1_score(y_true, y_pred, average='macro'))
    print(f'阈值为:{threshold}的accuracy_score----->:', accuracy_score(y_true, y_pred))
    # AUC is computed from the probabilities, so it is actually
    # threshold-independent; it is reprinted per block to match the old output.
    xgb_auc = roc_auc_score(y_true, y_prob_pred)
    print(f'阈值为:{threshold}的xgb_auc------------>:', xgb_auc)
# Feature importances from the trained booster:
#   'gain'   — average loss reduction gained when splitting on the feature;
#   'weight' — number of times the feature is used in a split.
# Hoisted: the original called get_score() twice per DataFrame line.
gain_scores = xg.get_score(importance_type='gain')
weight_scores = xg.get_score(importance_type='weight')
im1 = pd.DataFrame({'importance': gain_scores.values(), 'var': gain_scores.keys()})
im2 = pd.DataFrame({'importance': weight_scores.values(), 'var': weight_scores.keys()})
im = im1.sort_values(by='importance', ascending=False)
print(im)
im = im2.sort_values(by='importance', ascending=False)
print(im)
# --- Page metadata from the original blog post (not part of the script) ---
# Title: XGBoost example
# First published: 2022-02-13 22:30:15
# Summary: This article walks through a practical machine-learning example
# using the Python XGBoost library, covering data preprocessing, model
# training, parameter tuning and model evaluation, to help readers understand
# how XGBoost works and how to apply it.
# Views: 1255
# (Collapsed reader comments omitted.)