1、导入数据
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, confusion_matrix, precision_recall_curve
import seaborn as sns
import matplotlib.pyplot as plt
# Load the raw dataset (the file is read as GBK-encoded text).
# A forward slash is used as the separator: the original "data\diabetes.csv"
# relied on the invalid escape sequence "\d" (a warning on current Python
# versions) and only resolved to the intended file on Windows.
data = pd.read_csv("data/diabetes.csv", encoding='gbk')
data  # notebook-style display of the frame; a no-op when run as a script
2、数据清洗
# Clean the raw frame in three steps, keeping every intermediate result.
data_dup = data.drop_duplicates()  # drop exact duplicate rows
# Drop rows whose target value (Outcome) is missing.
data_delete1 = data_dup[data_dup['Outcome'].notnull()]
# Drop rows whose SkinThickness holds the placeholder string 'unknown'.
# The mask is built from data_delete1 itself so that its index matches the
# frame being filtered, and .copy() makes data_delete an independent frame
# so that later column assignments do not write into a slice of data_dup.
data_delete = data_delete1[data_delete1['SkinThickness'] != 'unknown'].copy()
data_delete  # notebook-style display; a no-op when run as a script
3、数据补全
# Impute missing values column by column: numeric columns receive their
# mean, every other column receives its most frequent value.
# Work on an independent copy so the assignments below never target a view.
data_delete = data_delete.copy()
for column in data_delete.columns:
    values = data_delete[column]
    if pd.api.types.is_numeric_dtype(values):
        fill_value = values.mean()
    else:
        modes = values.mode()
        if modes.empty:
            # Column has no non-missing value, so there is nothing to fill with.
            continue
        fill_value = modes.iloc[0]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which is not guaranteed to update the frame.
    data_delete[column] = values.fillna(fill_value)
data_delete.info()
4、数据编码
# Encode the target as a single 0/1 indicator column.
# With drop_first=True a two-class target yields exactly one indicator
# column; anything else cannot be stored in one column, so fail loudly.
outcome_dummies = pd.get_dummies(data_delete['Outcome'], drop_first=True, dtype=int)
if outcome_dummies.shape[1] != 1:
    raise ValueError(
        "Outcome must have exactly two classes, got %d indicator column(s)"
        % outcome_dummies.shape[1]
    )
# assign() returns a new frame, so no filtered slice is modified in place.
data_delete = data_delete.assign(Outcome=outcome_dummies.iloc[:, 0])

# One-hot encode the categorical 'health' column and append the indicators.
health_dummies = pd.get_dummies(data_delete['health'], dtype=int)
data_delete1 = pd.concat([data_delete.drop('health', axis=1), health_dummies], axis=1)

# SkinThickness may still hold numeric text (it contained 'unknown' earlier
# in the script); parse it as numbers before casting to int.
data_delete1['SkinThickness'] = pd.to_numeric(data_delete1['SkinThickness']).astype(int)

# Standardise the selected columns to zero mean and unit standard deviation.
# NOTE(review): the statistics come from the full data set, i.e. before the
# train/validation split done later in the script - confirm this is intended.
for feature in ['Glucose', 'BloodPressure', 'SkinThickness']:
    column = data_delete1[feature]
    data_delete1[feature] = (column - column.mean()) / column.std()
data_delete1  # notebook-style display; a no-op when run as a script
5、数据可视化
# Two stacked histograms: Age on the first axes, Outcome on the second,
# written to disk under the name 'result1'.
fig, ax = plt.subplots(nrows=2)
age_axes, outcome_axes = ax
data_delete1['Age'].hist(ax=age_axes)
data_delete['Outcome'].hist(ax=outcome_axes)
fig.savefig('result1')
6、数据训练
# Separate the features from the target and hold out 20% for validation.
X = data_delete1.drop(columns=['Outcome'])
y = data_delete1['Outcome']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid explored by the cross-validated search.
params = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 9],
}

# 5-fold grid search scored by F1. The estimator is seeded with the same
# value as the split so that repeated runs select the same model.
model = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    cv=5,
    param_grid=params,
    scoring='f1',
    verbose=3,
)
model.fit(X_train, y_train)
7、数据预测
# Score the held-out validation set with the refitted best estimator.
y_pred = model.predict(X_val)
# Second predict_proba column, i.e. the second entry of the model's classes_.
y_pred_p = model.predict_proba(X_val)[:, 1]
# ROC points are computed here and plotted later in the script.
fpr, tpr, _ = roc_curve(y_val, y_pred_p)
# Metric functions take (y_true, y_pred) in that order; '%.4f' prints four
# decimals (the previous '%4f' only set a minimum field width).
print('f1值为%.4f' % f1_score(y_val, y_pred))
print('auc值为%.4f' % roc_auc_score(y_val, y_pred_p))
8、结果画图
# Confusion matrix and precision/recall pairs for the validation predictions.
cm = confusion_matrix(y_val, y_pred)
precision, recall, _ = precision_recall_curve(y_val, y_pred_p)

# ROC curve on the first axes, PR curve on the second, saved as 'roc'.
fig1, axis = plt.subplots(nrows=2)
roc_axes, pr_axes = axis
roc_axes.plot(fpr, tpr)
roc_axes.set_title('ROC')
pr_axes.plot(recall, precision)
pr_axes.set_title('PR')
fig1.savefig('roc')

# Draw the confusion matrix with Seaborn's heatmap on a fresh figure.
fig2 = plt.figure()
ax2 = sns.heatmap(cm, annot=True, fmt='d')
ax2.set(title='Confusion Matrix', xlabel='Predicted label', ylabel='True label')
fig2.savefig('confusion_matrix')