数据分析挖掘实例-CSDN博客

本文链接：https://blog.csdn.net/weixin_43693446/article/details/140722412

1、导入数据

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, confusion_matrix, precision_recall_curve
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw dataset (the file is read with GBK encoding).
# A forward slash is used as the path separator: "\d" is not a valid escape
# sequence in a normal string literal, and a backslash separator only
# resolves on Windows, whereas "/" is accepted on every platform.
data = pd.read_csv("data/diabetes.csv", encoding='gbk')
data

2、数据清洗

# Remove exact duplicate rows.
data_dup = data.drop_duplicates()
# Drop rows whose target value (Outcome) is missing.
data_delete1 = data_dup[data_dup['Outcome'].notnull()]
# Drop rows whose SkinThickness holds the placeholder string 'unknown'.
# The mask is built from data_delete1 itself so its index matches the frame
# being filtered, and .copy() yields an independent frame so that later
# column assignments do not write into a slice of data_dup.
data_delete = data_delete1[data_delete1['SkinThickness'] != 'unknown'].copy()
data_delete

3、数据补全

# Fill missing values column by column: object-dtype columns get their most
# frequent value, every other column gets its mean.
# Work on an independent copy and assign each filled column back explicitly;
# calling fillna(inplace=True) on a column selection is chained assignment
# and is not guaranteed to update the parent frame.
data_delete = data_delete.copy()
for column in data_delete.columns:
    if data_delete[column].dtypes == 'object':
        fill_value = data_delete[column].mode()[0]
    else:
        fill_value = data_delete[column].mean()
    data_delete[column] = data_delete[column].fillna(fill_value)
data_delete.info()

4、数据编码

# Replace the Outcome column with its dummy encoding (first level dropped).
outcome_dummies = pd.get_dummies(data_delete['Outcome'], drop_first=True, dtype=int)
data_delete['Outcome'] = outcome_dummies

# Swap the 'health' column for one integer indicator column per level.
health_dummies = pd.get_dummies(data_delete['health'], dtype=int)
frame_without_health = data_delete.drop(columns='health')
data_delete1 = pd.concat([frame_without_health, health_dummies], axis=1)

# SkinThickness is cast to int before it is standardised.
data_delete1['SkinThickness'] = data_delete1['SkinThickness'].astype(int)

# Z-score standardisation: subtract the column mean, divide by the column std.
for column in ('Glucose', 'BloodPressure', 'SkinThickness'):
    values = data_delete1[column]
    data_delete1[column] = (values - values.mean()) / values.std()

data_delete1

5、数据可视化

# One figure with two subplots: a histogram of Age on the first and a
# histogram of the encoded Outcome on the second; saved as 'result1'.
fig, ax = plt.subplots(2)
age_axis, outcome_axis = ax
data_delete1['Age'].hist(ax=age_axis)
data_delete['Outcome'].hist(ax=outcome_axis)
fig.savefig('result1')

6、数据训练


# Separate the features from the target column.
X = data_delete1.drop(columns=['Outcome'])
y = data_delete1['Outcome']

# Hold out 20% of the rows for validation; the split is seeded.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid explored by the search below.
params = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 9]
}

# 5-fold cross-validated grid search scored by F1. The estimator is seeded
# with the same value as the split so repeated runs give the same result.
model = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    cv=5,
    param_grid=params,
    scoring='f1',
    verbose=3
)

model.fit(X_train, y_train)

7、数据预测

# Hard class predictions and positive-class probabilities on the validation set.
y_pred = model.predict(X_val)
y_pred_p = model.predict_proba(X_val)[:, 1]
# ROC curve points, computed from the probabilities.
fpr, tpr, _ = roc_curve(y_val, y_pred_p)
# Metric functions take (y_true, y_pred) in that order; values are printed
# with four decimal places.
print('f1值为%.4f' % f1_score(y_val, y_pred))
print('auc值为%.4f' % roc_auc_score(y_val, y_pred_p))

8、结果画图



# Confusion matrix from the hard predictions, and precision/recall pairs
# from the predicted probabilities.
cm = confusion_matrix(y_val, y_pred)
precision, recall, _ = precision_recall_curve(y_val, y_pred_p)

# First subplot: ROC curve; second subplot: precision-recall curve.
fig1, axis = plt.subplots(2)
roc_axis, pr_axis = axis
roc_axis.plot(fpr, tpr)
roc_axis.set_title('ROC')
pr_axis.plot(recall, precision)
pr_axis.set_title('PR')
fig1.savefig('roc')
 
 
# Draw the confusion matrix as an annotated Seaborn heatmap on a new figure
# and save it as 'confusion_matrix'.
fig2 = plt.figure()
ax2 = sns.heatmap(cm, annot=True, fmt='d')
ax2.set(
    title='Confusion Matrix',
    xlabel='Predicted label',
    ylabel='True label',
)
fig2.savefig('confusion_matrix')