import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use the ggplot look for every matplotlib figure drawn below.
plt.style.use('ggplot')

# Load the dataset into a two-dimensional table (DataFrame).
DATA_PATH = 'diabetes.csv'
df = pd.read_csv(DATA_PATH)

# Column reference:
#   Pregnancies              number of pregnancies (a count, e.g. 6, 1, 8)
#   Glucose                  blood glucose; 70~140 is considered normal
#   BloodPressure            blood pressure; hypertension: systolic >= 140 mmHg /
#                            diastolic >= 90 mmHg; hypotension: below 90/60 mmHg
#   SkinThickness            skin thickness, in mm
#   Insulin                  two-hour serum insulin, in mu U/ml
#   BMI                      body mass index = weight in kg / (height in m)^2;
#                            the adult reference range is 18.5-23.9
#   DiabetesPedigreeFunction diabetes pedigree function (presumably a
#                            family-history score; verify against the dataset docs)
#   Age                      age
#   Outcome                  1 = diabetes, 0 = no diabetes

# Show the first 5 rows of the dataframe.
df.head(5)
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# The table has 768 rows and 9 columns: the first 8 columns are the features
# and the last column is the target/label.
df.shape
(768, 9)
# Separate the feature matrix (every column except 'Outcome') from the label
# vector, both as plain NumPy arrays, and peek at the first ten rows of each.
X = df.drop(columns='Outcome').values
y = df['Outcome'].values  # label
print(X[:10], y[:10])
[[6.000e+00 1.480e+02 7.200e+01 3.500e+01 0.000e+00 3.360e+01 6.270e-01
5.000e+01]
[1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
3.100e+01]
[8.000e+00 1.830e+02 6.400e+01 0.000e+00 0.000e+00 2.330e+01 6.720e-01
3.200e+01]
[1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
2.100e+01]
[0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
3.300e+01]
[5.000e+00 1.160e+02 7.400e+01 0.000e+00 0.000e+00 2.560e+01 2.010e-01
3.000e+01]
[3.000e+00 7.800e+01 5.000e+01 3.200e+01 8.800e+01 3.100e+01 2.480e-01
2.600e+01]
[1.000e+01 1.150e+02 0.000e+00 0.000e+00 0.000e+00 3.530e+01 1.340e-01
2.900e+01]
[2.000e+00 1.970e+02 7.000e+01 4.500e+01 5.430e+02 3.050e+01 1.580e-01
5.300e+01]
[8.000e+00 1.250e+02 9.600e+01 0.000e+00 0.000e+00 0.000e+00 2.320e-01
5.400e+01]] [1 0 1 0 1 0 1 0 1 1]
# Split the data into a training set (80%) and a test set (20%).
# random_state pins the shuffle so the split is reproducible, and stratify=y
# asks for the same class proportions in both parts.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Count the positive samples rather than summing their label values: the
# result stays a number of samples even if a label were ever larger than 1,
# and the comparison runs vectorised on the NumPy array.
pCount = int(np.count_nonzero(y_train >= 1))  # samples with diabetes
nCount = len(y_train) - pCount  # samples without diabetes
print(pCount, nCount)
214 400
import matplotlib.pyplot as plt

# Pie chart of the class balance in the training labels.
labels = ('diabetes', 'health')  # wedge labels
sizes = [pCount, nCount]  # wedge values
explode = (0, 0.1)  # offset the second wedge ('health') from the centre

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    explode=explode,
    colors=["#d5695d", "#65a479"],
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
)
# Equal aspect ratio ensures that the pie is drawn as a circle.
ax1.axis('equal')
plt.show()
![png](knn-for-classification_files/knn-for-classification_5_0.png)
from sklearn.neighbors import KNeighborsClassifier

# Candidate values of k: 1 through 8.
neighbors = np.arange(1, 9)
train_accuracy = np.empty(neighbors.shape)
test_accuracy = np.empty(neighbors.shape)

# Fit one classifier per k and record its accuracy on both splits.
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=k).fit(X_train, y_train)
    # Accuracy on the data the model was fitted on.
    train_accuracy[i] = knn.score(X_train, y_train)
    # Accuracy on the held-out test split.
    test_accuracy[i] = knn.score(X_test, y_test)
    print("k:%s, 训练集准确率:%s 测试集准确率:%s" % (k, train_accuracy[i], test_accuracy[i]))

# Draw both accuracy curves against k.
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.title('kNN Varying number of neighbors')
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
k:1, 训练集准确率:1.0 测试集准确率:0.6623376623376623
k:2, 训练集准确率:0.8436482084690554 测试集准确率:0.7142857142857143
k:3, 训练集准确率:0.8566775244299675 测试集准确率:0.6948051948051948
k:4, 训练集准确率:0.8175895765472313 测试集准确率:0.7272727272727273
k:5, 训练集准确率:0.8045602605863192 测试集准确率:0.6688311688311688
k:6, 训练集准确率:0.7899022801302932 测试集准确率:0.7142857142857143
k:7, 训练集准确率:0.7817589576547231 测试集准确率:0.6753246753246753
k:8, 训练集准确率:0.7866449511400652 测试集准确率:0.7077922077922078
![png](knn-for-classification_files/knn-for-classification_6_1.png)
# Feature selection: Pearson correlation coefficient of each feature with the label.
from scipy.stats import pearsonr
import pandas as pd

# Feature names, in the same order as the columns of X_train.
label = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
train_df = pd.DataFrame(X_train)

# pearsonr returns the correlation coefficient first and the p-value second.
# As a rule of thumb, a larger coefficient together with a smaller p-value
# means a stronger linear relationship. Only the coefficient is printed here,
# rounded to two decimals.
for i, name in enumerate(label):
    coef = pearsonr(train_df[i], y_train)[0]
    print('%s:%s' % (name, round(coef, 2)))
Pregnancies:0.21
Glucose:0.48
BloodPressure:0.08
SkinThickness:0.09
Insulin:0.16
BMI:0.3
DiabetesPedigreeFunction:0.17
Age:0.24
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType

# Bar chart of how the training samples are distributed over BMI ranges.
# Chart recipe: https://gallery.pyecharts.org/#/Bar/bar_base
#
# BMI = weight in kg / (height in m)^2
#   thin:         BMI < 18.5
#   normal:       18.5 <= BMI < 25
#   micro heavy:  25 <= BMI <= 30
#   overweight:   BMI > 30
# The ranges are contiguous on purpose: leaving a gap between "normal" and
# "micro heavy" (for example 24.9-25.9) would silently drop samples such as a
# BMI of 25.6 from the chart.
# NOTE(review): a BMI of 0 (presumably a missing measurement; confirm) is
# counted in the "thin" range.
bmi = train_df[5]  # column 5 of the feature matrix holds BMI

# Vectorised counts, converted to plain ints before they go into the chart.
p1 = int((bmi < 18.5).sum())
p2 = int(((bmi >= 18.5) & (bmi < 25)).sum())
p3 = int(((bmi >= 25) & (bmi <= 30)).sum())
p4 = int((bmi > 30).sum())
print('训练集人数:%s' % len(train_df))

c = (
    Bar({"theme": ThemeType.MACARONS})
    .add_xaxis(["thin", "normal", "micro heavy", "overweight"])
    .add_yaxis("BMI 范围", [p1, p2, p3, p4])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="BMI 范围统计", subtitle="单位: 人"),
        brush_opts=opts.BrushOpts(),
    )
)
# Render the chart inline in the notebook.
c.render_notebook()
训练集人数:614
<div id="c530b4ff3d7f4f6b88a7310d2c4cb2a8" style="width:900px; height:500px;"></div>
# Final model: k = 4 gave the highest test-set accuracy in the sweep over k.
# NOTE(review): k was chosen by looking at the test set, so the score printed
# below is likely optimistic; consider selecting k on a validation split.
test_df = pd.DataFrame(X_test)
knn = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=4)
# Train on the training split.
knn.fit(train_df, y_train)
# Accuracy on the test split.
score = knn.score(test_df, y_test)
print(score)
0.7272727272727273
from sklearn.metrics import roc_curve

# ROC curve of the k = 4 classifier on the test split.
# Take the second column of predict_proba as the score for class 1
# (columns follow the order of knn.classes_).
y_pred_proba = knn.predict_proba(test_df)[:, 1]
# fpr = false positive rate, tpr = true positive rate, one pair per threshold.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.plot([0, 1], [0, 1], 'k--')  # dashed y = x reference line
plt.plot(fpr, tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('Knn(n_neighbors=4) ROC curve')
# Without a legend the label given to the curve above is never displayed.
plt.legend()
plt.show()

# AUC: the area under the ROC curve.
from sklearn.metrics import roc_auc_score
print("the auc is:%s" % round(roc_auc_score(y_test,y_pred_proba),4))  # keep 4 decimals
![png](knn-for-classification_files/knn-for-classification_10_0.png)
the auc is:0.6989