机器学习-knn

最新推荐文章于 2024-10-01 23:00:34 发布

the uzi

最新推荐文章于 2024-10-01 23:00:34 发布

阅读量184

点赞数

文章标签：机器学习 python 人工智能

本文链接：https://blog.csdn.net/Albert__Einstein/article/details/129409200

版权

数据挖掘专栏收录该内容

34 篇文章 8 订阅

订阅专栏

该文使用K近邻(KNN)算法对糖尿病数据集进行分类。首先加载数据，对数据特性进行解释，然后将数据分为训练集和测试集。通过调整K值，发现当K=4时，测试集的准确率最高。接着分析了各特征与目标变量的相关性，最后计算了ROC曲线和AUC值，评估模型性能。

摘要由CSDN通过智能技术生成

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot') # use the ggplot plotting style

# Load the dataset into a DataFrame (2-D table)
df = pd.read_csv('diabetes.csv') # path is relative to the working directory

# Show the first 5 rows of the dataframe.
df.head()
# Column legend:
# Pregnancies: a count, not a yes/no flag (values such as 6 and 8 appear in the first rows)
# Glucose: blood glucose level; 70~140 is considered normal
# BloodPressure: blood pressure; hypertension: systolic >= 140 mmHg / diastolic >= 90 mmHg, hypotension: below 90/60 mmHg
#   NOTE(review): a single value per row is stored; presumably the diastolic reading - confirm against the dataset description
# SkinThickness: skin thickness in mm
# Insulin: 2-hour serum insulin, mu U/ml
# BMI: body mass index = weight in kg / (height in m)^2; adult reference range is 18.5-23.9
# DiabetesPedigreeFunction: diabetes pedigree function
#   NOTE(review): presumably a family-history based score - confirm against the dataset description
# Age: age (NOTE(review): presumably in years - confirm)
# NOTE(review): zeros show up in SkinThickness / Insulin in the first rows; presumably missing measurements - confirm

# Outcome: 1 = diabetes, 0 = no diabetes

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

# The dataset has 768 rows and 9 columns: the first 8 columns are features, the last column is the target label.
df.shape

(768, 9)

# Separate the feature matrix from the target column as plain NumPy arrays.
target_column = 'Outcome'
X = df.drop(columns=target_column).values  # all columns except the label
y = df[target_column].values  # label vector

# Peek at the first ten samples and their labels.
print(X[:10], y[:10])

[[6.000e+00 1.480e+02 7.200e+01 3.500e+01 0.000e+00 3.360e+01 6.270e-01
  5.000e+01]
 [1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
  3.100e+01]
 [8.000e+00 1.830e+02 6.400e+01 0.000e+00 0.000e+00 2.330e+01 6.720e-01
  3.200e+01]
 [1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
  2.100e+01]
 [0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
  3.300e+01]
 [5.000e+00 1.160e+02 7.400e+01 0.000e+00 0.000e+00 2.560e+01 2.010e-01
  3.000e+01]
 [3.000e+00 7.800e+01 5.000e+01 3.200e+01 8.800e+01 3.100e+01 2.480e-01
  2.600e+01]
 [1.000e+01 1.150e+02 0.000e+00 0.000e+00 0.000e+00 3.530e+01 1.340e-01
  2.900e+01]
 [2.000e+00 1.970e+02 7.000e+01 4.500e+01 5.430e+02 3.050e+01 1.580e-01
  5.300e+01]
 [8.000e+00 1.250e+02 9.600e+01 0.000e+00 0.000e+00 0.000e+00 2.320e-01
  5.400e+01]] [1 0 1 0 1 0 1 0 1 1]

# Split the data into a training set (80%) and a test set (20%).
# stratify=y keeps the diabetic / healthy ratio the same in both splits;
# random_state fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Count positive samples (one per sample) rather than summing their label
# values, so the result stays a sample count even if a positive label is
# ever encoded as something other than exactly 1.
pCount = sum(1 for x in y_train if x >= 1)  # number of diabetic samples
nCount = len(y_train) - pCount  # number of healthy samples
print(pCount, nCount)

214 400

import matplotlib.pyplot as plt

# Pie chart of the class balance in the training set.
labels = ('diabetes', 'health')  # slice labels
sizes = [pCount, nCount]  # slice values
explode = (0, 0.1)  # pull the second slice ('health') slightly out

pie_style = {
    'explode': explode,
    'labels': labels,
    'autopct': '%1.1f%%',
    'colors': ["#d5695d", "#65a479"],
    'shadow': True,
    'startangle': 90,
}

fig1, ax1 = plt.subplots()
ax1.pie(sizes, **pie_style)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.show()

![训练集中糖尿病与健康样本占比饼图](knn-for-classification_files/knn-for-classification_5_0.png)

from sklearn.neighbors import KNeighborsClassifier

neighbors = np.arange(1, 9)  # candidate k values: 1..8
train_accuracy = np.empty(neighbors.size)  # filled in by the loop below
test_accuracy = np.empty(neighbors.size)

# Fit one model per k and record its accuracy on both splits.
# NOTE(review): k is compared on the test split, so the best test score is an
# optimistic estimate - consider a validation split or cross-validation.
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=k).fit(X_train, y_train)

    # Accuracy on the training split, then on the held-out test split.
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    print("k:%s, 训练集准确率:%s 测试集准确率:%s" % (k, train_accuracy[i], test_accuracy[i]))

# Plot both accuracy curves against k.
plt.title('kNN Varying number of neighbors')
for accuracy_curve, curve_label in (
    (test_accuracy, 'Testing Accuracy'),
    (train_accuracy, 'Training accuracy'),
):
    plt.plot(neighbors, accuracy_curve, label=curve_label)
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

k:1, 训练集准确率:1.0 测试集准确率:0.6623376623376623
k:2, 训练集准确率:0.8436482084690554 测试集准确率:0.7142857142857143
k:3, 训练集准确率:0.8566775244299675 测试集准确率:0.6948051948051948
k:4, 训练集准确率:0.8175895765472313 测试集准确率:0.7272727272727273
k:5, 训练集准确率:0.8045602605863192 测试集准确率:0.6688311688311688
k:6, 训练集准确率:0.7899022801302932 测试集准确率:0.7142857142857143
k:7, 训练集准确率:0.7817589576547231 测试集准确率:0.6753246753246753
k:8, 训练集准确率:0.7866449511400652 测试集准确率:0.7077922077922078

![不同 k 值下训练集与测试集准确率曲线](knn-for-classification_files/knn-for-classification_6_1.png)

# Feature selection: Pearson correlation of each feature with the label.
from scipy.stats import pearsonr
import pandas as pd

label = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
train_df = pd.DataFrame(X_train)  # columns are the integers 0..7, in `label` order

# pearsonr returns (correlation coefficient, p-value). A larger |r| together
# with a smaller p-value indicates a stronger linear relationship.
for column, feature_name in enumerate(label):
    coefficient = pearsonr(train_df[column], y_train)[0]
    print('%s:%s' % (feature_name, round(coefficient, 2)))

# NOTE: the original author left a disabled `del train_df[2]` here (dropping
# column 2, BloodPressure); the column is kept.

Pregnancies:0.21
Glucose:0.48
BloodPressure:0.08
SkinThickness:0.09
Insulin:0.16
BMI:0.3
DiabetesPedigreeFunction:0.17
Age:0.24

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType

# Chart API reference: https://gallery.pyecharts.org/#/Bar/bar_base

# BMI (body mass index) = weight in kg / (height in m) ** 2
# Buckets used for the chart:
#   thin:        BMI < 18.5
#   normal:      18.5 <= BMI < 25
#   micro heavy: 25 <= BMI <= 30
#   overweight:  BMI > 30
# The buckets are contiguous so every sample lands in exactly one of them;
# leaving a gap between 24.9 and 25.9 would silently drop those samples
# from the chart.
# NOTE(review): BMI values of 0 presumably mark missing measurements and are
# counted as "thin" - confirm and filter them out if that is the case.

bmi_values = train_df[5]  # column 5 is BMI in the dataset's feature order

p1 = sum(1 for x in bmi_values if x < 18.5)
p2 = sum(1 for x in bmi_values if 18.5 <= x < 25)
p3 = sum(1 for x in bmi_values if 25 <= x <= 30)
p4 = sum(1 for x in bmi_values if x > 30)

print('训练集人数:%s' % len(train_df))

# Bar chart of the number of training samples per BMI bucket.
c = (
    Bar({"theme": ThemeType.MACARONS})
    .add_xaxis(["thin", "normal", "micro heavy", "overweight"])
    .add_yaxis("BMI 范围", [p1, p2, p3, p4])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="BMI 范围统计", subtitle="单位: 人"),
        brush_opts=opts.BrushOpts(),
    )
)

# Last expression of the cell: renders the chart inline in the notebook.
c.render_notebook()

训练集人数:614

    <div id="c530b4ff3d7f4f6b88a7310d2c4cb2a8" style="width:900px; height:500px;"></div>

# k = 4 gave the highest test-set accuracy in the sweep, so refit with it.
knn = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=4).fit(train_df, y_train)

# Accuracy on the held-out test set.
test_df = pd.DataFrame(X_test)
score = knn.score(test_df, y_test)
print(score)

0.7272727272727273

from sklearn.metrics import roc_auc_score, roc_curve

# Predicted probability of the positive class (column 1, i.e. Outcome == 1).
y_pred_proba = knn.predict_proba(test_df)[:, 1]
# fpr = false positive rate, tpr = true positive rate, one pair per threshold.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.plot([0, 1], [0, 1], 'k--')  # dashed y = x reference line
plt.plot(fpr, tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('Knn(n_neighbors=4) ROC curve')
plt.legend()  # without this call the label='Knn' above is never displayed
plt.show()

# AUC = area under the ROC curve, rounded to 4 decimal places.
print("the auc is:%s" % round(roc_auc_score(y_test, y_pred_proba), 4))