scikit-learn k近邻算法分类学习应用之糖尿病预测

k近邻应用 : 糖尿病预测(Pima Indians 糖尿病数据集)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

# Load the data set.
data = pd.read_csv('datasets/pima-indians-diabetes/diabetes.csv')
print('dataset shape {}'.format(data.shape))

# The first eight columns are used as features, the ninth as the label.
x = data.iloc[:, :8]
y = data.iloc[:, 8]

# Hold out 20% of the rows for testing. No random_state is given, so the
# split (and every score that depends on it) differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


neighbor_num = 2
# Compare the scores of the different neighbour-based models.
# NOTE: every score in this section is measured on the training split itself,
# so it is a training accuracy, not an estimate of generalization.
KNN_uniform = KNeighborsClassifier(n_neighbors=neighbor_num, weights="uniform")
KNN_uniform.fit(x_train, y_train)
print("KNN_uniform score : %f"%KNN_uniform.score(x_train, y_train))
KNN_distance = KNeighborsClassifier(n_neighbors=neighbor_num, weights="distance")
KNN_distance.fit(x_train, y_train)
print("KNN_distance score : %f"%KNN_distance.score(x_train, y_train))
# Radius-based variant: neighbours are selected by distance (radius), not by
# count. RadiusNeighborsClassifier has no n_neighbors parameter, so passing it
# raises TypeError on current scikit-learn releases; only radius is given.
KNN_Radius = RadiusNeighborsClassifier(radius=500.0)
KNN_Radius.fit(x_train, y_train)
print("KNN_Radius score : %f"%KNN_Radius.score(x_train, y_train))

# Score the same three models with cross_val_score.
# The data set is divided into 15 folds; cross_val_score evaluates the given
# estimator once per fold, holding that fold out for validation, and returns
# one score per fold. The mean of those scores is reported for each model.
fold = KFold(n_splits=15)

KNN_uniform_result = cross_val_score(KNN_uniform, x, y, cv=fold)
uniform_cv_mean = KNN_uniform_result.mean()
print("KNN_uniform score by cross_val_score : %f" % uniform_cv_mean)

KNN_distance_result = cross_val_score(KNN_distance, x, y, cv=fold)
distance_cv_mean = KNN_distance_result.mean()
print("KNN_distance score by cross_val_score : %f" % distance_cv_mean)

KNN_Radius_result = cross_val_score(KNN_Radius, x, y, cv=fold)
radius_cv_mean = KNN_Radius_result.mean()
print("KNN_Radius score by cross_val_score : %f" % radius_cv_mean)



# Final model: refit the uniform-weighted classifier and compare its accuracy
# on the training split with its accuracy on the held-out test split.
KNN_uniform = KNeighborsClassifier(weights="uniform", n_neighbors=neighbor_num)
KNN_uniform.fit(x_train, y_train)
train_score, test_score = (
    KNN_uniform.score(x_train, y_train),
    KNN_uniform.score(x_test, y_test),
)
print("train score : %f, test score : %f"%(train_score, test_score))
# Sample run -> train score : 0.832248, test score : 0.740260
# Author's observation: because the data are split randomly, the numbers may
# differ between runs, but overall the train score is fairly low
# (underfitting) and the test score is lower still, i.e. prediction accuracy
# is poor. The learning curve is plotted next to look into this.


# Learning-curve plotting helper, meant to be reused as a shared function.
def plot_learning_curve(plt, model, title, x, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves for ``model``.

    The scores come from ``learning_curve(model, x, y, ...)``. For each
    training-set size the mean score is drawn as a line and a band of one
    standard deviation either side is shaded around it.

    Args:
        plt: Plotting object to draw on; its ``title``, ``ylim``, ``xlabel``,
            ``ylabel``, ``grid``, ``fill_between``, ``plot`` and ``legend``
            methods are called (the script passes ``matplotlib.pyplot``).
        model: Estimator forwarded to ``learning_curve``.
        title: Title of the plot.
        x: Feature data forwarded to ``learning_curve``.
        y: Target data forwarded to ``learning_curve``.
        ylim: Optional y-axis range; left untouched when ``None``.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The same ``plt`` object that was passed in.
    """
    plt.title(title)
    # Only constrain the y axis when a range was supplied.
    if ylim is not None:
        plt.ylim(ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        model, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean, std, colour, line format, legend label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", 'o--', "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", 'o-', "Cross-validation score"),
    ]

    # Grid lines.
    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, line_format, legend_label in curves:
        plt.plot(sizes, mean, line_format, color=colour, label=legend_label)

    # Legend position.
    plt.legend(loc="upper left")
    return plt

# Plot the learning curve of a k-nearest-neighbours classifier on the full
# data set, using the same neighbour count as the models scored earlier.
knn = KNeighborsClassifier(n_neighbors=neighbor_num)
# Ten random 80/20 splits; random_state is fixed so the curve is repeatable.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
plt.figure(figsize=(10, 6))
# The data loaded by this script is the Pima Indians diabetes set, so the
# figure is titled accordingly.
plot_learning_curve(plt, knn, "KNN for diabetes prediction", x, y,
                    ylim=(0.0, 1.01), cv=cv)
# Without an explicit show() nothing is displayed when this runs as a plain
# script rather than in a notebook.
plt.show()


输出:
dataset shape (768, 9)
KNN_uniform score : 0.828990
KNN_distance score : 1.000000
KNN_Radius score : 0.631922
KNN_uniform score by cross_val_score : 0.708421
KNN_distance score by cross_val_score : 0.674610
KNN_Radius score by cross_val_score : 0.649749
train score : 0.828990, test score : 0.727273

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值