jupyter实现k近邻算法预测糖尿病

数据集采用的是UCI的糖尿病数据
代码如下

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\",message=\"numpy.ufunc size changed\")\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "#1计算输入样本点,到每个样本的距离\n",
    "def distance(arr1, arr2):\n",
    "    return np.sqrt(sum((arr1 - arr2) ** 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "#knn,没有常规机器学习的三步走:模型选择、模型评价、参数拟合。\n",
    "#knn,预测和训练合二为一\n",
    "def myKNN(X, y, test, k=3):\n",
    "    #1.计算输入样本点,到每个样本的距离\n",
    "    distances = [distance(test,x) for x in X]\n",
    "    #2.将距离值向量升序取前k个值序号\n",
    "    neighbors = np.argsort(distances)[:k]\n",
    "    #3.投票决定属于哪个类\n",
    "    counter = Counter(y[neighbors])\n",
    "    return counter.most_common()[0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "import mglearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "diabetes=pd.read_csv('diabetes.csv')\n",
    "X=diabetes.values[:,0:8]\n",
    "Y=diabetes.values[:,8]\n",
    "# print('dataset shape {}'.format(diabetes_data.shape))\n",
    "# diabetes_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=66);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "#测试我们自己写的myKNN(X, y, test, k=3)\n",
    "#测试集有38个点,用循环的方式来计算\n",
    "#Python循环参考:distances = [distance(test,x) for x in X]\n",
    "predictions = [myKNN(X_train, y_train, test, k=20) for test in X_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7552083333333334"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "correct = np.count_nonzero((predictions == y_test) == True)\n",
    "correct/ len(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=1, n_neighbors=11, p=2,\n",
       "           weights='uniform')"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "knn = KNeighborsClassifier(n_neighbors=11) #创建分类器\n",
    "knn.fit(X_train, y_train) #虽然不做事"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test set score: 0.74\n"
     ]
    }
   ],
   "source": [
    "#预测,sklearn中的函数可以直接接受矩阵样本输入,不需要循环\n",
    "sklearn_predictions = knn.predict(X_test)\n",
    "#knn.predict返回值和我们不一样,所以不能套用我们的评估方法\n",
    "#sklearn_correct = np.count_nonzero((sklearn_predictions == y_test) == True)\n",
    "print(\"Test set score: {:.2f}\".format(np.mean(sklearn_predictions == y_test)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值