数据集采用的是 UCI 的糖尿病数据(diabetes.csv)。
代码如下:
{
"cells": [
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\",message=\"numpy.ufunc size changed\")\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"# Step 1 of KNN: distance from the query sample to one training sample.\n",
"def distance(arr1, arr2):\n",
"    \"\"\"Return the Euclidean distance between two sample points.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    arr1, arr2 : array-like of numbers\n",
"        Feature vectors of equal length (lists, tuples or NumPy arrays).\n",
"\n",
"    Returns\n",
"    -------\n",
"    numpy.float64\n",
"        For 1-D inputs, the square root of the summed squared coordinate\n",
"        differences.\n",
"    \"\"\"\n",
"    # asarray lets plain Python sequences be subtracted element-wise;\n",
"    # float arrays are passed through without a copy.\n",
"    diff = np.asarray(arr1, dtype=float) - np.asarray(arr2, dtype=float)\n",
"    # np.sum reduces in compiled code instead of looping in Python like the\n",
"    # built-in sum; axis=0 reduces over the first axis, as the built-in sum did.\n",
"    return np.sqrt(np.sum(diff ** 2, axis=0))"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"# KNN does not follow the usual three-step machine-learning workflow\n",
"# (model selection, model evaluation, parameter fitting):\n",
"# prediction and training are merged into a single step.\n",
"def myKNN(X, y, test, k=3):\n",
"    \"\"\"Classify one sample by majority vote among its k nearest training samples.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : array-like, shape (n_samples, n_features)\n",
"        Training feature vectors, one row per sample.\n",
"    y : array-like, shape (n_samples,)\n",
"        Class label of each training sample.\n",
"    test : array-like, shape (n_features,)\n",
"        The single sample to classify.\n",
"    k : int, default 3\n",
"        Number of neighbours that vote. If k exceeds the number of training\n",
"        samples, all of them vote.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The most frequent label among the k nearest training samples.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If k < 1, X is empty, or X and y differ in length.\n",
"    \"\"\"\n",
"    X = np.asarray(X, dtype=float)\n",
"    # y must be an ndarray so it can be indexed with the neighbour index array\n",
"    y = np.asarray(y)\n",
"    test = np.asarray(test, dtype=float)\n",
"    if k < 1:\n",
"        raise ValueError(\"k must be at least 1\")\n",
"    if len(X) == 0:\n",
"        raise ValueError(\"X must contain at least one training sample\")\n",
"    if len(X) != len(y):\n",
"        raise ValueError(\"X and y must have the same number of samples\")\n",
"    # 1. Euclidean distance from the query sample to every training sample,\n",
"    #    computed for all rows at once instead of in a Python loop\n",
"    distances = np.sqrt(np.sum((X - test) ** 2, axis=1))\n",
"    # 2. indices of the k smallest distances; mergesort is stable, so samples\n",
"    #    at equal distance keep their training-set order\n",
"    neighbors = np.argsort(distances, kind=\"mergesort\")[:k]\n",
"    # 3. majority vote among the neighbours' labels\n",
"    return Counter(y[neighbors]).most_common(1)[0][0]"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import mglearn"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"# Read the diabetes table from the CSV file in the working directory.\n",
"diabetes = pd.read_csv('diabetes.csv')\n",
"# Columns 0-7 form the feature matrix X; column 8 is the label vector Y.\n",
"X, Y = diabetes.values[:, :8], diabetes.values[:, 8]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"# Split features and labels into a training part and a held-out test part.\n",
"# The fixed random_state makes the split repeatable across runs.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    Y,\n",
"    random_state=66,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate our own myKNN(X, y, test, k=3) on the held-out samples.\n",
"# myKNN classifies one sample per call, so iterate over the rows of X_test.\n",
"# NOTE(review): k=20 here, whereas the scikit-learn classifier further down\n",
"# uses n_neighbors=11, so the two accuracies are not a like-for-like comparison.\n",
"predictions = [\n",
"    myKNN(X_train, y_train, sample, k=20)\n",
"    for sample in X_test\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7552083333333334"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of test samples whose predicted label equals the true label.\n",
"correct = np.count_nonzero(predictions == y_test)\n",
"# Accuracy: fraction of correct predictions (displayed as the cell output).\n",
"correct / len(y_test)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=1, n_neighbors=11, p=2,\n",
" weights='uniform')"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reference implementation: scikit-learn's k-nearest-neighbours classifier.\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"knn = KNeighborsClassifier(n_neighbors=11) # create the classifier with 11 neighbours\n",
"# NOTE(review): KNN has no parameters to optimise, but fit presumably still\n",
"# stores/indexes the training data - confirm against the scikit-learn docs.\n",
"knn.fit(X_train, y_train) # returns the estimator, whose repr is this cell's output"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test set score: 0.74\n"
]
}
],
"source": [
"# scikit-learn's predict takes the whole sample matrix at once, so no explicit loop is needed.\n",
"sklearn_predictions = knn.predict(X_test)\n",
"# Accuracy is the mean of the element-wise match between predictions and true labels,\n",
"# i.e. the same fraction-correct measure used for our own myKNN predictions.\n",
"print(\"Test set score: {:.2f}\".format((sklearn_predictions == y_test).mean()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}