目录
一、流程
KNN基本算法:
1.计算距离
2.递增排序
3.取K (奇数)个最近点
4.通过简单多数或其他规则确定分类
二、手动实现KNN-性别判断
任务描述
本关任务:根据KNN原理,通过“属性分配”的49个数据(不包含你自己)进行训练,分别令K=5、7、9,判断你的性别。
import numpy as np
import pandas as pd
import csv
import warnings
warnings.filterwarnings("ignore")
# Load the gender dataset; pandas consumes the CSV header row itself.
f=pd.read_csv(r'/data/workspace/myshixun/step2/data.csv')
f=np.array(f)
# Rows from index 1 on, feature columns 1..5.
# NOTE(review): row 0 of the array is skipped even though read_csv already
# removed the header -- confirm the CSV really has an extra header-like
# first data row, otherwise one sample is silently dropped.
data=f[1:,1:6]
# Column 7 holds the gender label.
label=f[1:,7]
#print(data,label)
def classify(inX, data, labels, k):
    '''Return the majority label among the k training samples nearest to inX.

    :param inX: feature vector of the query point (list or 1-D ndarray)
    :param data: training features, ndarray of shape (n_samples, n_features)
    :param labels: training labels, indexable by sample position
    :param k: number of nearest neighbours that vote (use an odd k to avoid ties)
    :return: the label occurring most often among the k nearest samples
    '''
    import json  # local import: only needed by the debug prints below

    dataSetSize = data.shape[0]  # number of training samples
    print(dataSetSize)
    print('复制输入向量 用于和样本中的每条数据进行计算 [矩阵的加减乘除]')
    # FIX: 'tile' was called unqualified (NameError) -- it is numpy's tile.
    # The original also printed this block twice; the duplicate is removed.
    print(np.tile(inX, (dataSetSize, 1)))
    # Matrix subtraction: each row is the difference between the query
    # vector and one training sample, feature by feature.
    diffmat = np.tile(inX, (dataSetSize, 1)) - data
    print('\n相减后:')
    print(diffmat)
    sqDiffMat = diffmat ** 2  # square every entry
    print('\n平方后:')
    print(sqDiffMat)
    sqDistances = sqDiffMat.sum(axis=1)  # axis=1: sum within each row
    print('\n各个特征点差值相加[即坐标差值相加]:')
    print(sqDistances)
    distances = sqDistances ** 0.5  # Euclidean distance to every sample
    print('\n距离:')
    print(distances)
    sortedDistIndexes = distances.argsort()  # indices sorted by ascending distance
    classCount = {}  # label -> number of appearances among the k nearest
    for i in range(k):
        voteLabel = labels[sortedDistIndexes[i]]  # label of the i-th nearest sample
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    print('标签出现的次数:')
    # FIX: json was never imported in the original (NameError at runtime).
    print(json.dumps(classCount, ensure_ascii=False))
    # FIX: operator was never imported either; a lambda key avoids it.
    sortedClassCount = sorted(classCount.items(), key=lambda item: item[1], reverse=True)
    print('\n排序后:')
    print(json.dumps(sortedClassCount, ensure_ascii=False))
    # e.g. sortedClassCount == [('A', 2), ('B', 1)]
    return sortedClassCount[0][0]  # label with the highest vote count
调库实现KNN-性别判断
#encoding=utf8
import numpy as np
import student #module under test (the student's submission)
'''
print(dir(student))
['__builtins__', '__cached__', '__doc__', '__file__', '__loader__',
'__name__', '__package__', '__spec__', 'np', 'sigmoid']
注意:删掉if __name__ == "__main__"函数!
'''
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier as knn
import warnings
warnings.filterwarnings("ignore")
# Use pandas to load and preprocess the data
f=pd.read_csv(r'/data/workspace/myshixun/step2/data.csv')
f=np.array(f)
x=f[1:,1:6]
y=f[1:,7]
# y has only two values (male/female), so encode it as a 0/1 dummy variable
from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(y)
# Split into training and test sets; train_test_split also shuffles the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
from sklearn.preprocessing import StandardScaler
# Standardise features using statistics fitted on the training split only
scaler1 = StandardScaler()
scaler1.fit(x_train)
x_train = scaler1.transform(x_train)
x_test = scaler1.transform(x_test)
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): this rebinding shadows the 'knn' import alias defined above
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
该样本空间里有宅男和文艺青年这两个类别(手动)
#encoding=utf8
import numpy as np
from collections import Counter
class kNNClassifier(object):
    '''A minimal k-nearest-neighbour classifier using Euclidean distance.'''

    def __init__(self, k):
        '''
        Initialise the classifier.
        :param k: the k in kNN (number of neighbours that vote)
        '''
        self.k = k
        # Training features, ndarray of shape (n_samples, n_features)
        self.train_feature = None
        # Training labels, ndarray of shape (n_samples,)
        self.train_label = None

    def fit(self, feature, label):
        '''
        kNN "training": simply memorise the training set (lazy learner).
        :param feature: training features, ndarray
        :param label: training labels, ndarray
        :return: None
        '''
        #********* Begin *********#
        self.train_feature = feature
        self.train_label = label
        self.train_vars = feature.shape[0]  # number of training samples
        #********* End *********#

    def predict(self, feature):
        '''
        Predict a label for every row of the test data.
        :param feature: test features, ndarray
        :return: predicted labels, list
        '''
        #********* Begin *********#
        result = []
        for data in feature:
            distance = self.calculateDistance(data)  # distance to every training sample
            KLabels = self.getKLabels(distance)      # labels of the k nearest samples
            result.append(self.getAppearMostLabel(KLabels))
        return result

    # Step 1: compute Euclidean distances to every training sample.
    def calculateDistance(self, feature):
        # Broadcasting subtracts `feature` from every training row at once;
        # the original np.tile copy of the query vector is unnecessary.
        diffMat = self.train_feature - feature
        sqDistance = (diffMat ** 2).sum(axis=1)
        return sqDistance ** 0.5

    # Steps 2-3: sort distances ascending and take the k nearest labels.
    def getKLabels(self, distance):
        argOrder = distance.argsort()[0:self.k]
        # FIX: return a list instead of a one-shot generator so the result
        # can be iterated or inspected more than once.
        return [self.train_label[i] for i in argOrder]

    # Step 4: simple majority vote among the k labels.
    def getAppearMostLabel(self, KLabels):
        label, count = Counter(KLabels).most_common(1)[0]
        return label
    #********* End *********#
红酒分类(调库knn)
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
def classification(train_feature, train_label, test_feature):
    '''
    Classify the wines in test_feature with k-nearest neighbours.
    :param train_feature: training features, ndarray (possibly flattened)
    :param train_label: training labels, ndarray
    :param test_feature: test features, ndarray (possibly flattened)
    :return: predicted class for every test sample
    '''
    #********* Begin *********#
    # Standardise the features; fit the scaler on the training data only
    # so no information from the test set leaks into the transform.
    scaler = StandardScaler()
    # FIX: reshape(133, 13) / reshape(45, 13) hard-coded the sample counts
    # and crashed on any other train/test split.  reshape(-1, 13) keeps
    # the wine dataset's 13 features but accepts any number of samples.
    train_feature = scaler.fit_transform(np.array(train_feature).reshape(-1, 13))
    test_feature = scaler.transform(np.array(test_feature).reshape(-1, 13))
    # Build and train the k-nearest-neighbour classifier
    clf = KNeighborsClassifier()
    clf.fit(train_feature, train_label.astype('int'))
    # Predict the test set
    predict_result = clf.predict(test_feature)
    return predict_result
    #********* End **********#
红酒5关
使用sklearn中的kNN算法进行分类
from sklearn.neighbors import KNeighborsClassifier
def classification(train_feature, train_label, test_feature):
    '''
    Predict labels for test_feature with sklearn's k-nearest neighbours.
    :param train_feature: training set features
    :param train_label: training set labels
    :param test_feature: test set features
    :return: predicted labels for the test set
    '''
    #********* Begin *********#
    model = KNeighborsClassifier()          # default-parameter kNN classifier
    model.fit(train_feature, train_label)   # memorise the training data
    predictions = model.predict(test_feature)
    return predictions
    #********* End *********#
使用sklearn中的kNN算法进行回归
from sklearn.neighbors import KNeighborsRegressor
def regression(train_feature, train_label, test_feature):
    '''
    Predict continuous targets for test_feature with KNeighborsRegressor.
    :param train_feature: training set features
    :param train_label: training set target values
    :param test_feature: test set features
    :return: predicted values for the test set
    '''
    #********* Begin *********#
    model = KNeighborsRegressor()           # default-parameter kNN regressor
    model.fit(train_feature, train_label)   # memorise the training data
    predictions = model.predict(test_feature)
    return predictions
    #********* End *********#
分析红酒数据
import numpy as np
def alcohol_mean(data):
    '''
    Return the average alcohol content of the wine dataset.
    :param data: wine dataset object exposing a .data feature matrix
    :return: mean of the first feature column (alcohol), type float
    '''
    #********* Begin *********#
    alcohol_column = data.data[:, 0]  # alcohol is the first feature column
    return np.mean(alcohol_column)
    #********* End **********#
对数据进行标准化
from sklearn.preprocessing import StandardScaler
def scaler(data):
    '''
    Standardise the wine features to zero mean and unit variance.
    :param data: wine dataset object supporting data['data']
    :return: standardised feature matrix, ndarray
    '''
    #********* Begin *********#
    standardiser = StandardScaler()
    return standardiser.fit_transform(data['data'])
    #********* End **********#
使用kNN算法进行预测
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
def classification(train_feature, train_label, test_feature):
    '''
    Classify the wines in test_feature after standardising the features.
    :param train_feature: training features, ndarray
    :param train_label: training labels, ndarray
    :param test_feature: test features, ndarray
    :return: predicted class for every test sample
    '''
    #********* Begin *********#
    # Fit the scaler on the training split only, then apply it to both
    # splits so the test set contributes nothing to the statistics.
    std = StandardScaler().fit(train_feature)
    knn_model = KNeighborsClassifier()
    knn_model.fit(std.transform(train_feature), train_label)
    return knn_model.predict(std.transform(test_feature))
    #********* End **********#
K近邻实战(鸢尾花)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, svm, tree, ensemble
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")  # suppress all warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # FIX: numpy.core.umath_tests was removed in NumPy >= 1.25, so this
    # import crashed on modern NumPy; guard it (inner1d is not used by
    # the visible code anyway).
    try:
        from numpy.core.umath_tests import inner1d
    except ImportError:
        inner1d = None
# set the number of neighbors
n_neighbors = 15
# import the iris dataset
#------------------begin--------------------
iris = datasets.load_iris()
# only take the first two features
X = iris.data[:, :2]
y = iris.target
#-------------------end---------------------
h = .02  # step size in the mesh
# Create color maps: light shades for regions, bold colors for points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Draw the decision boundary produced by the kNN classification result.
#------------------begin--------------------
# FIX: the pasted source lost all indentation, so the loop body was not
# inside the for statement (SyntaxError); structure reconstructed.
for weights in ['uniform', 'distance']:
    # create an instance of KNN Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    #-------------------end---------------------
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
                edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i, weights = '%s')"
              % (n_neighbors, weights))
    # NOTE: called once per weights value; the second save overwrites the first.
    plt.savefig("step3/结果/result.png")
from sklearn.ensemble import RandomForestClassifier
# Build a synthetic binary-classification dataset (fixed seed, no shuffle)
# and fit a small random forest on it.
X, y = datasets.make_classification(n_samples=1000, n_features=4,
                                    n_informative=2, n_redundant=0,
                                    random_state=0, shuffle=False)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                                random_state=0)
clf_rf.fit(X, y)
# Print each feature's importance and the predicted class of [0,0,0,0].
#------------------begin--------------------
sample = [[0, 0, 0, 0]]
print(clf_rf.feature_importances_)
print(clf_rf.predict(sample))
#-------------------end---------------------
自己输入数据
输入说明:输入由四行组成,每行由一个数组成,第一行表示要生成的数据组数,第二行表示生成数据时所使用的随机状态,第三行和第四行表示待测点的特征值(待测点只有两个特征)。
输出说明:输出由一行组成,即输入点的类别。
# -*- coding: utf-8 -*-
from sklearn.datasets import make_blobs
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Read four integers from stdin: sample count, random state, and the two
# feature values of the query point; print the query point's class.
# ********** Begin *********#
n_samples = int(input())
seed = int(input())
features, labels = make_blobs(n_samples=n_samples, random_state=seed)
model = KNeighborsClassifier()
model.fit(features, labels)
px = int(input())
py = int(input())
print(model.predict([[px, py]]))
# ********** End **********#