1. Notes
I did this in Jupyter and then exported it to markdown; the command for exporting an ipynb file to markdown is:
jupyter nbconvert --to markdown xxx.ipynb
2. The problem
The Iris dataset is a classic in pattern recognition. It contains 150 records; the first four columns are sepal length, sepal width, petal length and petal width, the four attributes used to identify an iris, and the fifth column is the species (one of Setosa, Versicolour and Virginica).
In other words, the task is to identify the species of an iris from the measurements of its sepal length, sepal width, petal length and petal width.
The dataset can be downloaded from the UCI repository at http://archive.ics.uci.edu/ml/datasets/Iris.
It can also be imported directly from sklearn's datasets package with: from sklearn.datasets import load_iris.
A copy is also attached to this blog post.
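As a quick illustration of the sklearn route mentioned above, here is a minimal sketch; the DataFrame construction and the 'Species' column name are my own additions, not part of the downloaded iris.csv:

```python
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()
# Build a DataFrame similar to iris.csv: four measurement columns plus the species name.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['Species'] = [iris.target_names[t] for t in iris.target]
print(df.head())
```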
3. Practice
4. Source code
import pandas as pd
# http://archive.ics.uci.edu/ml/datasets/Iris
# from sklearn import datasets
# iris = datasets.load_iris()
iris_data=pd.read_csv('iris.csv')
iris_data.head()
|   | No | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|----|--------------|-------------|--------------|-------------|---------|
| 0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
type(iris_data)
pandas.core.frame.DataFrame
# drop the row-number column
iris_data_01 = iris_data.drop('No', axis=1)
iris_data_01.head()
|   | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|--------------|-------------|--------------|-------------|---------|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# rename the columns
iris_data_01.columns = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm', 'class']
iris_data_01.head()
|   | sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm | class |
|---|-----------------|----------------|-----------------|----------------|-------|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
iris_data_01.describe()
|       | sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm |
|-------|-----------------|----------------|-----------------|----------------|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
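describe() only covers the numeric columns; to confirm that the three species are equally represented, a quick check (my addition, not in the original notebook):

```python
# Count the rows per class; the standard Iris data has 50 samples of each species.
print(iris_data_01['class'].value_counts())
```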
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
# visualize the data: scatter plot of two features, colored by category
def scatter_plot_by_category(feat, x, y):
    alpha = 0.5
    gs = iris_data_01.groupby(feat)
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    for g, c in zip(gs, cs):
        plt.scatter(g[1][x], g[1][y], color=c, alpha=alpha)
plt.figure(figsize=(20,5))
plt.subplot(131)
scatter_plot_by_category('class', 'sepal_length_cm', 'petal_length_cm')
plt.xlabel('sepal_len')
plt.ylabel('petal_len')
plt.title('class')
Text(0.5, 1.0, 'class')
import seaborn as sb
plt.figure(figsize=(20, 10))
for column_index, column in enumerate(iris_data_01.columns):
    if column == 'class':
        continue
    plt.subplot(2, 2, column_index + 1)
    sb.violinplot(x='class', y=column, data=iris_data_01)
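Besides the violin plots, a pair plot shows every pairwise scatter of the four features at once; a one-line sketch with seaborn, using the same DataFrame and 'class' column as above:

```python
# Scatter-plot matrix of all feature pairs, colored by species.
sb.pairplot(iris_data_01, hue='class')
```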
# first split the data into a training set and a test set
from sklearn.model_selection import train_test_split  # module for splitting datasets
all_inputs = iris_data_01[['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm']].values
all_classes = iris_data_01['class'].values
# split into training and test sets
(X_train,
X_test,
Y_train,
Y_test) = train_test_split(all_inputs, all_classes, train_size=0.8, random_state=1)
Notes on the parameters of train_test_split:
- the first two positional arguments are the sample features and the sample labels to be split;
- test_size: as a float, the fraction of the data held out for testing (default 0.25); as an int, the absolute number of test samples; train_size works the same way for the training portion (0.8 is used above);
- random_state: the seed of the random number generator, so the split is reproducible.
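With train_size=0.8 on 150 samples, the split should leave 120 rows for training and 30 for testing; a quick sanity check:

```python
# Expected output: (120, 4) (30, 4) and (120,) (30,)
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)
```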
X_train[:10]
array([[6.1, 3. , 4.6, 1.4],
[7.7, 3. , 6.1, 2.3],
[5.6, 2.5, 3.9, 1.1],
[6.4, 2.8, 5.6, 2.1],
[5.8, 2.8, 5.1, 2.4],
[5.3, 3.7, 1.5, 0.2],
[5.5, 2.3, 4. , 1.3],
[5.2, 3.4, 1.4, 0.2],
[6.5, 2.8, 4.6, 1.5],
[6.7, 2.5, 5.8, 1.8]])
# train with the decision tree algorithm
from sklearn.tree import DecisionTreeClassifier
# create a decision tree classifier
decision_tree_classifier = DecisionTreeClassifier()
# fit the model
model = decision_tree_classifier.fit(X_train, Y_train)
# accuracy of the resulting model on the test set
decision_tree_classifier.score(X_test, Y_test)
0.9666666666666667
X_test[0:3]
array([[5.8, 4. , 1.2, 0.2],
[5.1, 2.5, 3. , 1.1],
[6.6, 3. , 4.4, 1.4]])
Y_test[0:3]
array(['setosa', 'versicolor', 'versicolor'], dtype=object)
# quick prediction on the first three test samples
model.predict(X_test[0:3])
array(['setosa', 'versicolor', 'versicolor'], dtype=object)
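The 0.9667 above comes from a single 80/20 split. As a rough check of how stable that number is, one can average over several splits with cross_val_score; this is my addition, not part of the original notebook:

```python
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a fresh decision tree on the full dataset.
scores = cross_val_score(DecisionTreeClassifier(), all_inputs, all_classes, cv=10)
print(scores.mean(), scores.std())
```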
# now the kNN algorithm
from sklearn import neighbors
from sklearn import datasets  # the original iris data can also be taken from here
# create a kNN classifier
knn = neighbors.KNeighborsClassifier()
# fit the model
knn.fit(X_train, Y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
# try a prediction
knn.predict([[0.1, 0.2, 0.3, 0.4]])
array(['setosa'], dtype=object)
# accuracy of the predictions on the test set
knn.score(X_test, Y_test)
1.0
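The classifier above uses the default n_neighbors=5. If you want to choose k from the data instead of accepting the default, a hedged sketch with GridSearchCV (the candidate values are arbitrary):

```python
from sklearn.model_selection import GridSearchCV

# Try a few odd values of k with 5-fold cross-validation on the training set.
grid = GridSearchCV(neighbors.KNeighborsClassifier(),
                    {'n_neighbors': [1, 3, 5, 7, 9]}, cv=5)
grid.fit(X_train, Y_train)
print(grid.best_params_, grid.best_score_)
```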
# implement the kNN algorithm by hand
# see https://www.cnblogs.com/jyroy/p/9427977.html for details
import csv
import random
import math
import operator
# load the dataset and randomly split it into training and test sets
def loadDataset(filename, split, trainingSet=[], testSet=[]):
    with open(filename, 'r') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        for x in range(len(dataset)-1):
            for y in range(4):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:  # random split
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])
# distance between two points, in an arbitrary number of dimensions
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        # Euclidean distance: sum of squared differences per dimension
        distance += pow((instance1[x]-instance2[x]), 2)
    return math.sqrt(distance)
# find the k nearest neighbours of a test instance
def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))  # distance from the test point to every training point
    distances.sort(key=operator.itemgetter(1))  # sort by distance
    neighbors = []
    for x in range(k):  # take the k closest points
        neighbors.append(distances[x][0])
    return neighbors
# majority vote: the most common class among the k neighbours
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]  # the class with the most votes
# accuracy of the predictions, as a percentage
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet)))*100.0
def main():
    # prepare data
    trainingSet = []
    testSet = []
    split = 0.67
    loadDataset(r'irisdata.txt', split, trainingSet, testSet)
    print('Trainset: ' + repr(len(trainingSet)))
    print('Testset: ' + repr(len(testSet)))
    # generate predictions
    predictions = []
    k = 3
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    print('predictions: ' + repr(predictions))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')

if __name__ == '__main__':
    main()
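main() expects a headerless file irisdata.txt whose rows are the four measurements followed by the class label. If you only have the sklearn copy of the data, one way to produce such a file is sketched below; the file name and layout are assumptions made to match loadDataset, not a canonical format (loadDataset drops the last row, mimicking the trailing blank line of the UCI file):

```python
from sklearn import datasets
import csv

iris = datasets.load_iris()
with open('irisdata.txt', 'w', newline='') as f:
    writer = csv.writer(f)
    for row, target in zip(iris.data, iris.target):
        # four measurements followed by the species name, the layout loadDataset expects
        writer.writerow(list(row) + [iris.target_names[target]])
```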
'''
See https://blog.csdn.net/Asun0204/article/details/75607948 for details.
How kNN works: using some distance measure, find the k training samples closest to the query
sample and make the prediction from those k samples.
Classification: the query sample is assigned the most frequent class among the k points.
Regression: the prediction is usually the average of the k training samples' values.
The three ingredients of a kNN model: the distance measure, the choice of k, and the
classification/regression decision rule.
'''
import numpy as np

class KNNClassfier(object):

    def __init__(self, k=5, distance='euc'):
        self.k = k
        self.distance = distance
        self.x = None
        self.y = None

    def fit(self, X, Y):
        '''
        X : array-like [n_samples, n_features]
        Y : array-like [n_samples, 1]
        '''
        self.x = X
        self.y = Y

    def predict(self, X_test):
        '''
        X_test : array-like [n_samples, n_features]
        output : array-like [n_samples, 1]
        '''
        output = np.zeros((X_test.shape[0], 1))
        for i in range(X_test.shape[0]):
            dis = []
            for j in range(self.x.shape[0]):
                if self.distance == 'euc':  # Euclidean distance
                    dis.append(np.linalg.norm(X_test[i]-self.x[j, :]))
            labels = []
            index = sorted(range(len(dis)), key=dis.__getitem__)  # indices sorted by distance
            for j in range(self.k):
                labels.append(self.y[index[j]])
            counts = []
            for label in labels:
                counts.append(labels.count(label))
            output[i] = labels[np.argmax(counts)]  # majority vote among the k nearest labels
        return output

    def score(self, x, y):
        pred = self.predict(x)
        err = 0.0
        for i in range(x.shape[0]):
            if pred[i] != y[i]:
                err = err+1
        return 1-float(err/x.shape[0])
if __name__ == '__main__':
    from sklearn import datasets
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    # x = np.array([[0.5,0.4],[0.1,0.2],[0.7,0.8],[0.2,0.1],[0.4,0.6],[0.9,0.9],[1,1]]).reshape(-1,2)
    # y = np.array([0,1,0,1,0,1,1]).reshape(-1,1)
    clf = KNNClassfier(k=3)
    clf.fit(x, y)
    print('myknn score:', clf.score(x, y))
    from sklearn.neighbors import KNeighborsClassifier
    clf_sklearn = KNeighborsClassifier(n_neighbors=3)
    clf_sklearn.fit(x, y)
    print('sklearn score:', clf_sklearn.score(x, y))
# 0.96
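The scores above are computed on the same data the models were fitted on, i.e. training accuracy. For a fairer comparison, the hand-written classifier can also be evaluated on a held-out split; a sketch, reloading the data and reusing the class defined above:

```python
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
x_tr, x_te, y_tr, y_te = train_test_split(iris.data, iris.target,
                                          train_size=0.8, random_state=1)
clf = KNNClassfier(k=3)
clf.fit(x_tr, y_tr)
print('myknn held-out score:', clf.score(x_te, y_te))
```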