总目录:Python数据分析整理
本文数据以及大部分代码来自《机器学习实战》数据集
https://www.manning.com/downloads/1108
\machinelearninginaction\Ch02\datingTestSet.txt
\machinelearninginaction\Ch02\datingTestSet2.txt
KNN.py
from numpy import *
import operator
import pandas as pd
# k-nearest-neighbor classifier.
# inX: the sample to classify (a 1-D feature vector)
# dataSet: training feature matrix, one row per sample
# labels: class label for each training row
# k: how many nearest neighbors take part in the vote
# Returns the majority class among the k nearest neighbors.
def classify0(inX, dataSet, labels, k):
    # Euclidean distance from inX to every training row.
    deltas = tile(inX, (dataSet.shape[0], 1)) - dataSet
    dists = ((deltas ** 2).sum(axis=1)) ** 0.5
    # Indices of the k closest training samples.
    nearest = dists.argsort()[:k]
    # Tally one vote per neighbor.
    votes = {}
    for idx in nearest:
        lbl = labels[idx]
        votes[lbl] = votes.get(lbl, 0) + 1
    # Highest vote count wins; ties keep first-encountered order (stable sort).
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
# Min-max normalization.
# dataSet: numeric 2-D data (pandas DataFrame, as used in this file).
# Returns (normDataSet, info): normDataSet has every column rescaled to
# [0, 1]; info is a DataFrame holding each column's 'min', 'ranges' and
# 'max' so a caller can undo or reapply the scaling.
def autoNorm(dataSet):
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    # Guard constant columns: a zero range would divide by zero and fill
    # the whole column with NaN/inf; dividing by 1 maps it to 0 instead.
    safeRanges = where(ranges == 0, 1, ranges)
    normDataSet = (dataSet - tile(minVals, (m, 1))) / tile(safeRanges, (m, 1))
    # Build the summary frame in one shot (keeps the column order
    # min / ranges / max that callers print).
    info = pd.DataFrame({'min': minVals, 'ranges': ranges, 'max': maxVals})
    return normDataSet, info
# Split the raw train/test frames into feature part (columns 0-2) and
# label part (column 3); returns (train_x, train_y, test_x, test_y).
def split_data(train, test):
    feature_cols = [0, 1, 2]
    label_cols = [3]
    return (train[feature_cols], train[label_cols],
            test[feature_cols], test[label_cols])
数据分析
# Visual exploration of the dating data set: scatter-plot each pair of
# the three feature columns, colored by the class label in column 3.
from k近邻算法 import KNN
import pandas as pd
import matplotlib.pyplot as plt
# Use the SimHei font so matplotlib can render the Chinese plot labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Tab-separated file: three numeric feature columns plus a numeric label.
data_file = pd.read_csv('datingTestSet2.txt', sep='\t', header=None)
# Label encoding in column 3:
# didntLike  (disliked)           = 1
# smallDoses (somewhat charming)  = 2
# largeDoses (very charming)      = 3
data1 = data_file[data_file[3]==1]
data2 = data_file[data_file[3]==2]
data3 = data_file[data_file[3]==3]
# Columns 0 vs 1, one color per class.
plt.scatter(data1[0], data1[1], c='r', alpha=0.4, label='不喜欢')
plt.scatter(data2[0], data2[1], c='b', alpha=0.4, label='魅力一般')
plt.scatter(data3[0], data3[1], c='g', alpha=0.4, label='极具魅力')
plt.legend(loc=2)
plt.title('12列分类情况')
plt.savefig('12数据分布情况')
plt.show()
# Columns 0 vs 2.
plt.scatter(data1[0], data1[2], c='r', alpha=0.4, label='不喜欢')
plt.scatter(data2[0], data2[2], c='b', alpha=0.4, label='魅力一般')
plt.scatter(data3[0], data3[2], c='g', alpha=0.4, label='极具魅力')
plt.legend(loc=2)
plt.title('13列分类情况')
plt.savefig('13数据分布情况')
plt.show()
# Columns 1 vs 2.
plt.scatter(data1[1], data1[2], c='r', alpha=0.4, label='不喜欢')
plt.scatter(data2[1], data2[2], c='b', alpha=0.4, label='魅力一般')
plt.scatter(data3[1], data3[2], c='g', alpha=0.4, label='极具魅力')
plt.legend(loc=2)
plt.title('23列分类情况')
plt.savefig('23数据分布情况')
plt.show()
K近邻分类
# kNN classification: train on datingTestSet2.txt, evaluate on
# datingTestSet.txt.
from k近邻算法 import KNN
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import *
plt.rcParams['font.sans-serif'] = ['SimHei']
train_file = pd.read_csv('datingTestSet2.txt', sep='\t', header=None)
test_file = pd.read_csv('datingTestSet.txt', sep='\t', header=None)
# Missing-value check: per-column count of null cells.
print(train_file[train_file.isnull() == True].count())
print(test_file[test_file.isnull() == True].count())
# Inspect column dtypes.
print(train_file.info())
print(test_file.info())
# Map the test set's text labels onto the numeric codes used in training:
# didntLike (disliked) = 1
# smallDoses (somewhat charming) = 2
# largeDoses (very charming) = 3
test_file.replace({'didntLike':1,'smallDoses':2,'largeDoses':3}, inplace=True)
# Min-max normalization of both frames.
# NOTE(review): autoNorm is applied to the WHOLE frame, so the label
# column (3) is rescaled too (1/2/3 -> 0/0.5/1). The equality check in
# test_knn still works because train and test labels are rescaled the
# same way, but confirm this is intended rather than normalizing only
# the feature columns.
new_train_file, train_info = KNN.autoNorm(train_file)
new_test_file, test_info = KNN.autoNorm(test_file)
print(train_info)
print(test_info)
# Split both frames into feature columns (0-2) and label column (3).
train_x,train_y,test_x,test_y = KNN.split_data(new_train_file, new_test_file)
print(train_x)
print(train_y)
print(test_x)
print(test_y)
# Number of test samples, used by test_knn below.
lines, columns = new_test_file.shape
# Classify every test row with the module-level train/test frames and
# the given k; returns the fraction of misclassified rows (error rate).
def test_knn(k):
    mistakes = 0
    for row in range(lines):
        predicted = KNN.classify0(test_x.iloc[row].values,
                                  train_x.values, train_y.T.values[0], k)
        actual = test_y.iloc[row].values[0]
        if actual != predicted:
            mistakes += 1
    return mistakes / lines
# Sweep k from 1 to 5 and report the error rate for each value.
for i in range(1,6):
    rate = test_knn(i)
    print('k取值:', i, '错误率:', rate)
输出结果:
k取值: 1 错误率: 0.0
k取值: 2 错误率: 0.0
k取值: 3 错误率: 0.027
k取值: 4 错误率: 0.02
k取值: 5 错误率: 0.036