一.项目动机
本项目是人工智能挑战性课程的第二次项目,任务是对数据进行聚类。聚类同时也是之后学习各种无监督聚类方法乃至无监督机器学习的理论基础。
二.项目代码
import numpy
import matplotlib.pyplot as plt
import random
from sklearn.datasets import make_blobs
import codecs
from sklearn.cluster import KMeans
from collections import defaultdict
from itertools import chain
import math
import matplotlib.pyplot as plt
def get_data(filepath):
    """Load a comma- or whitespace-separated data file of labelled samples.

    Every non-blank line must contain four numeric feature fields followed
    by a class label, e.g. ``5.1,3.5,1.4,0.2,Iris-setosa``. Blank lines
    (including a trailing empty line at the end of the file) are skipped.

    Args:
        filepath: Path of the UTF-8 encoded data file.

    Returns:
        A tuple ``(dataset, nodedata, number, a, b, c)``:

        * ``dataset`` -- list of ``[f1, f2, f3, f4, label]`` rows.
        * ``nodedata`` -- ``defaultdict(list)`` mapping the 1-based row
          number to the very same row list stored in ``dataset``.
        * ``number`` -- number of rows read.
        * ``a``, ``b``, ``c`` -- how many rows carry the label
          ``Iris-setosa``, ``Iris-versicolor`` and ``Iris-virginica``.

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
        ValueError: If one of the first four fields is not a number.
    """
    label_counts = {
        'Iris-setosa': 0,
        'Iris-versicolor': 0,
        'Iris-virginica': 0,
    }
    dataset = []
    nodedata = defaultdict(list)
    number = 0
    # The context manager guarantees the handle is closed even on errors.
    with open(filepath, "r", encoding="utf-8") as handle:
        for line in handle:
            strlist = line.replace(',', ' ').split()
            if not strlist:
                # Blank line: nothing to parse.
                continue
            label = strlist[4]
            if label in label_counts:
                label_counts[label] += 1
            numlist = [float(item) for item in strlist[0:4]]
            numlist.append(strlist[-1])
            dataset.append(numlist)
            number += 1
            nodedata[number] = numlist
    return (
        dataset,
        nodedata,
        number,
        label_counts['Iris-setosa'],
        label_counts['Iris-versicolor'],
        label_counts['Iris-virginica'],
    )
def get_every_max_and_deal(dataset, number):
    """Return the maximum of each of the first four columns of ``dataset``.

    The result is used by the caller as a per-feature scale factor.

    Args:
        dataset: Iterable of rows whose first four items are numbers.
        number: Unused; kept so existing callers keep working.

    Returns:
        ``[max_col0, max_col1, max_col2, max_col3]``. For an empty
        ``dataset`` the result is ``[0, 0, 0, 0]``.

    Raises:
        IndexError: If a row has fewer than four items.
    """
    rows = [(row[0], row[1], row[2], row[3]) for row in dataset]
    if not rows:
        return [0, 0, 0, 0]
    # Take the real maximum instead of starting the search at 0, so that
    # columns containing only negative values are reported correctly.
    return [max(column) for column in zip(*rows)]
def get_distance(vec1, vec2):
    """Return the Euclidean distance between two vectors.

    Args:
        vec1: First vector; a list, tuple, ``numpy`` array or matrix.
        vec2: Second vector, with a shape compatible with ``vec1``.

    Returns:
        The distance as a ``numpy`` floating-point scalar.
    """
    # Coerce both operands so that plain Python sequences are accepted too;
    # subtracting two lists directly would raise a TypeError.
    diff = numpy.asarray(vec1, dtype=float) - numpy.asarray(vec2, dtype=float)
    return numpy.sqrt(numpy.sum(numpy.square(diff)))
def get_ma_distance(vec1, vec2):
    """Return pairwise Mahalanobis distances between paired observations.

    ``vec1`` and ``vec2`` are treated as two variables observed ``n``
    times; observation ``i`` is the 2-D point ``(vec1[i], vec2[i])``. The
    2x2 covariance matrix of the two variables defines the metric.

    Args:
        vec1: Sequence of ``n`` numbers (first variable).
        vec2: Sequence of ``n`` numbers (second variable).

    Returns:
        A list of ``n * (n - 1) / 2`` floats: the distance of every
        unordered pair ``(i, j)`` with ``i < j``, ordered by ``i`` and
        then by ``j``.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix is singular.
    """
    X = numpy.vstack([
        numpy.asarray(vec1, dtype=float).ravel(),
        numpy.asarray(vec2, dtype=float).ravel(),
    ])
    XT = X.T
    S = numpy.cov(X)  # rows of X are the variables -> 2x2 covariance
    SI = numpy.linalg.inv(S)
    n = XT.shape[0]
    d1 = []
    for i in range(n):
        # Start at i + 1: a point's distance to itself is always zero and
        # each unordered pair only needs to be reported once.
        for j in range(i + 1, n):
            delta = XT[i] - XT[j]
            # sqrt(delta * S^-1 * delta^T); the inner product must be
            # completed before the square root is taken.
            d = numpy.sqrt(numpy.dot(numpy.dot(delta, SI), delta))
            d1.append(float(d))
    return d1
def initcenter(dataset, k):
    """Initialise k centroids by drawing k distinct samples at random."""
    centroids = random.sample(dataset, k)
    return centroids
def _assign_to_nearest(rows, scaled_points, centers, scale):
    """Group every row under the 1-based index of its nearest center."""
    clusters = defaultdict(list)
    scaled_centers = [numpy.asarray(c, dtype=float) / scale for c in centers]
    for row, point in zip(rows, scaled_points):
        distances = [get_distance(point, center) for center in scaled_centers]
        # argmin returns the first minimum, so ties go to the lowest index.
        clusters[int(numpy.argmin(distances)) + 1].append(row)
    return clusters


def Kmeans(dataset, k, number, nodedata, num):
    """Cluster the samples with K-means, starting from random centers.

    Distances are Euclidean and are measured after dividing every feature
    by the matching entry of ``num``; the centers themselves are kept in
    the original, unscaled feature space.

    Args:
        dataset: List of ``[f1, f2, f3, f4, label]`` rows; the initial
            centers are drawn from it via ``initcenter``.
        k: Number of clusters.
        number: Number of samples; ``nodedata`` is read for keys
            ``1..number``.
        nodedata: Mapping from 1-based sample number to its row.
        num: Four per-feature scale factors used as divisors.

    Returns:
        A ``defaultdict(list)`` mapping the 1-based cluster index to the
        list of rows assigned to it. Clusters that ended up empty have no
        key (looking one up yields an empty list).
    """
    max_iterations = 301  # same upper bound as the former ``time<=300`` loop
    scale = numpy.asarray(num, dtype=float)
    rows = [nodedata[index] for index in range(1, number + 1)]
    # The scaled samples never change, so compute them once up front.
    scaled_points = [numpy.asarray(row[0:4], dtype=float) / scale for row in rows]

    centers = [list(center[0:4]) for center in initcenter(dataset, k)]
    cu_lei = _assign_to_nearest(rows, scaled_points, centers, scale)

    for _ in range(max_iterations):
        new_centers = []
        for m in range(k):
            members = cu_lei.get(m + 1)
            if members:
                features = numpy.asarray([row[0:4] for row in members], dtype=float)
                new_centers.append(features.mean(axis=0).tolist())
            else:
                # An empty cluster has no mean; keep its previous center
                # instead of dividing by zero and producing NaNs.
                new_centers.append(centers[m])
        cu_lei = _assign_to_nearest(rows, scaled_points, new_centers, scale)
        # Unchanged centers mean the assignment just computed is identical
        # to the previous one, so further iterations cannot change it.
        converged = new_centers == centers
        centers = new_centers
        if converged:
            break
    return cu_lei
# Load the samples, derive the per-feature scale factors and run K-means
# with three clusters.
dataset, nodedata, number, one, two, three = get_data('data')
num = get_every_max_and_deal(dataset, number)
a = initcenter(dataset, 3)
ans = Kmeans(dataset, 3, number, nodedata, num)
# For each of the three clusters keep only the class label (column 4) of
# every sample assigned to it.
node_new = [
    [sample[4] for sample in ans[cluster_id]]
    for cluster_id in range(1, 4)
]
def get_new_data_test(node_new):
    """Print every cluster's labels and the share of its majority label.

    For each cluster the label list is printed, followed by a separator,
    the most frequent label and the fraction of the cluster's samples that
    carry it. An empty cluster gets a short notice instead of a ratio.

    Args:
        node_new: List of clusters, each a list of class labels.

    Returns:
        None. All output goes to standard output.
    """
    for line in node_new:
        print(line)
        print("____________________________________")
        biaozhun = len(line)
        if biaozhun == 0:
            # No majority label exists: max() would raise ValueError and
            # the ratio below would divide by zero.
            print("该簇为空,无法计算准确率")
            continue
        # On ties max() keeps the label that appears first in the list.
        maxlabel = max(line, key=line.count)
        find_number = line.count(maxlabel)
        print("分类为%s的准确率为:"%maxlabel)
        print(find_number / biaozhun)
# Print the per-cluster report, then draw one line per cluster from its
# label sequence and display the figure.
get_new_data_test(node_new)
for cluster_labels in (node_new[0], node_new[1], node_new[2]):
    plt.plot(cluster_labels)
plt.show()