import pygal
#加载数据集
def dataload(path=r'D:\python_data\K-meansdata.txt'):
    """Load the sample table from a whitespace-separated text file.

    The first line of the file is read and discarded (presumably a header
    row). Every following non-blank line is split on whitespace and each
    field is converted with ``float``.

    Args:
        path: Location of the data file. Defaults to the path that was
            previously hard-coded, so existing ``dataload()`` calls behave
            as before.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataset = []
    # The context manager guarantees the handle is closed, even when a
    # field fails to parse.
    with open(path, 'r') as data_file:
        data_file.readline()  # discard the first line
        for line in data_file:
            fields = line.split()
            if not fields:
                # A blank line would otherwise become an empty row and
                # break later indexing of columns 1-3.
                continue
            dataset.append([float(field) for field in fields])
    return dataset
#根据数据集给出中心点
def GiveK(dataset):
    """Choose three initial centre points for the data set.

    Only columns 1, 2 and 3 of each row are used; column 0 is ignored here.

    * ``k1`` is the mean of all samples.
    * The samples are then split by comparing each sample's feature sum
      with the feature sum of ``k1``: ``k2`` is the mean of the samples at
      or below that value, ``k3`` the mean of the samples above it.

    If one side of the split is empty (for example when every sample has
    the same feature sum), a copy of ``k1`` is used for that side instead
    of dividing by zero.

    Args:
        dataset: Non-empty sequence of rows indexable at positions 1-3.

    Returns:
        A tuple ``(k1, k2, k3)`` of three-element lists.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if not dataset:
        raise ValueError('GiveK requires a non-empty dataset')

    # First centre: mean of every sample.
    k1 = _givek_feature_means(dataset)
    threshold = k1[0] + k1[1] + k1[2]

    # Split the samples around the overall mean's feature sum.
    lower = []
    upper = []
    for item in dataset:
        if item[1] + item[2] + item[3] <= threshold:
            lower.append(item)
        else:
            upper.append(item)

    # Second / third centre: mean of each side, falling back to the overall
    # mean when a side received no samples.
    k2 = _givek_feature_means(lower) if lower else list(k1)
    k3 = _givek_feature_means(upper) if upper else list(k1)
    return k1, k2, k3


def _givek_feature_means(rows):
    """Return the per-column mean of columns 1-3 over a non-empty ``rows``."""
    total1 = 0
    total2 = 0
    total3 = 0
    # Plain left-to-right accumulation keeps the floating-point results
    # identical to a simple running sum.
    for row in rows:
        total1 += row[1]
        total2 += row[2]
        total3 += row[3]
    count = len(rows)
    return [total1 / count, total2 / count, total3 / count]
#计算距离
def Cal(dataset, k1, k2, k3):
    """Assign every sample to its nearest centre and recompute the centres.

    Distance is the squared Euclidean distance over columns 1-3 of each
    row. On a tie the earlier centre wins (``k1`` before ``k2`` before
    ``k3``).

    A cluster that receives no samples keeps its previous centre (returned
    as a new list) instead of raising ``ZeroDivisionError``.

    Args:
        dataset: Sequence of rows indexable at positions 1-3.
        k1, k2, k3: Current centres, each indexable at positions 0-2.

    Returns:
        A tuple ``(m, n, q, clsa, clsb, clsc)``: the three new centres
        followed by the lists of rows assigned to each centre.
    """
    clsa = []
    clsb = []
    clsc = []
    for item in dataset:
        a = pow(item[1] - k1[0], 2) + pow(item[2] - k1[1], 2) + pow(item[3] - k1[2], 2)
        b = pow(item[1] - k2[0], 2) + pow(item[2] - k2[1], 2) + pow(item[3] - k2[2], 2)
        c = pow(item[1] - k3[0], 2) + pow(item[2] - k3[1], 2) + pow(item[3] - k3[2], 2)
        # The explicit chain is kept on purpose: a sample whose distances
        # are NaN satisfies none of the comparisons and stays unassigned.
        if a <= b and a <= c:
            clsa.append(item)
        elif b <= a and b <= c:
            clsb.append(item)
        elif c <= a and c <= b:
            clsc.append(item)

    m = _cal_centroid(clsa, k1)
    n = _cal_centroid(clsb, k2)
    q = _cal_centroid(clsc, k3)
    # Return the three new centres together with the three clusters.
    return m, n, q, clsa, clsb, clsc


def _cal_centroid(members, previous):
    """Return the mean of columns 1-3 of ``members``, or ``previous`` if empty."""
    if not members:
        return list(previous)
    total1 = 0
    total2 = 0
    total3 = 0
    for row in members:
        total1 += row[1]
        total2 += row[2]
        total3 += row[3]
    count = len(members)
    return [total1 / count, total2 / count, total3 / count]
def K_means(dataset):
    """Run three-cluster k-means and return a textual trace of the run.

    Initial centres come from ``GiveK``; ``Cal`` is then applied repeatedly
    until all three centres are unchanged from the previous pass. Comparing
    only the first centre could end the loop while the other two clusters
    were still moving.

    Args:
        dataset: Rows whose column 0 is a sample number (converted with
            ``int``) and whose columns 1-3 are consumed by ``GiveK`` and
            ``Cal``.

    Returns:
        A list of strings: for every intermediate pass the centres and the
        sample numbers per cluster, followed by the final centres and the
        final classification. The original comment says the result is
        shown in a GUI.
    """
    strlist = []
    i = 1
    # Initial three centres.
    k1, k2, k3 = GiveK(dataset)
    # First reassignment: new centres and the classification result.
    a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)
    # Repeat until a pass leaves every centre unchanged.
    while (k1, k2, k3) != (a, b, c):
        strlist.append('第%d次均值点\n%s\n%s\n%s\n' % (i, a, b, c))
        k1, k2, k3 = a, b, c
        strlist.append(
            '第%d次分类结果:\n%s\n%s\n%s\n'
            % (i, _kmeans_sample_ids(clsa), _kmeans_sample_ids(clsb), _kmeans_sample_ids(clsc))
        )
        i += 1
        a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)

    strlist.append('最终均值点\n%s\n%s\n%s\n' % (a, b, c))
    strlist.append(
        '分类结果:\n第一类:%s\n第二类:%s\n第三类:%s\n'
        % (_kmeans_sample_ids(clsa), _kmeans_sample_ids(clsb), _kmeans_sample_ids(clsc))
    )
    return strlist


def _kmeans_sample_ids(cluster):
    """Return the integer sample number (column 0) of every row in ``cluster``."""
    return [int(item[0]) for item in cluster]
#分类展示图表
def showdot():
    """Cluster the loaded data set and show it as a scatter chart.

    Loads the data with ``dataload``, seeds the centres with ``GiveK`` and
    applies ``Cal`` until the three centres stop changing, so the chart
    shows the converged clustering rather than the assignment after a
    single pass. Columns 1 and 2 of each sample are plotted as (x, y), one
    series per cluster, and the chart is opened via ``render_in_browser``.
    """
    dataset = dataload()
    k1, k2, k3 = GiveK(dataset)
    a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)
    # Iterate to convergence before plotting.
    while (k1, k2, k3) != (a, b, c):
        k1, k2, k3 = a, b, c
        a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)

    # One (x, y) tuple per sample, grouped by cluster.
    lista = [(item[1], item[2]) for item in clsa]
    listb = [(item[1], item[2]) for item in clsb]
    listc = [(item[1], item[2]) for item in clsc]

    dotchart = pygal.XY(stroke=False)
    dotchart.title = 'DotShow'
    dotchart.add('A', lista)
    dotchart.add('B', listb)
    dotchart.add('C', listc)
    dotchart.render_in_browser()
#K_means(dataload())
#dataload()
#GiveK(dataload())
#加载数据集
def dataload(path=r'D:\python_data\K-meansdata.txt'):
    """Load the sample table from a whitespace-separated text file.

    The first line of the file is read and discarded (presumably a header
    row). Every following non-blank line is split on whitespace and each
    field is converted with ``float``.

    Args:
        path: Location of the data file. Defaults to the path that was
            previously hard-coded, so existing ``dataload()`` calls behave
            as before.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataset = []
    # The context manager guarantees the handle is closed, even when a
    # field fails to parse.
    with open(path, 'r') as data_file:
        data_file.readline()  # discard the first line
        for line in data_file:
            fields = line.split()
            if not fields:
                # A blank line would otherwise become an empty row and
                # break later indexing of columns 1-3.
                continue
            dataset.append([float(field) for field in fields])
    return dataset
#根据数据集给出中心点
def GiveK(dataset):
    """Choose three initial centre points for the data set.

    Only columns 1, 2 and 3 of each row are used; column 0 is ignored here.

    * ``k1`` is the mean of all samples.
    * The samples are then split by comparing each sample's feature sum
      with the feature sum of ``k1``: ``k2`` is the mean of the samples at
      or below that value, ``k3`` the mean of the samples above it.

    If one side of the split is empty (for example when every sample has
    the same feature sum), a copy of ``k1`` is used for that side instead
    of dividing by zero.

    Args:
        dataset: Non-empty sequence of rows indexable at positions 1-3.

    Returns:
        A tuple ``(k1, k2, k3)`` of three-element lists.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if not dataset:
        raise ValueError('GiveK requires a non-empty dataset')

    # First centre: mean of every sample.
    k1 = _givek_feature_means(dataset)
    threshold = k1[0] + k1[1] + k1[2]

    # Split the samples around the overall mean's feature sum.
    lower = []
    upper = []
    for item in dataset:
        if item[1] + item[2] + item[3] <= threshold:
            lower.append(item)
        else:
            upper.append(item)

    # Second / third centre: mean of each side, falling back to the overall
    # mean when a side received no samples.
    k2 = _givek_feature_means(lower) if lower else list(k1)
    k3 = _givek_feature_means(upper) if upper else list(k1)
    return k1, k2, k3


def _givek_feature_means(rows):
    """Return the per-column mean of columns 1-3 over a non-empty ``rows``."""
    total1 = 0
    total2 = 0
    total3 = 0
    # Plain left-to-right accumulation keeps the floating-point results
    # identical to a simple running sum.
    for row in rows:
        total1 += row[1]
        total2 += row[2]
        total3 += row[3]
    count = len(rows)
    return [total1 / count, total2 / count, total3 / count]
#计算距离
def Cal(dataset, k1, k2, k3):
    """Assign every sample to its nearest centre and recompute the centres.

    Distance is the squared Euclidean distance over columns 1-3 of each
    row. On a tie the earlier centre wins (``k1`` before ``k2`` before
    ``k3``).

    A cluster that receives no samples keeps its previous centre (returned
    as a new list) instead of raising ``ZeroDivisionError``.

    Args:
        dataset: Sequence of rows indexable at positions 1-3.
        k1, k2, k3: Current centres, each indexable at positions 0-2.

    Returns:
        A tuple ``(m, n, q, clsa, clsb, clsc)``: the three new centres
        followed by the lists of rows assigned to each centre.
    """
    clsa = []
    clsb = []
    clsc = []
    for item in dataset:
        a = pow(item[1] - k1[0], 2) + pow(item[2] - k1[1], 2) + pow(item[3] - k1[2], 2)
        b = pow(item[1] - k2[0], 2) + pow(item[2] - k2[1], 2) + pow(item[3] - k2[2], 2)
        c = pow(item[1] - k3[0], 2) + pow(item[2] - k3[1], 2) + pow(item[3] - k3[2], 2)
        # The explicit chain is kept on purpose: a sample whose distances
        # are NaN satisfies none of the comparisons and stays unassigned.
        if a <= b and a <= c:
            clsa.append(item)
        elif b <= a and b <= c:
            clsb.append(item)
        elif c <= a and c <= b:
            clsc.append(item)

    m = _cal_centroid(clsa, k1)
    n = _cal_centroid(clsb, k2)
    q = _cal_centroid(clsc, k3)
    # Return the three new centres together with the three clusters.
    return m, n, q, clsa, clsb, clsc


def _cal_centroid(members, previous):
    """Return the mean of columns 1-3 of ``members``, or ``previous`` if empty."""
    if not members:
        return list(previous)
    total1 = 0
    total2 = 0
    total3 = 0
    for row in members:
        total1 += row[1]
        total2 += row[2]
        total3 += row[3]
    count = len(members)
    return [total1 / count, total2 / count, total3 / count]
def K_means(dataset):
    """Run three-cluster k-means and return a textual trace of the run.

    Initial centres come from ``GiveK``; ``Cal`` is then applied repeatedly
    until all three centres are unchanged from the previous pass. Comparing
    only the first centre could end the loop while the other two clusters
    were still moving.

    Args:
        dataset: Rows whose column 0 is a sample number (converted with
            ``int``) and whose columns 1-3 are consumed by ``GiveK`` and
            ``Cal``.

    Returns:
        A list of strings: for every intermediate pass the centres and the
        sample numbers per cluster, followed by the final centres and the
        final classification. The original comment says the result is
        shown in a GUI.
    """
    strlist = []
    i = 1
    # Initial three centres.
    k1, k2, k3 = GiveK(dataset)
    # First reassignment: new centres and the classification result.
    a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)
    # Repeat until a pass leaves every centre unchanged.
    while (k1, k2, k3) != (a, b, c):
        strlist.append('第%d次均值点\n%s\n%s\n%s\n' % (i, a, b, c))
        k1, k2, k3 = a, b, c
        strlist.append(
            '第%d次分类结果:\n%s\n%s\n%s\n'
            % (i, _kmeans_sample_ids(clsa), _kmeans_sample_ids(clsb), _kmeans_sample_ids(clsc))
        )
        i += 1
        a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)

    strlist.append('最终均值点\n%s\n%s\n%s\n' % (a, b, c))
    strlist.append(
        '分类结果:\n第一类:%s\n第二类:%s\n第三类:%s\n'
        % (_kmeans_sample_ids(clsa), _kmeans_sample_ids(clsb), _kmeans_sample_ids(clsc))
    )
    return strlist


def _kmeans_sample_ids(cluster):
    """Return the integer sample number (column 0) of every row in ``cluster``."""
    return [int(item[0]) for item in cluster]
#分类展示图表
def showdot():
    """Cluster the loaded data set and show it as a scatter chart.

    Loads the data with ``dataload``, seeds the centres with ``GiveK`` and
    applies ``Cal`` until the three centres stop changing, so the chart
    shows the converged clustering rather than the assignment after a
    single pass. Columns 1 and 2 of each sample are plotted as (x, y), one
    series per cluster, and the chart is opened via ``render_in_browser``.
    """
    dataset = dataload()
    k1, k2, k3 = GiveK(dataset)
    a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)
    # Iterate to convergence before plotting.
    while (k1, k2, k3) != (a, b, c):
        k1, k2, k3 = a, b, c
        a, b, c, clsa, clsb, clsc = Cal(dataset, k1, k2, k3)

    # One (x, y) tuple per sample, grouped by cluster.
    lista = [(item[1], item[2]) for item in clsa]
    listb = [(item[1], item[2]) for item in clsb]
    listc = [(item[1], item[2]) for item in clsc]

    dotchart = pygal.XY(stroke=False)
    dotchart.title = 'DotShow'
    dotchart.add('A', lista)
    dotchart.add('B', listb)
    dotchart.add('C', listc)
    dotchart.render_in_browser()
#K_means(dataload())
#dataload()
#GiveK(dataload())