概念、原理: 参考文章一
基本的聚类分析算法
1. K均值:
基于原型的、划分的聚类技术,它试图发现用户指定个数(K)的簇。
2. 凝聚的层次聚类:
思想是:开始时,每个点都作为一个单点簇;然后,重复地合并两个最靠近的簇,直到产生单个的、包含所有点的簇。
3. DBSCAN:
一种基于密度的、产生划分聚类的算法,簇的个数由算法自动确定。低密度区域中的点被视为噪声而忽略,因此它不产生完全聚类。
一般数学建模使用K-Means 为主,所以下面主要介绍与K-means相关的所有代码。
图像
可视化聚类结果
import time
import numpy as np
import random
import matplotlib.pyplot as plt
import operator
def func01():
    """Build the 2-D sample points used by the clustering demo.

    Two hard-coded, whitespace-separated columns of numbers are parsed into
    floats.  Every value of the first column is then combined with every
    value of the second column (first column in the outer loop), so the
    result holds ``len(prices) * len(increases)`` points.  The data is fixed,
    not random.

    Returns:
        list[list[float]]: ``[price, increase]`` pairs.
    """
    price_text = "8.60879466 214.2221436 34.31755286 5.913979217 7.681145748 4.019486375 10.90871211 11.70469991 20.05087612 0.064853127 273.350093 3.9550557 11.84476817 3.889739943 8.613913126 8.480019571 6.043974425 8.570848976 12.81918013 5.615367493 257.4091048 20.45940659 18.31377238 11.87079592 11.39314005 16.33021019 12.14806429 11.88364301 7.672279477 16.34202719 14.153227 8.415724972 7.681145748 14.84427145 11.70469991 7.681145748 203.2599058 11.14144184 12.43422417 7.366393142 3.841938626 5.565517548 19.19667605 19.57195838 16.23229793 3.930162208 15.52086972 7.681145748 14.96074993 7.681145748 3.868570788 5.607172981 7.681145747 7.343048335 11.3830478 8.367672595 269.2470718 11.25689469 4.054313049 59.77037809 17.90011733 152.9354127 8.895045192 11.47387804 217.9885334 16.41655679 10.99954235 10.99197317 49.95927129 5.097064639 282.4272835 191.8036898 169.0358292 1.215681537 203.0416609 3.992663098 11.24175632 7.681145748 8.598414461 11.03234216 9.507920341 7.681145748 11.10803402 70.05616308 19.18808516 -124.4069996 10.16290178 3.963490547 47.03815871 65.20500955 11.91195464 19.85413944 3.854230393 201.288385 14.11324229 252.10022 7.663750776 5.554591532 26.64990702 52.99999999"
    increase_text = "8.59915836 212.5016154 34.3234387 6.041890605 7.681145748 4.000425951 16.25347365 11.70469991 20.4796643 0.065083235 272.5459715 3.909862801 13.14991884 3.851513389 8.613972305 8.452592827 6.180914799 8.56082937 12.80176732 4.592429361 258.5398214 20.17314157 12.02624826 11.70469991 11.38255442 16.72626344 12.11399426 11.70469991 7.681145748 15.93591562 13.56496877 8.414105233 7.681145748 14.72315408 11.70469991 7.681145748 203.526058 12.01488725 12.49618346 7.681145748 5.253114892 5.615367493 20.4796643 20.4796643 20.4796643 3.975389419 14.17620289 7.681145748 14.31319047 7.681145748 3.850297776 5.582545948 7.681145746 7.339833488 10.90871211 8.381382717 268.5445777 11.93948755 3.854552421 60.03251003 17.46615064 148.7804291 8.910644828 11.50157642 208.4861314 14.79152417 11.13777332 10.96036317 50.0033217 4.893293518 279.0054234 190.9863042 167.1789847 1.149011072 202.2500523 4.138530912 11.15798461 7.681145748 8.591217613 10.90871211 9.904016462 7.681145748 11.90355638 70.07694559 19.97210214 -124.4300188 10.29039835 3.9669966 47.34180303 227.1245276 13.06061354 20.68224836 3.967604407 201.4954977 16.02890384 260.5969943 7.646654963 5.615367493 27.4540285 52.99999998"

    prices = [float(token) for token in price_text.split()]
    increases = list(map(float, increase_text.split()))

    # NOTE(review): this is a full cross product of the two columns.  The
    # columns look like paired observations, so pairing them element-wise
    # (zip) may have been the intent -- confirm before changing.
    return [[price, increase] for price in prices for increase in increases]
def func02(kjz2w):
    """Plot every cluster in its own colour and save the figure.

    Args:
        kjz2w: List of clusters; each cluster is a sequence of points whose
            items 0 and 1 are the x and y coordinates.

    Nothing is drawn, shown or saved when ``kjz2w`` is empty.  Otherwise the
    plot is shown in interactive mode and written to ``test1.jpg``.
    """
    if not kjz2w:
        return

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for index, cluster in enumerate(kjz2w):
        xs = [point[0] for point in cluster]
        ys = [point[1] for point in cluster]
        # One call per cluster instead of one per point; linestyle='None'
        # keeps the output as unconnected '.' markers.
        plt.plot(xs, ys, color=colors[index % len(colors)],
                 marker='.', linestyle='None')

    # NOTE(review): the figure is never cleared or closed here, so repeated
    # calls presumably keep drawing onto the same current figure -- confirm.
    plt.ion()
    plt.show()
    plt.savefig('test1.jpg')
def func03(kjz1, k):
    """Pick ``k`` seed centres and return the initial grouping of the points.

    The seeds lie on the diagonal of the points' bounding box: the minimum
    corner, the maximum corner (only when ``k > 1``), and ``k - 2`` interior
    points at ``min + j * (max - min) / k`` for ``j = 1 .. k - 2``.  Each
    point is then put into the group of its nearest seed (squared Euclidean
    distance; the first seed wins ties).

    Args:
        kjz1: Sequence of 2-D points ``[x, y]``.
        k: Number of groups to create; must be at least 1.

    Returns:
        list[list]: ``k`` groups, in seed order (min corner, max corner,
        interior seeds).  Groups may be empty.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    low = np.min(kjz1, axis=0).tolist()
    high = np.max(kjz1, axis=0).tolist()
    step_x = (high[0] - low[0]) / k
    step_y = (high[1] - low[1]) / k

    centers = [[low[0], low[1]]]
    if k > 1:
        # With a single group only one seed is needed; adding the max corner
        # as well would yield two groups for k == 1.
        centers.append([high[0], high[1]])
    centers.extend(
        [low[0] + step_x * j, low[1] + step_y * j] for j in range(1, k - 1)
    )

    groups = [[] for _ in centers]
    for point in kjz1:
        distances = [
            (point[0] - center[0]) ** 2 + (point[1] - center[1]) ** 2
            for center in centers
        ]
        # list.index returns the first minimum, i.e. the earliest seed on ties.
        groups[distances.index(min(distances))].append(point)
    return groups
def func05(lb2):
    """Remove every zero-length element from ``lb2`` in place.

    Args:
        lb2: List of sized items (here: clusters).  It is modified in place.

    Returns:
        The same list object, now without empty elements.  An empty input
        list is returned unchanged.
    """
    # Slice assignment keeps the caller's list object while filtering in one
    # pass, and it also copes with an empty ``lb2``.
    lb2[:] = [group for group in lb2 if len(group) > 0]
    return lb2
def func06(kjz2wxy):
    """Run one K-means update: compute group means, then reassign points.

    Args:
        kjz2wxy: List of groups; each group is a sequence of 2-D points.

    Returns:
        tuple: ``(groups, centers)`` where ``centers`` holds the mean of each
        non-empty input group and ``groups`` holds the points reassigned to
        their nearest centre (squared Euclidean distance, first centre wins
        ties) with empty groups dropped.  Because empty groups are dropped,
        ``groups`` can be shorter than ``centers``.
    """
    # Skip empty groups: np.mean of an empty sequence is a NaN scalar, which
    # cannot be indexed as a point below.
    centers = [
        np.mean(group, axis=0).tolist() for group in kjz2wxy if len(group) > 0
    ]

    regrouped = [[] for _ in centers]
    for group in kjz2wxy:
        for point in group:
            distances = [
                (point[0] - center[0]) ** 2 + (point[1] - center[1]) ** 2
                for center in centers
            ]
            regrouped[distances.index(min(distances))].append(point)

    return [group for group in regrouped if group], centers
def func07(kjz2w, fz):
    """Cluster the points into ``fz`` groups and plot the result.

    The points are first grouped by ``func03``; ``func06`` is then applied
    repeatedly until it returns the same centres twice in a row.  The number
    of completed updates is printed and the final groups go to ``func02``.

    Args:
        kjz2w: List of 2-D points.
        fz: Requested number of groups.
    """
    clusters = func03(kjz2w, fz)  # initial grouping of the coordinate list

    previous_centers = None
    rounds = 0
    while True:
        clusters, centers = func06(clusters)
        converged = rounds > 0 and centers == previous_centers
        if converged:
            break
        previous_centers = list(centers)
        rounds += 1

    print('迭代%d次' % (rounds))
    func02(clusters)  # plot
if __name__ == '__main__':
    # Run the full demo ten times and report the elapsed wall-clock time in
    # minutes, truncated to one decimal place.
    began = time.time()
    for _ in range(10):
        func07(func01(), 6)  # split into 6 groups
    elapsed_minutes = int((time.time() - began) / 60 * 10) / 10
    print('Time used:', elapsed_minutes, '分钟')
我们的 结果跑出来图像不是那么美丽,可能是题目数据不太ok。
聚类数目
然后又会遇到一个问题:聚类的数目应该取多少?怎样选取比较好?
参考文章: 知乎聚类数目的确定
可以选择这个图
还不知道怎么建立数据集_(:з」∠)_
代码
from sklearn.cluster import KMeans
from yellowbrick.cluster.elbow import kelbow_visualizer
from yellowbrick.datasets.loaders import load_nfl
# Load the NFL sample dataset through Yellowbrick's dataset loader.
X, y = load_nfl()

# Use the quick method and immediately show the figure: an elbow plot for
# KMeans over the cluster-count range passed as k=(2, 10).
estimator = KMeans(random_state=4)
kelbow_visualizer(estimator, X, k=(2, 10))
或者这个图
在这里插入代码片
记得说明为啥选这个点哦~
聚类具体的分类情况
spssau自动处理: 在线spss