python算法应用（三）——分级聚类

最新推荐文章于 2022-12-28 15:32:57 发布

_朱坚强

最新推荐文章于 2022-12-28 15:32:57 发布

阅读量1.7k

点赞数

分类专栏：集体智慧编程（python实现）

本文链接：https://blog.csdn.net/weixin_44586750/article/details/99972213

版权

集体智慧编程（python实现）专栏收录该内容

8 篇文章 3 订阅

订阅专栏

分级聚类通过连续不断地将最为相似的群组两两合并，来构造出一个群组的层级结构。在每次迭代的过程中，分级聚类算法会计算每两个群组间的距离，并将距离最近的两个群组合并成一个新的群组，这一过程一直重复下去，直到只剩一个群组为止。

（一）读取数据

这里的数据存在txt中

# Parse a tab-separated data file into row names, column names and values
def readfile(filename):
    """Read a tab-separated matrix file.

    The first line is a header: its first cell is ignored and the remaining
    cells become the column names. Every following non-blank line holds a
    row name in its first cell followed by numeric values.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(rownames, colnames, data)`` where ``data`` is a list of
        lists of floats, one inner list per data line.

    Raises:
        IndexError: If the file contains no lines at all.
        ValueError: If a data cell cannot be converted to ``float``.
    """
    # The context manager guarantees the file handle is closed, even on error.
    with open(filename) as handle:
        lines = list(handle)

    # The first line is the header; drop its first (corner) cell.
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        # Skip blank lines (e.g. a trailing newline) instead of recording
        # an empty-named row with no data.
        if not line.strip():
            continue
        p = line.strip().split('\t')
        # The first column of each line is the row name.
        rownames.append(p[0])
        # The remaining columns are that row's numeric values.
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data

（二）分级聚类

算法描述：
1.遍历求出相关度（皮尔逊相关度）最高的两组向量
2.然后将两组向量合并（平均），一直到只剩一组向量
注：
1.这里引入了类，类的属性包含左右节点（即生成该类的两个子类）
2.类的属性中有id号，大于等于0的对应的是初始类（原始数据行），小于0的对应的是合并生成的聚类

# Pearson-correlation-based distance between two vectors
# Both arguments are lists of numbers
def person(v1, v2):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of v1 and v2.

    Strongly correlated vectors therefore get a small distance. The vectors
    are expected to have the same length; only the first ``len(v1)`` items
    of ``v2`` enter the cross product.

    Args:
        v1: List of numbers.
        v2: List of numbers.

    Returns:
        ``1.0 - r`` as a float, or ``0`` in the degenerate cases (empty
        input, or a vector with no variance) where ``r`` is undefined.
    """
    # Imported locally so the function does not depend on a module-level
    # ``sqrt`` being in scope.
    from math import sqrt

    n = len(v1)
    if n == 0:
        # No data: same convention as the zero-variance case below.
        return 0

    # Plain sums
    sum1 = sum(v1)
    sum2 = sum(v2)

    # Sums of squares
    sum1Sq = sum(v ** 2 for v in v1)
    sum2Sq = sum(v ** 2 for v in v2)

    # Sum of the element-wise products
    pSum = sum(a * v2[i] for i, a in enumerate(v1))

    # Numerator and (squared) denominator of r
    num = pSum - (sum1 * sum2 / n)
    var_product = (sum1Sq - sum1 ** 2 / n) * (sum2Sq - sum2 ** 2 / n)
    # <= 0 rather than == 0: rounding can push a zero variance slightly
    # negative, which would make sqrt() raise a ValueError.
    if var_product <= 0:
        return 0

    # Invert so that a higher similarity means a smaller distance.
    return 1.0 - num / sqrt(var_product)

# One node of the cluster hierarchy
class bicluster:
    """Node of the binary tree produced by hierarchical clustering.

    Attributes:
        vec: The vector stored for this node.
        left: Left child node, or None.
        right: Right child node, or None.
        distance: Distance value stored for this node (defaults to 0.0).
        id_number: Identifier of the node.
    """

    def __init__(self, vec, left=None, right=None, distance=0.0, id_number=None):
        # Payload of the node
        self.vec = vec
        self.id_number = id_number
        self.distance = distance
        # Links to the two children
        self.left, self.right = left, right

# Hierarchical clustering (keeps merging until a single cluster remains)
def hcluster(rows, distance=person):
    """Build a cluster tree by repeatedly merging the two closest clusters.

    Every row starts as its own cluster with ``id_number`` equal to its
    index. On each pass the pair with the smallest distance is replaced by
    a new cluster whose vector is the element-wise average of the pair and
    whose ``id_number`` is negative (-1, -2, ...).

    Args:
        rows: Sequence of equal-length numeric vectors.
        distance: Function taking two vectors and returning their distance.

    Returns:
        The root ``bicluster`` of the merge tree.

    Raises:
        IndexError: If ``rows`` is empty.
    """
    # Cache of already computed distances, keyed by the pair of cluster ids.
    distances = {}
    currentclustid = -1

    # Initially every row is its own cluster.
    clust = [bicluster(row, id_number=i) for i, row in enumerate(rows)]

    while len(clust) > 1:
        lowstpair = None
        closest = None

        # Scan every pair for the smallest distance. The first pair visited
        # is (0, 1); it seeds the minimum through the cache instead of being
        # recomputed on every pass.
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id_number, clust[j].id_number)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)

                d = distances[key]
                if lowstpair is None or d < closest:
                    closest = d
                    lowstpair = (i, j)

        left = clust[lowstpair[0]]
        right = clust[lowstpair[1]]

        # The merged vector is the element-wise average of the two clusters.
        mergevec = [(a + b) / 2.0 for a, b in zip(left.vec, right.vec)]

        # Create the new cluster.
        newcluster = bicluster(mergevec, left=left, right=right,
                               distance=closest, id_number=currentclustid)

        # Clusters that were not in the original set get negative ids.
        currentclustid -= 1
        # Delete the higher index first so the lower index stays valid.
        del clust[lowstpair[1]]
        del clust[lowstpair[0]]
        clust.append(newcluster)
    return clust[0]

（三）画出树状图

注：
1.首先要确定最大高度以及最大宽度
2.每个枝节点存在左右两个子枝节点，一直递归直到不存在为止
3.作垂直线的位置没搞明白。。。

def getheight(clust):
    """Return the height of ``clust``, measured in leaves.

    A node without children counts as 1; any other node counts as the sum
    of the heights of its two branches.
    """
    # Identity test: comparison with the None singleton should use `is`.
    if clust.left is None and clust.right is None:
        return 1

    # Otherwise the height is the sum of the heights of both branches.
    return getheight(clust.left) + getheight(clust.right)

def getdepth(clust):
    """Return the total depth of ``clust``.

    A node without children has depth 0. Any other node has the larger of
    its two branch depths plus its own ``distance``.
    """
    # Identity test: comparison with the None singleton should use `is`.
    if clust.left is None and clust.right is None:
        return 0
    # Deeper branch plus this node's own distance.
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance

def drawnode(draw, clust, x, y, scaling, labels):
    """Draw ``clust`` and, recursively, everything below it.

    A node whose ``id_number`` is not negative is drawn as a text label
    taken from ``labels``. A node with a negative ``id_number`` is drawn as
    a bracket: one vertical line joining the vertical centres of its two
    children, one horizontal line to each child, then the children
    themselves.

    Args:
        draw: Drawing object providing ``line`` and ``text``.
        clust: Node to draw.
        x: Horizontal position of the node.
        y: Vertical centre of the node.
        scaling: Factor converting a node distance into a line length.
        labels: Labels indexed by ``id_number``.
    """
    if not clust.id_number < 0:
        # Leaf: just write its label next to the end of the branch.
        draw.text((x + 5, y - 7), labels[clust.id_number], (0, 0, 0))
        return

    red = (255, 0, 0)

    # Each leaf takes 20 units of vertical space.
    left_h = getheight(clust.left) * 20
    right_h = getheight(clust.right) * 20

    # Vertical band of this node, centred on y.
    half_span = (left_h + right_h) / 2
    top = y - half_span
    bottom = y + half_span

    # Centres of the upper (left child) and lower (right child) sub-bands.
    left_y = top + left_h / 2
    right_y = bottom - right_h / 2

    # Horizontal position of the children: line length follows the distance.
    child_x = x + clust.distance * scaling

    # Vertical line from this node to the centres of its children
    draw.line((x, left_y, x, right_y), fill=red)

    # Horizontal line to the left child
    draw.line((x, left_y, child_x, left_y), fill=red)

    # Horizontal line to the right child
    draw.line((x, right_y, child_x, right_y), fill=red)

    # Recurse into both children
    drawnode(draw, clust.left, child_x, left_y, scaling, labels)
    drawnode(draw, clust.right, child_x, right_y, scaling, labels)

# Render the dendrogram to an image file
def drawdendrogram(clust, labels, jpeg='clusters.jpg'):
    """Draw the cluster tree rooted at ``clust`` and save it as a JPEG.

    The image is 1200 units wide and 20 units high per leaf. Distances are
    scaled so that the deepest branch fits the width minus a 150-unit margin.

    NOTE(review): ``Image`` and ``ImageDraw`` must be in module scope; they
    are presumably PIL/Pillow, but the import is not visible in this part of
    the file - confirm.

    Args:
        clust: Root node of the tree.
        labels: Labels for the leaves, indexed by ``id_number``.
        jpeg: Output file name.
    """
    # Height and width of the image
    h = getheight(clust) * 20
    w = 1200
    # Total depth of the tree
    depth = getdepth(clust)

    # The width is fixed, so distances are scaled to fit. A depth of 0
    # (a single leaf, or every merge at distance 0) has nothing to scale
    # and would otherwise raise ZeroDivisionError.
    scaling = float(w - 150) / depth if depth else 0.0

    # Create a new image with a white background
    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Short stub leading into the root node
    draw.line((0, h / 2, 10, h / 2), fill=(255, 0, 0))

    # Draw the root node, which recursively draws the rest
    drawnode(draw, clust, 10, (h / 2), scaling, labels)
    img.save(jpeg, 'JPEG')

如果不作树状图的话，直接在聚类计算的时候打印可能更方便。。。

（四）K-均值聚类

K-均值聚类中，需要告诉算法希望生成的聚类数量，算法会根据数据的结构状况来确定聚类的大小。
算法描述：
1.找出各个向量中每个元素的最大值与最小值，生成范围
2.利用随机数函数生成n个随机向量（n即为希望生成的聚类数量）
3.遍历各个向量，找出他们各自最接近的那个随机向量
4.对随机向量进行重新定位（均值法）
5.重复步骤3、4，直到各向量的归属不再发生变化，即结果稳定为止

from clusters import readfile,person
import random

# K-means clustering
def kcluster(rows, distance=person, k=4):
    """Partition ``rows`` into ``k`` groups with k-means.

    ``k`` centroids are placed at random inside the per-column value range.
    Each row is then assigned to its nearest centroid and every centroid is
    moved to the average of its members. This repeats until the assignment
    stops changing, or for at most 100 iterations. The iteration number is
    printed on every pass.

    Args:
        rows: Non-empty sequence of equal-length numeric vectors.
        distance: Function taking two vectors and returning their distance.
        k: Number of clusters to produce.

    Returns:
        A list of ``k`` lists; list ``i`` holds the indices into ``rows`` of
        the rows assigned to centroid ``i``.
    """
    ncols = len(rows[0])

    # Minimum and maximum of every column
    ranges = [(min(row[i] for row in rows), max(row[i] for row in rows))
              for i in range(ncols)]

    # Create k random centroids inside those ranges
    clusters = [[random.random() * (high - low) + low for low, high in ranges]
                for _ in range(k)]

    lastmatches = None
    for t in range(100):
        print('Iteration %d' %t)
        bestmatches = [[] for _ in range(k)]

        # Find the nearest centroid for every row
        for j, row in enumerate(rows):
            bestmatch = 0
            # Keep the best distance found so far instead of recomputing it
            # for every candidate centroid.
            bestdist = distance(clusters[0], row)
            for i in range(1, k):
                d = distance(clusters[i], row)
                if d < bestdist:
                    bestmatch = i
                    bestdist = d
            bestmatches[bestmatch].append(j)

        # Stop as soon as the assignment is the same as last time
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches

        # Move every centroid to the average position of its members;
        # a centroid without members stays where it is.
        for i in range(k):
            members = bestmatches[i]
            if members:
                avgs = [0.0] * ncols
                for rowid in members:
                    for m, value in enumerate(rows[rowid]):
                        avgs[m] += value
                clusters[i] = [total / len(members) for total in avgs]

    return bestmatches
    
# Load the blog data set and split it into four k-means clusters
blognames, words, data = readfile('blogdata.txt')
kclust = kcluster(data, k=4)
# Print the blog names that ended up in each of the four clusters
print([[blognames[idx] for idx in members] for members in kclust])

_朱坚强

关注

0
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
python算法应用（三）——分级聚类

分级聚类通过连续不断地将最为相似的两两合并，来构造出一个群组的层级结构。在每次迭代的过程中，分级聚类算法会计算每两个群组间的距离，并将距离最近的两个群组合并成一个新的群组，这一过程一直重复下去，直到只剩一个群组为止。（一）读取数据这里的数据存在txt中#处理文件数据分为单词、书名、数据def readfile(filename): lines=[line for line in ...
复制链接

扫一扫

专栏目录