根据皮尔逊相关系数计算所有数据之间的相关性,将相关性最强的两项合并聚类,如此一层层地处理,直至所有数据聚类完毕。
python代码:
读取数据
# 读取数据
def readfile(filename):
    """Read a tab-separated data matrix from *filename*.

    The first line holds column titles (its first cell, the corner
    label, is ignored); every following line starts with a row title
    followed by numeric values.

    Returns:
        (rownames, colnames, data) where data is a list of rows of floats.
    """
    # Context manager guarantees the file handle is closed
    # (the original opened it inside a comprehension and leaked it).
    with open(filename) as f:
        lines = f.readlines()
    # 列标题 — column titles, skipping the corner cell.
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        # 第一列是标题 — first column is the row title.
        rownames.append(p[0])
        # 数据 — remaining columns are numeric data.
        data.append([float(x) for x in p[1:]])
    return rownames, colnames, data
# 皮尔逊相关系数用于找寻相关度最高的博客
def pearson(v1, v2):
    """Pearson-correlation distance between two equal-length vectors.

    Returns 1.0 - r, so perfectly correlated vectors give 0.0 and
    perfectly anti-correlated vectors give 2.0.  Returns 0 when either
    vector has zero variance (degenerate denominator).
    """
    # Local import: the original called a bare `sqrt` with no import
    # in scope, which raises NameError at runtime.
    from math import sqrt

    n = len(v1)
    # 求和 — simple sums.
    sum1 = sum(v1)
    sum2 = sum(v2)
    # 求平方和 — sums of squares.
    sum1_sq = sum(v * v for v in v1)
    sum2_sq = sum(v * v for v in v2)
    # 乘积和 — sum of element-wise products.
    p_sum = sum(a * b for a, b in zip(v1, v2))
    # 皮尔逊系数 — Pearson r, rearranged as numerator / denominator.
    num = p_sum - (sum1 * sum2 / n)
    den = sqrt((sum1_sq - sum1 ** 2 / n) * (sum2_sq - sum2 ** 2 / n))
    if den == 0:
        return 0
    return 1.0 - num / den
# 聚类计算
def hcluster(rows, distance=pearson):
    """Agglomerative (bottom-up) hierarchical clustering.

    Repeatedly merges the two closest clusters under *distance* until a
    single root remains, and returns that root.  Leaf clusters keep
    their row index as id; merged clusters get negative ids.
    """
    # Cache of (id_a, id_b) -> distance so pairs are never recomputed.
    distances = {}
    currentclustid = -1

    # 存储带聚类数据 — start with one leaf cluster per input row.
    clust = [bicluster(rows[i], id=i) for i in range(len(rows))]

    while len(clust) > 1:
        # 存储pearson最相关数据ID — assume the first pair is closest
        # until a smaller distance is found.
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # 遍历寻找最小距离 — scan every pair, consulting the cache.
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                key = (clust[i].id, clust[j].id)
                if key not in distances:
                    distances[key] = distance(clust[i].vec, clust[j].vec)
                d = distances[key]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        left_child = clust[lowestpair[0]]
        right_child = clust[lowestpair[1]]

        # 计算聚类的平均值 — merged vector is the element-wise mean.
        mergevec = [(x + y) / 2.0 for x, y in zip(left_child.vec, right_child.vec)]

        # 建立新的聚类(排除下级最相关pearson)
        newcluster = bicluster(mergevec, left=left_child, right=right_child,
                               distance=closest, id=currentclustid)
        # 不在原始聚类中ID为负数 — negative ids mark synthetic clusters.
        currentclustid -= 1

        # Delete the higher index first so the lower index stays valid.
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    for c in clust:
        print(c.id, c.left, c.right, c.distance, c.vec)
    print('finish')
    return clust[0]