import numpy as np
import pandas as pd
def HammingDistance(dataArr_x, dataArr_y):
    """
    Return the Hamming distance between two samples.

    The distance is the number of positions at which the two inputs differ.
    """
    mismatches = dataArr_x != dataArr_y
    return np.count_nonzero(mismatches)
def countAllDistance(dataArr_All):
    """
    Compute the pairwise Hamming distances between all samples.

    :param dataArr_All: numpy array whose first axis enumerates the samples
    :return: symmetric (n, n) float numpy matrix; entry [i][j] is the number
        of positions at which sample i and sample j differ, and the diagonal
        is 0
    """
    dataNum = dataArr_All.shape[0]
    distanceArr = np.zeros((dataNum, dataNum))
    for i in range(dataNum):
        # Start at i + 1 so that neighbouring samples (i, i + 1) are included;
        # starting one position later would leave their distance at 0.
        for j in range(i + 1, dataNum):
            # Same computation as HammingDistance: count differing positions.
            pairDistance = np.count_nonzero(dataArr_All[i] != dataArr_All[j])
            # The distance is symmetric, so fill both triangles at once.
            distanceArr[i][j] = pairDistance
            distanceArr[j][i] = pairDistance
    return distanceArr
def HammingFarthest(kArr, dataArr_All):
    """
    Pick the sample that is farthest from the current cluster centers.

    Used while initialising the cluster centers. "Farthest" means the largest
    sum of Hamming distances to every center in ``kArr``; on ties the sample
    that appears first in ``dataArr_All`` wins.

    :param kArr: sequence of the current cluster centers
    :param dataArr_All: sequence of candidate samples
    :return: the selected sample; when no sample has a positive total
        distance (e.g. every sample equals the centers) the first sample is
        returned
    :raises ValueError: if ``dataArr_All`` is empty
    """
    if len(dataArr_All) == 0:
        raise ValueError("dataArr_All must contain at least one sample")

    # Fall back to the first sample so a result is always defined, even when
    # every total distance is 0.
    farthest = dataArr_All[0]
    maxDistance = 0
    for data_x in dataArr_All:
        # Sum of Hamming distances from this sample to each center.
        total = sum(np.count_nonzero(center != data_x) for center in kArr)
        if total > maxDistance:
            maxDistance = total
            farthest = data_x
    return farthest
def distance(dataArr_x, dataArr_y, distanceArr):
    """
    Look up the precomputed distance between two samples.

    The first element of each sample is read as that sample's integer
    position in the distance matrix, so nothing is recomputed here.

    :param dataArr_x: sample x (first element is its index)
    :param dataArr_y: sample y (first element is its index)
    :param distanceArr: matrix of pairwise distances
    :return: the matrix entry for the pair (x, y)
    """
    rowIdx = int(dataArr_x[0])
    colIdx = int(dataArr_y[0])
    return distanceArr[rowIdx][colIdx]
def addIndex(dataArr_All):
    """
    Prepend a running row index as the first column of the data array.

    The index makes it easy to map clustered samples back to CSV rows later.
    It adds the same extra mismatch to every pair of distinct samples, so it
    does not change which samples are closer to each other.

    :param dataArr_All: 2-D data array
    :return: new array with the index column (0.0, 1.0, 2.0, ...) in front
    """
    rowCount = dataArr_All.shape[0]
    indexColumn = np.arange(rowCount, dtype=float)[:, np.newaxis]
    return np.concatenate((indexColumn, dataArr_All), axis=1)
def newKarr(dataArr_All, m):
    """
    Initialise the cluster centers and the (empty) cluster lists.

    The first center is a uniformly random sample; each further center is the
    sample returned by ``HammingFarthest`` for the centers chosen so far.

    :param dataArr_All: data array, one sample per row
    :param m: number of clusters to build
    :return: tuple ``(kCenterList, cla_arr)``: the numpy array of cluster
        centers and a list holding one empty list per center
    """
    # np.random.randint(high) draws from [0, high), so pass the full length:
    # every sample (including the last one) is eligible, and a data set with a
    # single sample no longer raises.
    r = np.random.randint(len(dataArr_All))
    kCenterList = np.array([dataArr_All[r]])
    for _ in range(m - 1):
        # Find the sample farthest from the centers picked so far ...
        k = HammingFarthest(kCenterList, dataArr_All)
        # ... and append it as the next center.
        kCenterList = np.concatenate([kCenterList, np.array([k])])
    cla_arr = [[] for _ in range(len(kCenterList))]
    return kCenterList, cla_arr
def newkCenter(dataArr_K, distanceArr):
    """
    Choose the new center of one cluster.

    The new center is the cluster member whose summed distance to all members
    of the cluster is smallest; on ties the earliest member wins.

    :param dataArr_K: list of the samples currently assigned to this cluster
    :param distanceArr: matrix of pairwise distances
    :return: the member selected as the new cluster center
    """
    def totalDistance(candidate):
        # Sum of the distances from the candidate to every cluster member.
        total = 0
        for member in dataArr_K:
            total = total + distance(candidate, member, distanceArr)
        return total

    bestCenter = dataArr_K[0]
    bestTotal = totalDistance(bestCenter)
    for candidate in dataArr_K:
        candidateTotal = totalDistance(candidate)
        if candidateTotal < bestTotal:
            bestCenter, bestTotal = candidate, candidateTotal
    return bestCenter
def clustering(claListTemp, n, distanceArr, dataArr_All, kCenterList):
    """
    Run ``n`` rounds of assigning samples to centers and updating the centers.

    In every round each sample is appended to the cluster of its nearest
    center (on ties the center with the lowest position wins). After every
    round except the last, each center is replaced by the result of
    ``newkCenter`` for its cluster and the cluster lists are reset.

    ``claListTemp`` and ``kCenterList`` are modified in place.

    :param claListTemp: list with one list per cluster, filled by this function
    :param n: number of rounds to run
    :param distanceArr: matrix of pairwise distances
    :param dataArr_All: data array, one sample per row (first column = index)
    :param kCenterList: current cluster centers
    :return: ``claListTemp`` holding the assignment of the final round
    """
    for i in range(n):
        for dataX in dataArr_All:
            ki = 0
            minDistance = distance(dataX, kCenterList[0], distanceArr)
            # Find the cluster this sample belongs to.
            for j in range(1, len(kCenterList)):
                candidate = distance(dataX, kCenterList[j], distanceArr)
                if candidate < minDistance:
                    minDistance = candidate
                    ki = j
            claListTemp[ki].append(dataX)

        # Keep the assignment of the final round: no center update, no reset.
        if i == n - 1:
            break

        # Update the cluster centers.
        for k in range(len(kCenterList)):
            # A cluster can end up empty (e.g. two centers are the same
            # sample); newkCenter cannot handle that, so keep the old center.
            if len(claListTemp[k]) > 0:
                kCenterList[k] = newkCenter(claListTemp[k], distanceArr)
            claListTemp[k] = []
    return claListTemp
def saveCSV(claList, dataCSV, str1, outputDir='../../output/csv/'):
    """
    Write one CSV file per cluster.

    For every cluster the first element of each sample is read as a row
    position in ``dataCSV``; those rows are written to
    ``<outputDir>/CluData<str1><cluster count>__<cluster number>.csv``.
    An empty cluster produces a CSV that contains only the header.

    :param claList: list of clusters, each a list of samples whose first
        element is the row position in ``dataCSV``
    :param dataCSV: pandas DataFrame the samples were taken from
    :param str1: tag inserted into the file names
    :param outputDir: directory the files are written to; it must already
        exist
    """
    import os

    clusterCount = len(claList)
    for i, cluster in enumerate(claList):
        # Collect positions sample by sample so an empty cluster yields an
        # empty selection instead of failing on 2-D slicing of an empty array.
        rowPositions = [int(sample[0]) for sample in cluster]
        dataCsvTemp = dataCSV.iloc[rowPositions]
        fileName = 'CluData' + str1 + str(clusterCount) + '__' + str(i) + ".csv"
        dataCsvTemp.to_csv(os.path.join(outputDir, fileName))
def HZ_solutions():
    """
    Cluster the HZ data set and write the resulting clusters to CSV files.

    Steps: read the CSV, build the indexed sample array, pick the initial
    centers, compute the pairwise distance matrix, run the clustering and
    save one CSV per cluster. The sample array and the distance matrix are
    saved to ``.npy`` files and read back right after being computed.
    """
    sourceCsv = '../../data/rawData/dataHz/combineSnp_Rate.csv'
    samplesNpy = "../../output/numpyArrary/dataArr_AllHZ.npy"
    distancesNpy = "../../output/numpyArrary/distanceArrHZ.npy"
    clusterCount = 3    # number of clusters
    iterations = 20     # number of clustering rounds

    rawTable = pd.read_csv(sourceCsv)

    # Keep only the data columns (skip the first and the last two columns)
    # and prepend the row index.
    samples = addIndex(np.array(rawTable.iloc[:, 1:-2]))
    np.save(samplesNpy, samples)
    print("数据numpy数组保存成功")
    samples = np.load(samplesNpy)
    print("数据numpy数组读取成功")

    # Initial cluster centers and the empty cluster lists.
    centers, clusters = newKarr(samples, clusterCount)

    # Precompute all pairwise Hamming distances (trades memory for speed).
    pairwise = countAllDistance(samples)
    np.save(distancesNpy, pairwise)
    print("距离数组保存成功")
    pairwise = np.load(distancesNpy)
    print("距离numpy数组读取成功")

    clusters = clustering(clusters, iterations, pairwise, samples, centers)
    print("聚类完成")

    saveCSV(clusters, rawTable, 'HZ')
    print("聚类文件保存成功")
# Run the HZ clustering pipeline. NOTE: this call sits at module level, so it
# also runs whenever this file is imported, not only when it is executed.
HZ_solutions()
def NewData_solutions():
    """
    Cluster the height data set and write the resulting clusters to CSV files.

    Steps: read the CSV, build the indexed sample array, pick the initial
    centers, compute the pairwise distance matrix, run the clustering and
    save one CSV per cluster. Only the distance matrix is saved to a ``.npy``
    file and read back; the sample array is used directly.
    """
    sourceCsv = '../../data/rawData/dataHeight/SNP_height2.csv'
    distancesNpy = "../../output/numpyArrary/distanceArrND.npy"
    clusterCount = 3    # number of clusters
    iterations = 20     # number of clustering rounds

    rawTable = pd.read_csv(sourceCsv)
    print(rawTable)

    # Keep only the data columns (skip the first column) and prepend the row
    # index. The sample array is not cached on disk in this variant.
    samples = addIndex(np.array(rawTable.iloc[:, 1:]))
    print("数据numpy数组读取成功")

    # Initial cluster centers and the empty cluster lists.
    centers, clusters = newKarr(samples, clusterCount)

    # Precompute all pairwise Hamming distances (trades memory for speed).
    pairwise = countAllDistance(samples)
    np.save(distancesNpy, pairwise)
    print("距离数组保存成功")
    pairwise = np.load(distancesNpy)
    print("距离numpy数组读取成功")

    clusters = clustering(clusters, iterations, pairwise, samples, centers)
    print("聚类完成")

    saveCSV(clusters, rawTable, 'ND')
    print("聚类文件保存成功")
# NewData_solutions()
对自己的数据进行聚类,只需要修改最下面两个函数就可以了,这里相当于主函数。这里我有两个数据集,所以就写了两个函数,基本上只要在数据处理那几行改一下就可以了:确保在“只需要数据部分,并添加上索引”那块生成的numpy数组第一列为索引、后面是数据就行了,并且原来的csv数据要求是0,1,2,3这样的连续整数索引。
这篇博客介绍了如何使用汉明距离对数据进行聚类。通过修改提供的两个函数,可以适应不同的数据集。关键在于数据预处理,需将数据转化为numpy数组,其中第一列为索引,后续列包含数据,原始CSV数据的索引应为连续整数。
790

被折叠的 条评论
为什么被折叠?



