根据西瓜书(周志华《机器学习》)中的算法实现半监督 k-means 聚类
import numpy as np
import pandas as pd


class kmean_half:
    """Semi-supervised k-means that uses labeled samples as cluster seeds.

    The input is a DataFrame whose ``label`` column holds a class label for
    labeled rows and a null value (``None``/``NaN``) for unlabeled rows. All
    other columns are treated as numeric features.

    Every distinct label defines one cluster. Labeled samples always stay in
    the cluster of their label; unlabeled samples are assigned to the nearest
    cluster centre, and centres are recomputed from both kinds of samples
    until they move by no more than a given tolerance.
    """

    def __init__(self, data, label):
        """
        :param data: DataFrame holding the feature columns and the label column
        :param label: name of the label column
        """
        self.loaddata = data
        self.label = label

    def data_load(self):
        """Split the data into labeled and unlabeled rows.

        :return: ``(label_data, unlabel_data)``; rows whose label is non-null
            and rows whose label is null, respectively
        """
        has_label = self.loaddata[self.label].notna()
        label_data = self.loaddata[has_label]
        unlabel_data = self.loaddata[~has_label]
        return label_data, unlabel_data

    def clu_center_ini(self, label_data):
        """Compute the initial cluster centres from the labeled samples.

        :param label_data: labeled rows (features plus the label column)
        :return: ``(clu_center, labels)``; ``clu_center[i]`` is the feature
            mean of the rows labeled ``labels[i]``, and ``labels`` holds the
            distinct labels in order of first appearance
        """
        labels = label_data[self.label].unique()
        clu_center = []
        for lab in labels:
            members = label_data[label_data[self.label] == lab]
            features = np.array(members.drop(self.label, axis=1))
            clu_center.append(np.mean(features, axis=0))
        return clu_center, labels

    def label_sample_clu(self, data, labels):
        """Place every labeled sample into the cluster of its own label.

        :param data: labeled rows (features plus the label column)
        :param labels: distinct labels; position ``i`` names cluster ``i``
        :return: list with one list of feature vectors per entry of ``labels``
        :raises KeyError: if a row carries a label that is not in ``labels``
        """
        label_id = data[self.label].values
        features = np.array(data.drop(self.label, axis=1))

        # Map each label to the index of its first occurrence. A dict lookup
        # works for any sequence type of ``labels`` (list, ndarray, pandas
        # array), unlike an element-wise ``==`` against a plain list.
        cluster_of = {}
        for index, lab in enumerate(labels):
            cluster_of.setdefault(lab, index)

        sample_list = [[] for _ in range(len(labels))]
        for lab, row in zip(label_id, features):
            sample_list[cluster_of[lab]].append(row)
        return sample_list

    def new_center(self, label_list):
        """Compute the centre (feature mean) of every cluster.

        :param label_list: list of clusters, each a list of feature vectors;
            clusters may have different sizes
        :return: list with one mean vector per cluster (an empty cluster
            yields ``nan``, as ``np.mean`` does for empty input)
        """
        # Convert each cluster separately: wrapping the whole (ragged) list in
        # a single array fails on recent NumPy when cluster sizes differ.
        return [np.mean(np.asarray(cluster), axis=0) for cluster in label_list]

    def sample_center_distance(self, x, y):
        """Return the Euclidean (2-norm / Frobenius) distance between x and y."""
        return np.linalg.norm(x - y)

    def new_clu(self, new_center, data):
        """Assign every row of ``data`` to its nearest cluster centre.

        :param new_center: sequence of cluster centres
        :param data: rows to assign (features plus the label column, which is
            dropped before distances are computed)
        :return: list with one list of feature vectors per centre
        """
        sample_list = [[] for _ in range(len(new_center))]
        features = np.array(data.drop(self.label, axis=1))
        centers = [np.array(c) for c in new_center]
        for row in features:
            # Go through sample_center_distance so that a subclass overriding
            # the metric is honoured here as well.
            distances = [self.sample_center_distance(row, c) for c in centers]
            # argmin returns the first minimum, i.e. ties go to the
            # lowest-numbered cluster.
            sample_list[int(np.argmin(distances))].append(row)
        return sample_list

    def run(self, errors):
        """Run the seeded k-means iteration until the centres stabilise.

        :param errors: tolerance; iteration stops once the distance between
            consecutive sets of centres is no longer greater than this value
        :return: ``(sample_list, labels)``; ``sample_list[i]`` holds the
            unlabeled feature vectors assigned to the cluster of
            ``labels[i]`` (labeled samples are not included in the result)
        """
        label_data, unlabel_data = self.data_load()
        center, labels = self.clu_center_ini(label_data)
        # Labeled samples are fixed to their clusters for the whole run.
        label_sample_list = self.label_sample_clu(label_data, labels)

        while True:
            # Assign the unlabeled samples to the current centres.
            sample_list = self.new_clu(center, unlabel_data)
            # Recompute centres from labeled AND unlabeled members, so the
            # seeds keep constraining the clusters on every iteration.
            merged = [
                seeds + assigned
                for seeds, assigned in zip(label_sample_list, sample_list)
            ]
            updated = self.new_center(merged)
            e = self.sample_center_distance(np.array(updated), np.array(center))
            center = updated
            # "not e > errors" (rather than "e <= errors") also stops on NaN.
            if not e > errors:
                break
        return sample_list, labels