Unsupervised Deep Clustering
This is the simplest possible implementation, with no polished output display; the final results are only mediocre.
For the underlying theory, see the original paper: Xie, Junyuan, Ross Girshick, and Ali Farhadi. "Unsupervised deep embedding for clustering analysis." International Conference on Machine Learning. PMLR, 2016.
A Chinese translation is available on Zhihu: https://zhuanlan.zhihu.com/p/313662693
The code mainly follows: https://naserian-elahe.medium.com/deep-embedding-and-clustering-an-step-by-step-python-implementation-bd2c9d51c80f
Autoencoder
# Some of these from/import statements may be unnecessary
from collections import OrderedDict
from typing import Callable, Iterable, Optional, Tuple, List

from cytoolz.itertoolz import concat, sliding_window
import torch
import torch.nn as nn
from torch.nn import Parameter  # used below by ClusterAssignment

def build_units(dimensions: Iterable[int], activation: Optional[torch.nn.Module]) -> List[torch.nn.Module]:
    # build one Linear (+ optional activation) unit per consecutive pair of dimensions
    def single_unit(in_dimension: int, out_dimension: int) -> torch.nn.Module:
        unit = [('linear', nn.Linear(in_dimension, out_dimension))]
        if activation is not None:
            unit.append(('activation', activation))
        return nn.Sequential(OrderedDict(unit))
    return [
        single_unit(embedding_dimension, hidden_dimension)
        for embedding_dimension, hidden_dimension in sliding_window(2, dimensions)
    ]
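For example, build_units([784, 500, 500], nn.ReLU()) yields two Linear-plus-ReLU units (784→500 and 500→500) ready to be chained by nn.Sequential; a quick illustrative check:

units = build_units([784, 500, 500], nn.ReLU())
print(len(units))  # 2
print(units[0])    # Sequential of Linear(in_features=784, out_features=500) and ReLU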
class AutoEncoder(nn.Module):
    def __init__(
        self,
        dimensions: List[int],
        activation: torch.nn.Module = nn.ReLU()
    ):
        super(AutoEncoder, self).__init__()
        self.dimensions = dimensions
        # construct the encoder: hidden layers, then the final embedding layer
        encoder_units = build_units(self.dimensions[:-1], activation)
        encoder_units.extend(
            build_units([self.dimensions[-2], self.dimensions[-1]], activation)
        )
        self.encoder = nn.Sequential(*encoder_units)
        # construct the decoder: mirror of the encoder, no activation on the output layer
        decoder_units = build_units(reversed(self.dimensions[1:]), activation)
        decoder_units.extend(
            build_units([self.dimensions[1], self.dimensions[0]], None)
        )
        self.decoder = nn.Sequential(*decoder_units)

    def forward(self, batch: torch.Tensor) -> torch.Tensor:
        encoded = self.encoder(batch)
        return self.decoder(encoded)
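A quick reconstruction sanity check (a minimal sketch; the 784-500-500-2000-10 layer sizes follow the DEC paper):

ae = AutoEncoder([28 * 28, 500, 500, 2000, 10])
out = ae(torch.rand(8, 28 * 28))
print(out.shape)  # torch.Size([8, 784]): compressed to a 10-d embedding, then reconstructed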
Cluster class
# Some of these attributes may be unnecessary
class ClusterAssignment(nn.Module):
    def __init__(
        self,
        cluster_number: int,
        embedding_dimension: int,
        alpha: float = 1.0,
        cluster_centers: Optional[torch.Tensor] = None
    ):
        super(ClusterAssignment, self).__init__()
        self.embedding_dimension = embedding_dimension
        self.cluster_number = cluster_number
        self.alpha = alpha
        if cluster_centers is None:
            initial_cluster_centers = torch.zeros(
                self.cluster_number, self.embedding_dimension, dtype=torch.float
            )
            nn.init.xavier_uniform_(initial_cluster_centers)
        else:
            initial_cluster_centers = cluster_centers
        # wrap the centers in a Parameter so autograd tracks them and the optimizer updates them
        self.cluster_centers = Parameter(initial_cluster_centers)

    def forward(self, batch: torch.Tensor) -> torch.Tensor:
        # soft assignment q_ij: Student's t-distribution kernel between each
        # embedded point and each cluster center (Eq. 1 in the DEC paper)
        q = 1.0 / (1.0 + (torch.sum(torch.square(torch.unsqueeze(batch, dim=1) - self.cluster_centers), dim=2) / self.alpha))
        q **= (self.alpha + 1.0) / 2.0
        # normalize over clusters so each row sums to 1
        q = torch.transpose(torch.transpose(q, 0, 1) / torch.sum(q, dim=1), 0, 1)
        return q

    @staticmethod
    def target_distribution(batch: torch.Tensor) -> torch.Tensor:
        # auxiliary target distribution p_ij (Eq. 3 in the DEC paper)
        weight = batch ** 2 / torch.sum(batch, 0)
        return (weight.T / torch.sum(weight, 1)).T
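For reference, forward computes the soft assignment of Eq. (1) in the DEC paper and target_distribution the auxiliary target of Eq. (3), where z_i is an embedded point, \mu_j a cluster center, and f_j the soft cluster frequency:

q_{ij} = \frac{\left(1 + \lVert z_i - \mu_j \rVert^2 / \alpha\right)^{-\frac{\alpha+1}{2}}}{\sum_{j'} \left(1 + \lVert z_i - \mu_{j'} \rVert^2 / \alpha\right)^{-\frac{\alpha+1}{2}}}, \qquad p_{ij} = \frac{q_{ij}^2 / f_j}{\sum_{j'} q_{ij'}^2 / f_{j'}}, \quad f_j = \sum_i q_{ij}

The paper fixes \alpha = 1, which matches the default here.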
DEC class
import os
class dec(nn.Module):
    def __init__(
        self,
        AutoEncode: nn.Module,
        cluster_number: int,
        dimensions: List[int],
        activation: torch.nn.Module = nn.ReLU(),
        alpha: float = 1.0
    ):
        super(dec, self).__init__()
        self.cluster_number = cluster_number
        self.dimensions = dimensions
        self.activation = activation
        self.alpha = alpha
        self.autoencode = AutoEncode
        self.assignment = ClusterAssignment(
            cluster_number=self.cluster_number,
            embedding_dimension=self.dimensions[-1]
        )
        # DEC = pretrained encoder followed by the soft cluster assignment
        self.model = nn.Sequential(
            self.autoencode.encoder,
            self.assignment
        )

    def forward(self, batch: torch.Tensor):
        return self.model(batch)
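As a sanity check (a minimal sketch reusing the paper's layer sizes), the model maps a flattened batch to a (batch, cluster_number) matrix of soft assignments:

ae = AutoEncoder([28 * 28, 500, 500, 2000, 10])
model = dec(AutoEncode=ae, cluster_number=10, dimensions=[28 * 28, 500, 500, 2000, 10])
q = model(torch.rand(64, 28 * 28))
print(q.shape)  # torch.Size([64, 10]); each row sums to 1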
Loading the dataset
Fashion-MNIST
import torchvision.transforms as transforms
import torchvision.datasets as datasets

batch_size = 64
# download=False assumes the files already exist under root; set download=True on the first run
train_dataset = datasets.FashionMNIST(root='C:/Users/79414/Desktop/code/pytorch_learning/data/', train=True, transform=transforms.ToTensor(), download=False)
test_dataset = datasets.FashionMNIST(root='C:/Users/79414/Desktop/code/pytorch_learning/data/', train=False, transform=transforms.ToTensor(), download=False)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
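Each batch arrives as a (64, 1, 28, 28) image tensor, which is why the training code below flattens inputs with .view(-1, 28*28):

images, labels = next(iter(train_loader))
print(images.shape, labels.shape)  # torch.Size([64, 1, 28, 28]) torch.Size([64])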
Pretraining the autoencoder and the DEC training function
import numpy as np
from sklearn.cluster import KMeans  # needed by fit2 below

## pretrain and train both operate on the autoencoder
def pretrain(model: nn.Module, x, test, epochs: int, y=None):
    # train from scratch only if no saved weights exist, otherwise reload them
    if not os.path.exists('ae_weights.pth'):
        train(model, x, test, epochs)
        torch.save(model.state_dict(), 'ae_weights.pth')
    else:
        model.load_state_dict(torch.load('ae_weights.pth'))

def train(model: nn.Module, x, test, epochs: int, y=None):
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for i in range(epochs):
        size = len(x)  # number of training batches
        model.train()
        # train: reconstruct the flattened images
        for batch, (x1, y) in enumerate(x):
            pred = model(x1.view(-1, 28 * 28))
            loss = loss_fn(pred, x1.view(-1, 28 * 28))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if batch % 400 == 0:
                loss, current = loss.item(), (batch + 1)
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
        # test: average reconstruction loss over the test set
        # (classification accuracy is not meaningful for an autoencoder, so it is not reported)
        num_batches = len(test)
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for X, y in test:
                pred = model(X.view(-1, 28 * 28))
                test_loss += loss_fn(pred, X.view(-1, 28 * 28)).item()
        test_loss /= num_batches
        print(f"Test Error: \n Avg loss: {test_loss:>8f} , epoch:{i:>5d}/{epochs}\n")
# fit2 below trains the DEC model
def fit2(model: nn.Module,
         X,
         n_clusters,
         dimensions,
         maxiter=20,
         batch_size=256,
         tol=1e-3,
         update_interval=20,
         y=None):
    print('Initializing cluster centers with k-means.')
    model.train()
    with torch.no_grad():
        train_data_init = train_dataset.data.type(torch.float) / 255.0
        # step 1: initialize the cluster centers with k-means on the encoded features
        kmeans = KMeans(n_clusters=n_clusters)
        features = model.autoencode.encoder(train_data_init.view(-1, 28 * 28)).detach().numpy()
        y_pred = kmeans.fit_predict(features)
        y_pred_last = np.copy(y_pred)  # keep a copy to measure label changes between iterations
        # copy the k-means centroids into the ClusterAssignment parameter
        model.state_dict()["assignment.cluster_centers"].copy_(torch.tensor(
            kmeans.cluster_centers_, dtype=torch.float, requires_grad=True)
        )
    # step 2: DEC optimization
    # the optimizer and the loss follow the original paper: SGD with momentum, KL divergence
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    loss_function = nn.KLDivLoss(reduction='batchmean')
    delta_label = None
    for ite in range(int(maxiter)):
        for batch, (x, y) in enumerate(X):
            # compute the target distribution p from the current soft assignments q,
            # without tracking gradients (p is treated as a constant target)
            with torch.no_grad():
                q = model(x.view(-1, 28 * 28))
                p = model.assignment.target_distribution(q)
            optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                outputs = model(x.view(-1, 28 * 28))
                # KLDivLoss expects log-probabilities as its input
                loss = loss_function(outputs.log(), p)
                loss.backward()
                optimizer.step()
            if batch % 600 == 0:
                loss, current = loss.item(), (batch + 1) * x.shape[0]
                print(f"loss: {loss:>7f} {current:>d}/{train_data_init.shape[0]}")
        # run all training samples through the model and compare the predicted
        # labels against the previous iteration's labels
        with torch.no_grad():
            y_pred = model(train_data_init.view(-1, 28 * 28)).argmax(1)
        delta_label = np.sum(y_pred_last != y_pred.numpy()) / y_pred.shape[0]
        # stop once fewer than tol of the labels changed since the last iteration
        if ite > 0 and delta_label < tol:
            print("delta", delta_label)
            break
        y_pred_last = y_pred.numpy().copy()
        print(f"update:{ite:>5d},del:{delta_label}")
The figure below shows the measured results on the MNIST handwritten-digit data.