k-means聚类torch_gpu版本

class KMEANS:
    def __init__(self, n_clusters=10, max_iter=None, verbose=True, device=torch.device("cpu")):

        # self.n_cluster = n_clusters
        self.n_clusters = n_clusters
        self.labels = None
        self.dists = None  # shape: [x.shape[0],n_cluster]
        self.centers = None
        self.variation = torch.Tensor([float("Inf")]).to(device)
        self.verbose = verbose
        self.started = False
        self.representative_samples = None
        self.max_iter = max_iter
        self.count = 0
        self.device = device

    def fit(self, x):
        # 随机选择初始中心点,想更快的收敛速度可以借鉴sklearn中的kmeans++初始化方法
        init_row = torch.randint(0, x.shape[0], (self.n_clusters,)).to(self.device)
        # print(init_row.shape)    # shape 10
        init_points = x[init_row]
        # print(init_points.shape) # shape (10, 2048)
        self.centers = init_points
        while True:
            # 聚类标记
            self.nearest_center(x)
            # 更新中心点
            self.update_center(x)
            if self.verbose:
                print(self.variation, torch.argmin(self.dists, (0)))
            if torch.abs(self.variation) < 1e-3 and self.max_iter is None:
                break
            elif self.max_iter is not None and self.count == self.max_iter:
                break

            self.count += 1

        return self.representative_sample()

    def nearest_center(self, x):
        labels = torch.empty((x.shape[0],)).long().to(self.device)
        # print(labels.shape)  # shape (250000)
        dists = torch.empty((0, self.n_clusters)).to(self.device)
        # print(dists.shape)   # shape (0, 10)
        for i, sample in tqdm(enumerate(x)):
            # print(self.centers.shape) # shape(10, 2048)
            # print(sample.shape)       # shape 2048
            dist = torch.sum(torch.mul(sample - self.centers, sample - self.centers), (1))
            # print(dist.shape)         # shape 10
            labels[i] = torch.argmin(dist)
            # print(labels.shape)       # shape 250000
            # print(labels[:10])
            dists = torch.cat([dists, dist.unsqueeze(0)], (0))
            # print(dists.shape)        # shape (1,10)
            # print('*')
        self.labels = labels           # shape 250000
        if self.started:
            self.variation = torch.sum(self.dists - dists)
        self.dists = dists              # 250000, 10
        self.started = True

    def update_center(self, x):
        centers = torch.empty((0, x.shape[1])).to(self.device) # shape (0, 250000)
        for i in range(self.n_clusters):
            mask = self.labels == i
            cluster_samples = x[mask]
            centers = torch.cat([centers, torch.mean(cluster_samples, (0)).unsqueeze(0)], (0))  # 10, 2048
        self.centers = centers  # shape (10, 2048)

    def representative_sample(self):
        # 查找距离中心点最近的样本,作为聚类的代表样本,更加直观
        # print(self.dists.shape)
        self.representative_samples = torch.argmin(self.dists, 1)
        # print(self.representative_samples.shape)  # shape 250000
        # print('*')
        return self.representative_samples


def time_clock(matrix, device):
    a = time.time()
    k = KMEANS(max_iter=10,verbose=False,device=device)
    classifier_n_labels_result = k.fit(matrix)
    b = time.time()
    return (b-a)/k.count


def choose_device(cuda=False):
    if cuda:
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    return device


if __name__ == "__main__":
    # import matplotlib.pyplot as plt
    import pandas as pd
    import torch
    import pickle
    from tqdm import tqdm

    device = choose_device(True)

    # read data
    train_ava_data_list = list(pd.read_csv('train_val_dataset.txt', header=None, sep=' ', index_col=False)[0])
    test_ava_data_list = list(pd.read_csv('test_dataset_19929.txt', header=None, sep=' ', index_col=False)[0])
    all_data_list = train_ava_data_list + test_ava_data_list
    print(len(all_data_list))
    all_data_tensor = torch.empty(0, 2048).cuda()
    for i in tqdm(all_data_list):
        elem_path = '/home/flyingbird/Data/feature_extract/feature_2048/train/' + str(i)
        with open(elem_path, 'rb') as f:
            elem_tensor = pickle.load(f, encoding='latin1')
        all_data_tensor = torch.cat((all_data_tensor, torch.Tensor(elem_tensor).unsqueeze(0).cuda()), dim=0)
        # print(all_data_tensor.shape)
        # print('*')
    print(all_data_tensor.shape)

    # knn
    k = KMEANS(max_iter=10, verbose=False, device=device)
    classifier_result = k.fit(all_data_tensor.cuda())
    # print(classifier_result[:10])
    print(classifier_result.shape)
    classifier_result = classifier_result.cpu().numpy()

    # save result (img_id : label)
    dict = {0:all_data_list, 1:classifier_result}
    pd.DataFrame(dict).to_csv('k_means_all_ava_data_label.txt', sep=' ', header=None, index=False)


    # speed = time_clock(matrix, device)
    # print(speed)
    # cpu_speeds.append(speed)
    # l1, = plt.plot(2048, cpu_speeds,color = 'r',label = 'CPU')

    # device = choose_device(True)
    #
    # gpu_speeds = []
    # for i in tqdm([20, 100, 500, 2000, 8000, 20000]):
    #     matrix = torch.rand((250000, i)).to(device)
    #     speed = time_clock(matrix,device)
    #     gpu_speeds.append(speed)
    # l2, = plt.plot([20, 100, 500, 2000, 8000, 20000], gpu_speeds, color='g',label = "GPU")

    # plt.xlabel("num_features")
    # plt.ylabel("speed(s/iter)")
    # plt.title("Speed with cuda")
    # plt.legend(handles = [l1],labels = ['GPU'],loc='best')
    # plt.savefig("speed.jpg")

核心:
1.kmeans思路:

  • 随机初始化中心点
  • 计算各数据到中心点距离,重新分类
  • 根据分类计算均值,重新确定中心点
  • 迭代n次,直到分类稳定

2.实现

  • 将核心计算:计算欧氏距离和均值两个步骤使用torch.gpu计算
  • numpy转化为tensor
  • 5
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
K-Means 聚类算法是一种基于距离的聚类算法,它将数据集分成 K 个簇,每个簇的中心点就是该簇内所有点的质心。K-Means 聚类算法的步骤如下: 1. 随机选择 K 个点作为簇的中心点。 2. 计算每个样本点到 K 个中心点的距离,将其分配到最近的簇中。 3. 对于每个簇,重新计算该簇的中心点。 4. 重复步骤 2 和 3,直到簇不再发生变化或达到最大迭代次数。 下面是使用 K-Means 聚类算法对鸢尾花数据进行聚类的 Python 代码: ``` python import numpy as np from sklearn.cluster import KMeans from sklearn.datasets import load_iris import matplotlib.pyplot as plt # 加载鸢尾花数据集 iris = load_iris() X = iris.data # 定义 K 值和最大迭代次数 k = 3 max_iter = 300 # 使用 K-Means 聚类算法 kmeans = KMeans(n_clusters=k, max_iter=max_iter) kmeans.fit(X) # 获取聚类结果和簇的中心点 labels = kmeans.labels_ centers = kmeans.cluster_centers_ # 绘制聚类结果图 plt.scatter(X[:, 0], X[:, 1], c=labels) plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='*', s=200) plt.title('K-Means Clustering') plt.xlabel('Sepal length') plt.ylabel('Sepal width') plt.show() ``` 上述代码中,首先加载鸢尾花数据集,然后定义 K 值和最大迭代次数。接着使用 K-Means 聚类算法,获取聚类结果和簇的中心点。最后绘制聚类结果图,其中样本点的颜色表示它所属的簇,红色星号表示每个簇的中心点。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值