【聚类算法】层次聚类

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
 
// Data point: a pair of double coordinates (indices 0 and 1).
typedef struct {
    double coordinates[2];
} Point;
 
// Singly linked list node: stores one Point by value plus a link to the next node.
typedef struct Node {
    Point point;
    struct Node* next;
} Node;
 
// Euclidean distance between two points.
// hypot() yields sqrt(dx*dx + dy*dy) without undue overflow or underflow in the
// intermediate squares, which an explicit sqrt(pow(dx, 2) + pow(dy, 2)) can hit
// for very large or very small coordinate differences.
double euclideanDistance(Point p1, Point p2) {
    const double dx = p1.coordinates[0] - p2.coordinates[0];
    const double dy = p1.coordinates[1] - p2.coordinates[1];
    return hypot(dx, dy);
}
 
// Allocate a list node holding a copy of `point`, with `next` set to NULL.
// Never returns NULL: if the allocation fails, a message is written to stderr
// and the process exits with EXIT_FAILURE.
// The caller owns the returned node and is responsible for free()ing it.
Node* createNode(Point point) {
    Node* newNode = malloc(sizeof *newNode);
    if (newNode == NULL) {
        // Report the reason instead of terminating silently.
        fprintf(stderr, "createNode: out of memory\n");
        exit(EXIT_FAILURE);
    }
    newNode->point = point;
    newNode->next = NULL;
    return newNode;
}
 
// Prepend a newly created node carrying `point` to the list at *head;
// afterwards *head refers to the new node and the old list follows it.
void addNode(Node** head, Point point) {
    Node* inserted = createNode(point);
    Node* const previousHead = *head;

    inserted->next = previousHead;
    *head = inserted;
}
 
// Hierarchical clustering entry point -- placeholder only.
// No clustering is performed yet: every argument is ignored and the function
// always returns NULL (an empty result list).
//   points       - array of node pointers supplied by the caller
//   k            - requested number of clusters
//   distanceFunc - distance function to apply between two points
// NOTE(review): the signature carries no element count for `points`; a real
// implementation will need one (or a sentinel) -- confirm the intended contract.
Node* hierarchicalClustering(Node** points, int k, double (*distanceFunc)(Point, Point)) {
    (void)points;
    (void)k;
    (void)distanceFunc;

    // ... algorithm goes here ...
    return NULL;
}
 
int main() {
    // 示例数据点
    Point points[] = { {1, 1}, {1, 2}, {2, 1}, {2, 2}, {3, 3}, {4, 4} };
    int numPoints = sizeof(points) / sizeof(points[0]);
 
    // 创建节点链表
    Node** pointList = (Node**)malloc(numPoints * sizeof(Node*));
    for (int i = 0; i < numPoints; ++i) {
        pointList[i] = createNode(points[i]);
    }
 
    // 执行层次聚类
    int k = 2; // 假设我们想要的聚类数
    Node* clusters = hierarchicalClustering(pointList, k, euclideanDistance);
 
    // 输出聚类结果
    while (clusters) {
        printf("Cluster: ");
        Node* cluster = clusters;
        while (cluster) {
            printf("(%f, %f) ", cluster->point.coordinates[0], cluster->point.coordinates[1]);
            cluster = cluster->next;
        }
        printf("\n");
        clusters = clusters->next;
    }
 
    // 清理内存
    for (int i = 0; i < numPoints; ++i) {
        free(pointList[i]);
    }
    free(pointList);
    return 0;
}

这个例子提供了一个简化的层次聚类算法的框架，并展示了如何使用单链表来存储数据点和聚类结果。在实际应用中，你需要根据具体的算法细节来填充hierarchicalClustering函数的实现。

3.2 层次聚类算法Java实现

下面是一个简单的Java实现，其中的hierarchicalCluster函数实现了凝聚式（自底向上）层次聚类的核心逻辑，也就是上文C框架中hierarchicalClustering函数留空未实现的部分。请注意，这里省略了数据读取和可视化显示的部分，只关注聚类算法的实现。

import java.util.ArrayList;
import java.util.List;
 
public class HierarchicalClustering {
 
    public static class Cluster {
        public double distance;
        public List<Integer> items;
 
        public Cluster(double distance, List<Integer> items) {
            this.distance = distance;
            this.items = items;
        }
    }
 
    public static List<Cluster> hierarchicalCluster(List<double[]> data) {
        List<Cluster> clusters = new ArrayList<>();
        for (double[] point : data) {
            clusters.add(new Cluster(0.0, List.of((int) point[0])));
        }
 
        while (clusters.size() > 1) {
            double minDistance = Double.POSITIVE_INFINITY;
            int i = -1, j = -1;
            for (int a = 0; a < clusters.size(); a++) {
                for (int b = a + 1; b < clusters.size(); b++) {
                    double distance = calculateDistance(clusters.get(a).items, clusters.get(b).items, data);
                    if (distance < minDistance) {
                        minDistance = distance;
                        i = a;
                        j = b;
                    }
                }
            }
 
            // Merge the two closest clusters
            List<Integer> mergedItems = new ArrayList<>(clusters.get(i).items);
            mergedItems.addAll(clusters.get(j).items);
            Cluster mergedCluster = new Cluster(minDistance, mergedItems);
 
            clusters.remove(i);
            clusters.remove(j > i ? j - 1 : j);
            clusters.add(mergedCluster);
        }
 
        return clusters;
    }
 
    private static double calculateDistance(List<Integer> a, List<Integer> b, List<double[]> data) {
        // 计算两个集群间的距离，这里使用示例中的方法，实际应用中可能需要不同的距离计算方法
        double sum = 0.0;
        for (int itemA : a) {
            for (int itemB : b) {
                double[] pointA = data.get(itemA);
                double[] pointB = data.get(itemB);
                for (int i = 1; i < pointA.length; i++) {
                    sum += Math.pow(pointA[i] - pointB[i], 2);
                }
            }
        }
        return Math.sqrt(sum) / a.size();
    }
 
    // 示例用数据
    public static List<double[]> getSampleData() {
        return List.of(
            new double[]{1, 1.0, 1.0},
            new double[]{2, 2.0, 2.0},
            new double[]{3, 3.0, 3.0},
            new double[]{4, 4.0, 4.0},
            new double[]{5, 5.0, 5.0}
        );
    }
 
    public static void main(String[] args) {
        List<double[]> data = getSampleData();
        List<Cluster> clusters = hierarchicalCluster(data);
        // 输出聚类结果
        for (Cluster cluster : clusters) {
            System.out.println("Cluster distance: " + cluster.distance);
            System.out.println("Cluster items: " + cluster.items);
        }
    }
}

这段代码实现了层次聚类算法的核心函数，并提供了一个简单的示例数据集来演示聚类过程。

3.3 层次聚类算法Python实现

下面是一个简单的层次聚类算法的Python实现示例，使用了scipy库中的linkage函数和dendrogram函数来创建聚类和绘制树状图：

import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
 
# Sample data set: seven two-dimensional points, one per row
data = np.array([
    [1, 2],
    [0, 4],
    [2, 3],
    [0, 5],
    [6, 7],
    [8, 9],
    [5, 10],
])
 
# Build the full pairwise Euclidean distance matrix for the rows of `data`
def calculate_distance(data):
    """Return an (n, n) array whose [i, j] entry is the L2 distance between rows i and j.

    `data` must expose `.shape` and support row indexing and subtraction
    (e.g. a 2-D NumPy array with one point per row).
    """
    count = data.shape[0]
    matrix = np.zeros((count, count))
    for row in range(count):
        for col in range(row, count):
            # Compute each unordered pair once and mirror it across the diagonal.
            separation = np.linalg.norm(data[row] - data[col], ord=2)
            matrix[row, col] = separation
            matrix[col, row] = separation
    return matrix
 
# Pairwise Euclidean distance matrix of the data points (square and symmetric)
distances = calculate_distance(data)

# Hierarchical clustering with SciPy's single-linkage.
# linkage() interprets a 2-D array as raw observation vectors, NOT as a distance
# matrix, so the square matrix is first converted to the condensed 1-D form
# (upper triangle) that linkage() accepts as precomputed distances.
Z = linkage(squareform(distances), method='single')

# Draw the dendrogram
dendrogram(Z)
plt.show()

这段代码首先定义了一个数据集，然后使用calculate_distance函数计算数据点之间的欧氏距离。接着，使用scipy的linkage函数进行层次聚类，并通过dendrogram函数绘制树状图。这个示例提供了一个简单的层次聚类算法的实现，并展示了如何使用scipy进行聚类分析。