The K-means Algorithm

hit2015spring

Welcome to follow my blog: http://blog.csdn.NET/hit2015spring

Preliminaries

In unsupervised learning, the label information of the training samples is unknown; the goal is to learn from the training samples in order to reveal the intrinsic properties and regularities of the data. Clustering tries to partition the samples of a data set into several, usually disjoint, subsets, each of which is called a cluster. In other words, we have a collection of unlabeled data samples, and each sample is an $n$-dimensional feature vector $x_i = (x_{i1}, x_{i2}, \ldots, x_{in})$.

That is, each object is described by $n$ features, and these features can indicate which category the object belongs to. A clustering algorithm then partitions the sample set $D$ into $k$ disjoint clusters. For example, consider a group of people: one wears red clothes and has long hair, one wears green clothes and has long hair, one wears white clothes and has short hair, one wears black clothes and has long hair, and so on. We might simply divide them into boys and girls; here clothing color and hair length are the two dimensions of the feature vector. Given nothing but this group of people, we use the relationships among these features to split them into two classes.

(Of course, the boy/girl labels are added by us. During k-means clustering the algorithm knows nothing about these labels; it simply groups together the samples it considers to belong to the same class, based on the relationships among their features, i.e., the distances between them.)

This brings in the definition of distance. For two samples $x_i = (x_{i1}, x_{i2}, \ldots, x_{in})$ and $x_j = (x_{j1}, x_{j2}, \ldots, x_{jn})$, the distance between them is:

$$\mathrm{dist}_{mk}(x_i, x_j) = \left( \sum_{u=1}^{n} |x_{iu} - x_{ju}|^p \right)^{\frac{1}{p}} \tag{1}$$

Expression (1) is called the Minkowski distance.

When $p = 2$, it becomes the Euclidean distance:

$$\mathrm{dist}_{ed}(x_i, x_j) = \|x_i - x_j\|_2 = \sqrt{\sum_{u=1}^{n} |x_{iu} - x_{ju}|^2}$$

When $p = 1$, it becomes the Manhattan distance:

$$\mathrm{dist}_{man}(x_i, x_j) = \|x_i - x_j\|_1 = \sum_{u=1}^{n} |x_{iu} - x_{ju}|$$

Of course, the distance measures above assume that the attributes are ordinal, i.e., there is an "order" relation among their values. For example, for attribute values (1, 2, 3), 1 is farther from 3 and closer to 2, so the concrete numeric values themselves can be used for measurement. There are also non-ordinal (categorical) attributes, such as {red clothes, black clothes, blue clothes}; for these we cannot compute with the attribute values directly, and the VDM (Value Difference Metric) is used instead. See the description in the watermelon book (Zhou Zhihua, Machine Learning), p. 200, for details.
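As a quick illustration of these formulas (this snippet is not part of the original post's code; the function name MinkowskiDistance is made up here), the Minkowski distance can be computed directly, with p = 2 and p = 1 recovering the Euclidean and Manhattan distances:

#include <math.h>
#include <stdio.h>
#include <vector>

// Minkowski distance: dist_mk(xi, xj) = (sum_u |xi_u - xj_u|^p)^(1/p).
// p = 2 gives the Euclidean distance, p = 1 the Manhattan distance.
double MinkowskiDistance(const std::vector<double>& xi,
                         const std::vector<double>& xj, double p)
{
    double sum = 0;
    for (size_t u = 0; u < xi.size(); u++)
        sum += pow(fabs(xi[u] - xj[u]), p);
    return pow(sum, 1.0 / p);
}

int main()
{
    // Two of the sample points used in the test program further below
    std::vector<double> a = {0.0, 0.2, 0.4};
    std::vector<double> b = {5.0, 5.2, 8.4};
    printf("Euclidean: %f\n", MinkowskiDistance(a, b, 2.0));
    printf("Manhattan: %f\n", MinkowskiDistance(a, b, 1.0));
    return 0;
}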

K-means clustering

Given a sample set $D = \{x_1, x_2, \ldots, x_m\}$, the $k$-means algorithm seeks a cluster partition $C = \{C_1, C_2, \ldots, C_k\}$ that minimizes the squared error:

$$E = \sum_{i=1}^{k} \sum_{x \in C_i} \|x - \mu_i\|_2^2$$

Here $\mu_i = \frac{1}{|C_i|}\sum_{x \in C_i} x$ is simply the mean vector of cluster $C_i$. The smaller $E$ is, the more similar the samples within each cluster are.
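To make the objective concrete, here is a minimal sketch that evaluates E for a given assignment (the helper SquaredError and its argument layout are illustrative, not part of the implementation below):

#include <vector>

// Squared error E = sum_i sum_{x in C_i} ||x - mu_i||_2^2, for samples stored
// row-major in data (size * dim), with label[i] the cluster index of sample i
// and means[c] the mean vector of cluster c.
double SquaredError(const std::vector<double>& data, int size, int dim,
                    const std::vector<int>& label,
                    const std::vector<std::vector<double> >& means)
{
    double E = 0;
    for (int i = 0; i < size; i++)
    {
        const std::vector<double>& mu = means[label[i]];
        for (int d = 0; d < dim; d++)
        {
            double diff = data[i * dim + d] - mu[d];
            E += diff * diff;   // squared Euclidean distance, no square root
        }
    }
    return E;
}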

However, obtaining the exact minimizer of this objective is not easy, so k-means uses a greedy algorithm to compute an approximate solution.

The pseudocode is as follows (a compact C++ sketch of the same loop is given after the steps):

1. Based on the pre-chosen value of $k$, randomly select $k$ points from the original samples; these serve as the $k$ initial centers.

2. For every point $1, 2, \ldots, m$, compute its distance to each of the $k$ centers.

3. Each point thus has $k$ distances; pick the smallest one and assign the point to the corresponding cluster.

4. Each of the $k$ clusters now contains some points; compute the centroid of the points in each cluster and use it to update that cluster's center.

5. Check whether the required stopping condition is met; if not, repeat from step 2.
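As referenced above, here is a compact, self-contained sketch of these five steps (names such as RunKMeans are illustrative only; initialization is simplified to taking the first k samples, and a fixed iteration count stands in for a proper convergence check):

#include <vector>

// Minimal k-means sketch: data is size * dim, row-major; on return label[i]
// holds the cluster index assigned to sample i.
void RunKMeans(const std::vector<double>& data, int size, int dim, int k,
               int maxIter, std::vector<int>& label)
{
    label.assign(size, 0);

    // Step 1: take the first k samples as the initial centers (simplified).
    std::vector<std::vector<double> > means(k, std::vector<double>(dim));
    for (int c = 0; c < k; c++)
        for (int d = 0; d < dim; d++)
            means[c][d] = data[c * dim + d];

    for (int iter = 0; iter < maxIter; iter++)
    {
        // Steps 2-3: assign every sample to its nearest center.
        for (int i = 0; i < size; i++)
        {
            double best = -1;
            for (int c = 0; c < k; c++)
            {
                double dist = 0;
                for (int d = 0; d < dim; d++)
                {
                    double diff = data[i * dim + d] - means[c][d];
                    dist += diff * diff;
                }
                if (best < 0 || dist < best)
                {
                    best = dist;
                    label[i] = c;
                }
            }
        }

        // Step 4: recompute each center as the mean of its assigned samples.
        std::vector<std::vector<double> > next(k, std::vector<double>(dim, 0.0));
        std::vector<int> counts(k, 0);
        for (int i = 0; i < size; i++)
        {
            counts[label[i]]++;
            for (int d = 0; d < dim; d++)
                next[label[i]][d] += data[i * dim + d];
        }
        for (int c = 0; c < k; c++)
            if (counts[c] > 0)
                for (int d = 0; d < dim; d++)
                    means[c][d] = next[c][d] / counts[c];

        // Step 5: a fixed iteration count is used here; the full code below
        // also monitors the change in average cost between iterations.
    }
}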

A concrete example:

[Figures: worked k-means example]

C++ code (the implementation file, the header, and a small test program are listed in turn):

// k-means.cpp: implementation of the KMeans class
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <string.h>   // memset, memcpy
#include <assert.h>   // assert
#include <iostream>
#include "k-means.h"
using namespace std;


KMeans::KMeans(int dimNum, int clusterNum)
{
    m_dimNum = dimNum;
    m_clusterNum = clusterNum;

    m_means = new double*[m_clusterNum];
    for(int i = 0; i < m_clusterNum; i++)
    {
        m_means[i] = new double[m_dimNum];
        memset(m_means[i], 0, sizeof(double) * m_dimNum);
    }

    m_initMode = InitRandom;
    m_maxIterNum = 100;
    m_endError = 0.001;
}

KMeans::~KMeans()
{
    for(int i = 0; i < m_clusterNum; i++)
    {
        delete[] m_means[i];
    }
    delete[] m_means;
}

void KMeans::Cluster(const char* sampleFileName, const char* labelFileName)
{
    // Check the sample file
    ifstream sampleFile(sampleFileName, ios_base::binary);
    assert(sampleFile);

    int size = 0;
    int dim = 0;
    sampleFile.read((char*)&size, sizeof(int));
    sampleFile.read((char*)&dim, sizeof(int));
    assert(size >= m_clusterNum);
    assert(dim == m_dimNum);

    // Initialize model
    Init(sampleFile);

    // Iteration
    double* x = new double[m_dimNum];   // Sample data
    int label = -1;     // Class index
    double iterNum = 0;
    double lastCost = 0;
    double currCost = 0;
    int unchanged = 0;
    bool loop = true;
    int* counts = new int[m_clusterNum];
    double** next_means = new double*[m_clusterNum];    // New model for reestimation
    for(int i = 0; i < m_clusterNum; i++)
    {
        next_means[i] = new double[m_dimNum];
    }

    while(loop)
    {
        //clean buffer for classification
        memset(counts, 0, sizeof(int) * m_clusterNum);
        for(int i = 0; i < m_clusterNum; i++)
        {
            memset(next_means[i], 0, sizeof(double) * m_dimNum);
        }

        lastCost = currCost;
        currCost = 0;

        sampleFile.clear();
        sampleFile.seekg(sizeof(int) * 2, ios_base::beg);

        // Classification
        for(int i = 0; i < size; i++)
        {
            sampleFile.read((char*)x, sizeof(double) * m_dimNum);
            currCost += GetLabel(x, &label);

            counts[label]++;
            for(int d = 0; d < m_dimNum; d++)
            {
                next_means[label][d] += x[d];
            }
        }
        currCost /= size;

        // Reestimation
        for(int i = 0; i < m_clusterNum; i++)
        {
            if(counts[i] > 0)
            {
                for(int d = 0; d < m_dimNum; d++)
                {
                    next_means[i][d] /= counts[i];
                }
                memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum);
            }
        }

        // Terminal conditions
        iterNum++;
        if(fabs(lastCost - currCost) < m_endError * lastCost)
        {
            unchanged++;
        }
        if(iterNum >= m_maxIterNum || unchanged >= 3)
        {
            loop = false;
        }
        //DEBUG
        //cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl;
    }

    // Output the label file
    ofstream labelFile(labelFileName, ios_base::binary);
    assert(labelFile);

    labelFile.write((char*)&size, sizeof(int));
    sampleFile.clear();
    sampleFile.seekg(sizeof(int) * 2, ios_base::beg);

    for(int i = 0; i < size; i++)
    {
        sampleFile.read((char*)x, sizeof(double) * m_dimNum);
        GetLabel(x, &label);
        labelFile.write((char*)&label, sizeof(int));
    }

    sampleFile.close();
    labelFile.close();

    delete[] counts;
    delete[] x;
    for(int i = 0; i < m_clusterNum; i++)
    {
        delete[] next_means[i];
    }
    delete[] next_means;
}

// N is the number of feature vectors (samples)
void KMeans::Cluster(double *data, int N, int *Label)
{
    int size = 0;
    size = N;

    assert(size >= m_clusterNum);

    // Initialize model
    Init(data,N);

    // Iteration
    double* x = new double[m_dimNum];   // Sample data
    int label = -1;     // Class index
    double iterNum = 0;
    double lastCost = 0;
    double currCost = 0;
    int unchanged = 0;
    bool loop = true;
    int* counts = new int[m_clusterNum];
    double** next_means = new double*[m_clusterNum];    // New model for reestimation
    for(int i = 0; i < m_clusterNum; i++)
    {
        next_means[i] = new double[m_dimNum];
    }

    while(loop)
    {
        //clean buffer for classification
        memset(counts, 0, sizeof(int) * m_clusterNum);
        for(int i = 0; i < m_clusterNum; i++)
        {
            memset(next_means[i], 0, sizeof(double) * m_dimNum);
        }

        lastCost = currCost;
        currCost = 0;

        // Classification
        for(int i = 0; i < size; i++)
        {
            for(int j = 0; j < m_dimNum; j++)
                x[j] = data[i*m_dimNum+j];

            currCost += GetLabel(x, &label);

            counts[label]++;
            for(int d = 0; d < m_dimNum; d++)
            {
                next_means[label][d] += x[d];
            }
        }
        currCost /= size;

        // Reestimation
        for(int i = 0; i < m_clusterNum; i++)
        {
            if(counts[i] > 0)
            {
                for(int d = 0; d < m_dimNum; d++)
                {
                    next_means[i][d] /= counts[i];
                }
                memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum);
            }
        }

        // Terminal conditions
        iterNum++;
        if(fabs(lastCost - currCost) < m_endError * lastCost)
        {
            unchanged++;
        }
        if(iterNum >= m_maxIterNum || unchanged >= 3)
        {
            loop = false;
        }

        //DEBUG
        //cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl;
    }

    // Output the labels to the caller's array
    for(int i = 0; i < size; i++)
    {
        for(int j = 0; j < m_dimNum; j++)
            x[j] = data[i*m_dimNum+j];
        GetLabel(x,&label);
        Label[i] = label;
    }
    delete[] counts;
    delete[] x;
    for(int i = 0; i < m_clusterNum; i++)
    {
        delete[] next_means[i];
    }
    delete[] next_means;
}

void KMeans::Init(double *data, int N)
{
    int size = N;

    if(m_initMode ==  InitRandom)
    {
        int inteval = size / m_clusterNum;
        double* sample = new double[m_dimNum];

        // Seed the random-number generator with current time
        srand((unsigned)time(NULL));

        for(int i = 0; i < m_clusterNum; i++)
        {
            int select = inteval * i + (inteval - 1) * rand() / RAND_MAX;
            for(int j = 0; j < m_dimNum; j++)
                sample[j] = data[select*m_dimNum+j];
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if(m_initMode == InitUniform)
    {
        double* sample = new double[m_dimNum];

        for(int i = 0; i < m_clusterNum; i++)
        {
            int select = i * size / m_clusterNum;
            for(int j = 0; j < m_dimNum; j++)
                sample[j] = data[select*m_dimNum+j];
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if(m_initMode == InitManual)
    {
        // Do nothing
    }
}

void KMeans::Init(ifstream& sampleFile)
{
    int size = 0;
    sampleFile.seekg(0, ios_base::beg);
    sampleFile.read((char*)&size, sizeof(int));

    if(m_initMode ==  InitRandom)
    {
        int inteval = size / m_clusterNum;
        double* sample = new double[m_dimNum];

        // Seed the random-number generator with current time
        srand((unsigned)time(NULL));

        for(int i = 0; i < m_clusterNum; i++)
        {
            int select = inteval * i + (inteval - 1) * rand() / RAND_MAX;
            int offset = sizeof(int) * 2 + select * sizeof(double) * m_dimNum;

            sampleFile.seekg(offset, ios_base::beg);
            sampleFile.read((char*)sample, sizeof(double) * m_dimNum);
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if(m_initMode == InitUniform)
    {
        double* sample = new double[m_dimNum];

        for (int i = 0; i < m_clusterNum; i++)
        {
            int select = i * size / m_clusterNum;
            int offset = sizeof(int) * 2 + select * sizeof(double) * m_dimNum;

            sampleFile.seekg(offset, ios_base::beg);
            sampleFile.read((char*)sample, sizeof(double) * m_dimNum);
            memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
        }

        delete[] sample;
    }
    else if(m_initMode == InitManual)
    {
        // Do nothing
    }
}

double KMeans::GetLabel(const double* sample, int* label)
{
    double dist = -1;
    for(int i = 0; i < m_clusterNum; i++)
    {
        double temp = CalcDistance(sample, m_means[i], m_dimNum);
        if(temp < dist || dist == -1)
        {
            dist = temp;
            *label = i;
        }
    }
    return dist;
}

double KMeans::CalcDistance(const double* x, const double* u, int dimNum)
{
    double temp = 0;
    for(int d = 0; d < dimNum; d++)
    {
        temp += (x[d] - u[d]) * (x[d] - u[d]);
    }
    return sqrt(temp);
}

ostream& operator<<(ostream& out, KMeans& kmeans)
{
    out << "<KMeans>" << endl;
    out << "<DimNum> " << kmeans.m_dimNum << " </DimNum>" << endl;
    out << "<ClusterNum> " << kmeans.m_clusterNum << " </CluterNum>" << endl;

    out << "<Mean>" << endl;
    for(int i = 0; i < kmeans.m_clusterNum; i++)
    {
        for(int d = 0; d < kmeans.m_dimNum; d++)
        {
            out << kmeans.m_means[i][d] << " ";
        }
        out << endl;
    }
    out << "</Mean>" << endl;

    out << "</KMeans>" << endl;
    return out;
}

// k-means.h: declaration of the KMeans class
#pragma once
#include <fstream>
#include <string.h>   // memcpy (used by SetMean)

class KMeans
{
public:
    enum InitMode
    {
        InitRandom,
        InitManual,
        InitUniform,
    };

    KMeans(int dimNum = 1, int clusterNum = 1);
    ~KMeans();

    void SetMean(int i, const double* u){ memcpy(m_means[i], u, sizeof(double) * m_dimNum); }
    void SetInitMode(int i)             { m_initMode = i; }
    void SetMaxIterNum(int i)           { m_maxIterNum = i; }
    void SetEndError(double f)          { m_endError = f; }

    double* GetMean(int i)  { return m_means[i]; }
    int GetInitMode()       { return m_initMode; }
    int GetMaxIterNum()     { return m_maxIterNum; }
    double GetEndError()    { return m_endError; }


    /*  SampleFile: <size><dim><data>...
        LabelFile:  <size><label>...
    */
    void Cluster(const char* sampleFileName, const char* labelFileName);
    void Init(std::ifstream& sampleFile);
    void Init(double *data, int N);
    void Cluster(double *data, int N, int *Label);
    friend std::ostream& operator<<(std::ostream& out, KMeans& kmeans);

private:
    int m_dimNum;
    int m_clusterNum;
    double** m_means;

    int m_initMode;
    int m_maxIterNum;       // The stopping criterion regarding the number of iterations
    double m_endError;      // The stopping criterion regarding the error

    double GetLabel(const double* x, int* label);
    double CalcDistance(const double* x, const double* u, int dimNum);
};


// main.cpp: a small usage example
#include <stdio.h>    // printf
#include <iostream>
#include "k-means.h"
using namespace std;

int main()
{
    double data[] = {
        0.0, 0.2, 0.4,
        0.3, 0.2, 0.4,
        0.4, 0.2, 0.4,
        0.5, 0.2, 0.4,
        5.0, 5.2, 8.4,
        6.0, 5.2, 7.4,
        4.0, 5.2, 4.4,
        10.3, 10.4, 10.5,
        10.1, 10.6, 10.7,
        11.3, 10.2, 10.9
    };

    const int size = 10; //Number of samples
    const int dim = 3;   //Dimension of feature
    const int cluster_num = 4; //Cluster number

    KMeans* kmeans = new KMeans(dim,cluster_num);
    int* labels = new int[size];
    kmeans->SetInitMode(KMeans::InitUniform);
    kmeans->Cluster(data,size,labels);

    for(int i = 0; i < size; ++i)
    {
        printf("%f, %f, %f belongs to %d cluster\n", data[i*dim+0], data[i*dim+1], data[i*dim+2], labels[i]);
    }

    delete []labels;
    delete kmeans;

    return 0;
}
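Assuming the three listings above are saved as k-means.cpp, k-means.h, and main.cpp (the original post does not give file names), the example can be built with a command such as g++ k-means.cpp main.cpp -o kmeans; running it prints each sample followed by the index of the cluster it was assigned to.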

