/** This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.*/
/** FCM.java
* Copyright (C) 2007 Wei Xiaofei
**/package weka.clusterers;
import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import weka.core.matrix.Matrix;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Random;
import java.util.Vector;/**
 * Cluster data using the Fuzzy C-means algorithm.
 *
 * Valid options are:
 *
 * <pre> -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).</pre>
 *
 * <pre> -F &lt;num&gt;
 *  exponent (fuzzifier).
 *  (default 2).</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)</pre>
 *
 * @author Wei Xiaofei
 * @version 1.03
 * @see RandomizableClusterer
 */
public classFuzzyCMeans
extends RandomizableClusterer
implements NumberOfClustersRequestable, WeightedInstancesHandler {/** for serialization*/
static final long serialVersionUID = -2134543132156464L;/**
* replace missing values in training instances
* 替换训练集中的缺省值*/
privateReplaceMissingValues m_ReplaceMissingFilter;/**
* number of clusters to generate
* 产生聚类的个数*/
private int m_NumClusters = 2;/**
* D: d(i,j)=||c(i)-x(j)||为第i个聚类中心与第j个数据点间的欧几里德距离*/
privateMatrix D;//private Matrix U;
/**
* holds the fuzzifier
* 模糊算子(加权指数)*/
private double m_fuzzifier = 2;/**
* holds the cluster centroids
* 聚类中心*/
privateInstances m_ClusterCentroids;/**
* Holds the standard deviations of the numeric attributes in each cluster
* 每个聚类的标准差*/
privateInstances m_ClusterStdDevs;/**
* For each cluster, holds the frequency counts for the values of each
* nominal attribute*/
private int[][][] m_ClusterNominalCounts;/**
* The number of instances in each cluster
* 每个聚类包含的实例个数*/
private int[] m_ClusterSizes;/**
* attribute min values
* 属性最小值*/
private double[] m_Min;/**
* attribute max values
* 属性最大值*/
private double[] m_Max;/**
* Keep track of the number of iterations completed before convergence
* 迭代次数*/
private int m_Iterations = 0;/**
* Holds the squared errors for all clusters
* 平方误差*/
private double[] m_squaredErrors;/**
* the default constructor
* 初始构造器*/
publicFuzzyCMeans () {
super();
m_SeedDefault= 10;//初始化种子个数
setSeed(m_SeedDefault);
}/**
* Returns a string describing this clusterer
* @return a description of the evaluator suitable for
* displaying in the explorer/experimenter gui
* 全局信息, 在图形介面显示*/
publicString globalInfo() {return "Cluster data using the fuzzy k means algorithm";
}/**
* Returns default capabilities of the clusterer.
*
* @return the capabilities of this clusterer
* 聚类容器*/
publicCapabilities getCapabilities() {
Capabilities result=super.getCapabilities();
result.disableAll();
result.enable(Capability.NO_CLASS);//attributes
result.enable(Capability.NUMERIC_ATTRIBUTES);
result.enable(Capability.MISSING_VALUES);returnresult;
}/**
* Generates a clusterer. Has to initialize all fields of the clusterer
* that are not being set via options.
*
* @param data set of instances serving as training data
* @throws Exception if the clusterer has not been
* generated successfully
* 聚类产生函数*/
public voidbuildClusterer(Instances data) throws Exception {//can clusterer handle the data?检测数据能否聚类
getCapabilities().testWithFail(data);
m_Iterations= 0;
m_ReplaceMissingFilter= newReplaceMissingValues();
Instances instances= new Instances(data);//实例
instances.setClassIndex(-1);
m_ReplaceMissingFilter.setInputFormat(instances);
instances=Filter.useFilter(instances, m_ReplaceMissingFilter);
m_Min= new double[instances.numAttributes()];
m_Max= new double[instances.numAttributes()];for (int i = 0; i < instances.numAttributes(); i++) {
m_Min[i]= m_Max[i] = Double.NaN;//随机分配不定值
}
m_ClusterCentroids= new Instances(instances, m_NumClusters);//聚类中心
int[] clusterAssignments = new int[instances.numInstances()];for (int i = 0; i < instances.numInstances(); i++) {
updateMinMax(instances.instance(i));//更新最大最小值
}
Random RandomO= new Random(getSeed());//随机数
intinstIndex;
HashMap initC= newHashMap();
DecisionTableHashKey hk= null;/*利用决策表随机生成聚类中心*/
for (int j = instances.numInstances() - 1; j >= 0; j--) {
instIndex= RandomO.nextInt(j+1);
hk= newDecisionTableHashKey(instances.instance(instIndex),
instances.numAttributes(),true);if (!initC.containsKey(hk)) {
m_ClusterCentroids.add(instances.instance(instIndex));
initC.put(hk,null);
}
instances.swap(j, instIndex);if (m_ClusterCentroids.numInstances() ==m_NumClusters) {break;
}
}
m_NumClusters= m_ClusterCentroids.numInstances();//聚类个数=聚类中心个数
D= new Matrix(solveD(instances).getArray());//求聚类中心到每个实例的距离
inti, j;int n =instances.numInstances();
Instances [] tempI= newInstances[m_NumClusters];
m_squaredErrors= new double[m_NumClusters];
m_ClusterNominalCounts= new int [m_NumClusters][instances.numAttributes()][0];
Matrix U= new Matrix(solveU(instances).getArray());//初始化隶属矩阵U
double q = 0;//初始化价值函数值
while (true) {
m_Iterations++;for (i = 0; i < instances.numInstances(); i++) {
Instance toCluster=instances.instance(i);int newC = clusterProcessedInstance(toCluster, true);//聚类处理实例,即输入的实例应该聚到哪一个簇?!
clusterAssignments[i]=newC;
}//update centroids 更新聚类中心
m_ClusterCentroids = newInstances(instances, m_NumClusters);for (i = 0; i < m_NumClusters; i++) {
tempI[i]= new Instances(instances, 0);
}for (i = 0; i < instances.numInstances(); i++) {
tempI[clusterAssignments[i]].add(instances.instance(i));
}for (i = 0; i < m_NumClusters; i++) {double[] vals = new double[instances.numAttributes()];for (j = 0; j < instances.numAttributes(); j++) {double sum1 = 0, sum2 = 0;for (int k = 0; k < n; k++) {
sum1+= U.get(i, k) * U.get(i, k) *instances.instance(k).value(j);
sum2+= U.get(i, k) * U.get(i, k);
}
vals[j]= sum1 /sum2;
}
m_ClusterCentroids.add(new Instance(1.0, vals));
}
D= newMatrix(solveD(instances).getArray());
U= new Matrix(solveU(instances).getArray());//计算新的聿属矩阵U
double q1 = 0;//新的价值函数值
for (i = 0; i < m_NumClusters; i++) {for (j = 0; j < n; j++) {/*计算价值函数值 即q1 += U(i,j)^m * d(i,j)^2*/q1+= Math.pow(U.get(i, j), getFuzzifier()) * D.get(i, j) * D.get(i, j);
}
}/*上次价值函数值的改变量(q1 -q)小于某个阀值(这里用机器精度:2.2204e-16)*/
if (q1 - q < 2.2204e-16) {break;
}
q=q1;
}/*计算标准差 跟K均值一样*/m_ClusterStdDevs= newInstances(instances, m_NumClusters);
m_ClusterSizes= new int[m_NumClusters];for (i = 0; i < m_NumClusters; i++) {double [] vals2 = new double[instances.numAttributes()];for (j = 0; j < instances.numAttributes(); j++) {if (instances.attribute(j).isNumeric()) {//判断属性是否是数值型的?!
vals2[j] =Math.sqrt(tempI[i].variance(j));
}else{
vals2[j]=Instance.missingValue();
}
}
m_ClusterStdDevs.add(new Instance(1.0, vals2));//1.0代表权值, vals2代表属性值
m_ClusterSizes[i] =tempI[i].numInstances();
}
}/**
* clusters an instance that has been through the filters
*
* @param instance the instance to assign a cluster to
* @param updateErrors if true, update the within clusters sum of errors
* @return a cluster number
* 聚类一个实例, 返回实例应属于哪一个簇的编号
* 首先计算输入的实例到所有聚类中心的距离, 哪里距离最小
* 这个实例就属于哪一个聚类中心所在簇*/
private intclusterProcessedInstance(Instance instance, boolean updateErrors) {double minDist =Integer.MAX_VALUE;int bestCluster = 0;for (int i = 0; i < m_NumClusters; i++) {double dist =distance(instance, m_ClusterCentroids.instance(i));if (dist
minDist=dist;
bestCluster=i;
}
}if(updateErrors) {
m_squaredErrors[bestCluster]+=minDist;
}returnbestCluster;
}/**
* Classifies a given instance.
*
* @param instance the instance to be assigned to a cluster
* @return the number of the assigned cluster as an interger
* if the class is enumerated, otherwise the predicted value
* @throws Exception if instance could not be classified
* successfully
* 分类一个实例, 调用clusterProcessedInstance()函数*/
public intclusterInstance(Instance instance) throws Exception {
m_ReplaceMissingFilter.input(instance);
m_ReplaceMissingFilter.batchFinished();
Instance inst=m_ReplaceMissingFilter.output();return clusterProcessedInstance(inst, false);
}/**
* 计算矩阵D, 即 d(i,j)=||c(i)-x(j)||*/
privateMatrix solveD(Instances instances) {int n =instances.numInstances();
Matrix D= newMatrix(m_NumClusters, n);for (int i = 0; i < m_NumClusters; i++) {for (int j = 0; j < n; j++) {
D.set(i, j, distance(instances.instance(j), m_ClusterCentroids.instance(i)));if (D.get(i, j) == 0) {
D.set(i, j, 0.000000000001);
}
}
}returnD;
}/**
* 计算聿属矩阵U, 即U(i,j) = 1 / sum(d(i,j)/ d(k,j))^(2/(m-1)*/
privateMatrix solveU(Instances instances) {int n =instances.numInstances();inti, j;
Matrix U= newMatrix(m_NumClusters, n);for (i = 0; i < m_NumClusters; i++) {for (j = 0; j < n; j++) {double sum = 0;for (int k = 0; k < m_NumClusters; k++) {//d(i,j)/d(k,j)^(2/(m-1)
sum += Math.pow(D.get(i, j) / D.get(k, j), 2 /(getFuzzifier() - 1));
}
U.set(i, j, Math.pow(sum, -1));
}
}returnU;
}/**
* Calculates the distance between two instances
*
* @param first the first instance
* @param second the second instance
* @return the distance between the two given instances
* 计算两个实例之间的距离, 返回欧几里德距离*/
private doubledistance(Instance first, Instance second) {doubleval1;doubleval2;double dist = 0.0;for (int i = 0; i
val1=first.value(i);
val2=second.value(i);
dist+= (val1 - val2) * (val1 -val2);
}
dist=Math.sqrt(dist);returndist;
}/**
* Updates the minimum and maximum values for all the attributes
* based on a new instance.
*
* @param instance the new instance
* 更新所有属性最大最小值, 跟K均值里的函数一样*/
private voidupdateMinMax(Instance instance) {for (int j = 0;j < m_ClusterCentroids.numAttributes(); j++) {if (!instance.isMissing(j)) {if(Double.isNaN(m_Min[j])) {
m_Min[j]=instance.value(j);
m_Max[j]=instance.value(j);
}else{if (instance.value(j)
m_Min[j]=instance.value(j);
}else{if (instance.value(j) >m_Max[j]) {
m_Max[j]=instance.value(j);
}
}
}
}
}
}/**
* Returns the number of clusters.
*
* @return the number of clusters generated for a training dataset.
* @throws Exception if number of clusters could not be returned
* successfully
* 返回聚类个数*/
public intnumberOfClusters() throws Exception {returnm_NumClusters;
}/**
* 返回模糊算子, 即加权指数
*
* @return 加权指数
* @throws Exception 加权指数不能成功返回*/
public doublefuzzifier() throws Exception {returnm_fuzzifier;
}/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options.
* 返回一个枚举描述的活动选项(菜单)*/
publicEnumeration listOptions () {
Vector result= newVector();
result.addElement(newOption("\tnumber of clusters.\n"
+ "\t(default 2).","N", 1, "-N "));
result.addElement(newOption("\texponent.\n"
+ "\t(default 2.0).","F", 1, "-F "));
Enumeration en=super.listOptions();while(en.hasMoreElements())
result.addElement(en.nextElement());returnresult.elements();
}/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
* 返回文本信息*/
publicString numClustersTipText() {return "set number of clusters";
}/**
* set the number of clusters to generate
*
* @param n the number of clusters to generate
* @throws Exception if number of clusters is negative
* 设置聚类个数*/
public void setNumClusters(intn) throws Exception {if (n <= 0) {throw new Exception("Number of clusters must be > 0");
}
m_NumClusters=n;
}/**
* gets the number of clusters to generate
*
* @return the number of clusters to generate
* 取聚类个数*/
public intgetNumClusters() {returnm_NumClusters;
}/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
* 返回文本信息*/
publicString fuzzifierTipText() {return "set fuzzifier";
}/**
* set the fuzzifier
*
* @param f fuzzifier
* @throws Exception if exponent is negative
* 设置模糊算子*/
public void setFuzzifier(doublef) throws Exception {if (f <= 1) {throw new Exception("F must be > 1");
}
m_fuzzifier=f;
}/**
* get the fuzzifier
*
* @return m_fuzzifier
* 取得模糊算子*/
public doublegetFuzzifier() {returnm_fuzzifier;
}/**
* Parses a given list of options.
*
* Valid options are:
*
*
-N <num>
* number of clusters.
* (default 2).
*
*
-F <num>
* fuzzifier.
* (default 2.0).
*
*
-S <num>
* Random number seed.
* (default 10)
*
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
* 设置活动选项*/
public voidsetOptions (String[] options)
throws Exception {
String optionString= Utils.getOption('N', options);if (optionString.length() != 0) {
setNumClusters(Integer.parseInt(optionString));
}
optionString= Utils.getOption('F', options);if (optionString.length() != 0) {
setFuzzifier((newDouble(optionString)).doubleValue());
}
super.setOptions(options);
}/**
* Gets the current settings of FuzzyCMeans
*
* @return an array of strings suitable for passing to setOptions()
* 取得活动选项*/
publicString[] getOptions () {inti;
Vector result;
String[] options;
result= newVector();
result.add("-N");
result.add("" +getNumClusters());
result.add("-F");
result.add("" +getFuzzifier());
options=super.getOptions();for (i = 0; i < options.length; i++)
result.add(options[i]);return (String[]) result.toArray(newString[result.size()]);
}/**
* return a string describing this clusterer
*
* @return a description of the clusterer as a string
* 结果显示*/
publicString toString() {int maxWidth = 0;for (int i = 0; i < m_NumClusters; i++) {for (int j = 0 ;j < m_ClusterCentroids.numAttributes(); j++) {if(m_ClusterCentroids.attribute(j).isNumeric()) {double width = Math.log(Math.abs(m_ClusterCentroids.instance(i).value(j))) /Math.log(10.0);
width+= 1.0;if ((int)width >maxWidth) {
maxWidth= (int)width;
}
}
}
}
StringBuffer temp= newStringBuffer();
String naString= "N/A";for (int i = 0; i < maxWidth+2; i++) {
naString+= " ";
}
temp.append("\nFuzzy C-means\n======\n");
temp.append("\nNumber of iterations:" + m_Iterations+"\n");
temp.append("Within cluster sum of squared errors:" +Utils.sum(m_squaredErrors));
temp.append("\n\nCluster centroids:\n");for (int i = 0; i < m_NumClusters; i++) {
temp.append("\nCluster"+i+"\n\t");
temp.append("\n\tStd Devs:");for (int j = 0; j < m_ClusterStdDevs.numAttributes(); j++) {if(m_ClusterStdDevs.attribute(j).isNumeric()) {
temp.append(" "+Utils.doubleToString(m_ClusterStdDevs.instance(i).value(j),
maxWidth+5, 4));
}else{
temp.append(" "+naString);
}
}
}
temp.append("\n\n");returntemp.toString();
}/**
* Gets the the cluster centroids
*
* @return the cluster centroids
* 取得聚类中心*/
publicInstances getClusterCentroids() {returnm_ClusterCentroids;
}/**
* Gets the standard deviations of the numeric attributes in each cluster
*
* @return the standard deviations of the numeric attributes
* in each cluster
* 聚得标准差*/
publicInstances getClusterStandardDevs() {returnm_ClusterStdDevs;
}/**
* Returns for each cluster the frequency counts for the values of each
* nominal attribute
*
* @return the counts*/
public int[][][] getClusterNominalCounts() {returnm_ClusterNominalCounts;
}/**
* Gets the squared error for all clusters
*
* @return the squared error
* 取得平方差*/
public doublegetSquaredError() {returnUtils.sum(m_squaredErrors);
}/**
* Gets the number of instances in each cluster
*
* @return The number of instances in each cluster
* 取每个簇的实例个数*/
public int[] getClusterSizes() {returnm_ClusterSizes;
}/**
* Main method for testing this class.
*
* @param argv should contain the following arguments:
* -t training file [-N number of clusters]
* 主函数*/
public static voidmain (String[] argv) {
runClusterer(newFuzzyCMeans (), argv);
}
}