weka java 分类算法_机器学习:weka中添加自己的分类和聚类算法

该博客介绍了如何在Weka中添加自定义的模糊C均值聚类算法。文章详细讲解了FuzzyCMeans类的实现,包括聚类中心的初始化、迭代过程、距离计算以及隶属度矩阵的更新等关键步骤,并提供了源代码示例。
摘要由CSDN通过智能技术生成

/** This program is free software; you can redistribute it and/or modify

* it under the terms of the GNU General Public License as published by

* the Free Software Foundation; either version 2 of the License, or

* (at your option) any later version.

*

* This program is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

* GNU General Public License for more details.

*

* You should have received a copy of the GNU General Public License

* along with this program; if not, write to the Free Software

* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.*/

/** FuzzyCMeans.java

* Copyright (C) 2007 Wei Xiaofei

**/package weka.clusterers;

import weka.classifiers.rules.DecisionTableHashKey;

import weka.core.Capabilities;

import weka.core.Instance;

import weka.core.Instances;

import weka.core.Option;

import weka.core.Utils;

import weka.core.WeightedInstancesHandler;

import weka.core.Capabilities.Capability;

import weka.core.matrix.Matrix;

import weka.filters.Filter;

import weka.filters.unsupervised.attribute.ReplaceMissingValues;

import java.util.Enumeration;

import java.util.HashMap;

import java.util.Random;

import java.util.Vector;/**

 * Clusters data using the fuzzy c-means (FCM) algorithm.
 *
 * <p>Valid options are:
 *
 * <pre> -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).</pre>
 *
 * <pre> -F &lt;num&gt;
 *  exponent (fuzzifier).
 *  (default 2).</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)</pre>
 *
 * @author Wei Xiaofei
 * @version 1.03
 * @see RandomizableClusterer
 */

public classFuzzyCMeans

extends RandomizableClusterer

implements NumberOfClustersRequestable, WeightedInstancesHandler {/** for serialization*/

static final long serialVersionUID = -2134543132156464L;/**

* replace missing values in training instances

* 替换训练集中的缺省值*/

privateReplaceMissingValues m_ReplaceMissingFilter;/**

* number of clusters to generate

* 产生聚类的个数*/

private int m_NumClusters = 2;/**

* D: d(i,j)=||c(i)-x(j)||为第i个聚类中心与第j个数据点间的欧几里德距离*/

privateMatrix D;//private Matrix U;

/**

* holds the fuzzifier

* 模糊算子(加权指数)*/

private double m_fuzzifier = 2;/**

* holds the cluster centroids

* 聚类中心*/

privateInstances m_ClusterCentroids;/**

* Holds the standard deviations of the numeric attributes in each cluster

* 每个聚类的标准差*/

privateInstances m_ClusterStdDevs;/**

* For each cluster, holds the frequency counts for the values of each

* nominal attribute*/

private int[][][] m_ClusterNominalCounts;/**

* The number of instances in each cluster

* 每个聚类包含的实例个数*/

private int[] m_ClusterSizes;/**

* attribute min values

* 属性最小值*/

private double[] m_Min;/**

* attribute max values

* 属性最大值*/

private double[] m_Max;/**

* Keep track of the number of iterations completed before convergence

* 迭代次数*/

private int m_Iterations = 0;/**

* Holds the squared errors for all clusters

* 平方误差*/

private double[] m_squaredErrors;/**

* the default constructor

* 初始构造器*/

publicFuzzyCMeans () {

super();

m_SeedDefault= 10;//初始化种子个数

setSeed(m_SeedDefault);

}/**

* Returns a string describing this clusterer

* @return a description of the evaluator suitable for

* displaying in the explorer/experimenter gui

* 全局信息, 在图形介面显示*/

publicString globalInfo() {return "Cluster data using the fuzzy k means algorithm";

}/**

* Returns default capabilities of the clusterer.

*

* @return the capabilities of this clusterer

* 聚类容器*/

publicCapabilities getCapabilities() {

Capabilities result=super.getCapabilities();

result.disableAll();

result.enable(Capability.NO_CLASS);//attributes

result.enable(Capability.NUMERIC_ATTRIBUTES);

result.enable(Capability.MISSING_VALUES);returnresult;

}/**

* Generates a clusterer. Has to initialize all fields of the clusterer

* that are not being set via options.

*

* @param data set of instances serving as training data

* @throws Exception if the clusterer has not been

* generated successfully

* 聚类产生函数*/

public voidbuildClusterer(Instances data) throws Exception {//can clusterer handle the data?检测数据能否聚类

getCapabilities().testWithFail(data);

m_Iterations= 0;

m_ReplaceMissingFilter= newReplaceMissingValues();

Instances instances= new Instances(data);//实例

instances.setClassIndex(-1);

m_ReplaceMissingFilter.setInputFormat(instances);

instances=Filter.useFilter(instances, m_ReplaceMissingFilter);

m_Min= new double[instances.numAttributes()];

m_Max= new double[instances.numAttributes()];for (int i = 0; i < instances.numAttributes(); i++) {

m_Min[i]= m_Max[i] = Double.NaN;//随机分配不定值

}

m_ClusterCentroids= new Instances(instances, m_NumClusters);//聚类中心

int[] clusterAssignments = new int[instances.numInstances()];for (int i = 0; i < instances.numInstances(); i++) {

updateMinMax(instances.instance(i));//更新最大最小值

}

Random RandomO= new Random(getSeed());//随机数

intinstIndex;

HashMap initC= newHashMap();

DecisionTableHashKey hk= null;/*利用决策表随机生成聚类中心*/

for (int j = instances.numInstances() - 1; j >= 0; j--) {

instIndex= RandomO.nextInt(j+1);

hk= newDecisionTableHashKey(instances.instance(instIndex),

instances.numAttributes(),true);if (!initC.containsKey(hk)) {

m_ClusterCentroids.add(instances.instance(instIndex));

initC.put(hk,null);

}

instances.swap(j, instIndex);if (m_ClusterCentroids.numInstances() ==m_NumClusters) {break;

}

}

m_NumClusters= m_ClusterCentroids.numInstances();//聚类个数=聚类中心个数

D= new Matrix(solveD(instances).getArray());//求聚类中心到每个实例的距离

inti, j;int n =instances.numInstances();

Instances [] tempI= newInstances[m_NumClusters];

m_squaredErrors= new double[m_NumClusters];

m_ClusterNominalCounts= new int [m_NumClusters][instances.numAttributes()][0];

Matrix U= new Matrix(solveU(instances).getArray());//初始化隶属矩阵U

double q = 0;//初始化价值函数值

while (true) {

m_Iterations++;for (i = 0; i < instances.numInstances(); i++) {

Instance toCluster=instances.instance(i);int newC = clusterProcessedInstance(toCluster, true);//聚类处理实例,即输入的实例应该聚到哪一个簇?!

clusterAssignments[i]=newC;

}//update centroids 更新聚类中心

m_ClusterCentroids = newInstances(instances, m_NumClusters);for (i = 0; i < m_NumClusters; i++) {

tempI[i]= new Instances(instances, 0);

}for (i = 0; i < instances.numInstances(); i++) {

tempI[clusterAssignments[i]].add(instances.instance(i));

}for (i = 0; i < m_NumClusters; i++) {double[] vals = new double[instances.numAttributes()];for (j = 0; j < instances.numAttributes(); j++) {double sum1 = 0, sum2 = 0;for (int k = 0; k < n; k++) {

sum1+= U.get(i, k) * U.get(i, k) *instances.instance(k).value(j);

sum2+= U.get(i, k) * U.get(i, k);

}

vals[j]= sum1 /sum2;

}

m_ClusterCentroids.add(new Instance(1.0, vals));

}

D= newMatrix(solveD(instances).getArray());

U= new Matrix(solveU(instances).getArray());//计算新的聿属矩阵U

double q1 = 0;//新的价值函数值

for (i = 0; i < m_NumClusters; i++) {for (j = 0; j < n; j++) {/*计算价值函数值 即q1 += U(i,j)^m * d(i,j)^2*/q1+= Math.pow(U.get(i, j), getFuzzifier()) * D.get(i, j) * D.get(i, j);

}

}/*上次价值函数值的改变量(q1 -q)小于某个阀值(这里用机器精度:2.2204e-16)*/

if (q1 - q < 2.2204e-16) {break;

}

q=q1;

}/*计算标准差 跟K均值一样*/m_ClusterStdDevs= newInstances(instances, m_NumClusters);

m_ClusterSizes= new int[m_NumClusters];for (i = 0; i < m_NumClusters; i++) {double [] vals2 = new double[instances.numAttributes()];for (j = 0; j < instances.numAttributes(); j++) {if (instances.attribute(j).isNumeric()) {//判断属性是否是数值型的?!

vals2[j] =Math.sqrt(tempI[i].variance(j));

}else{

vals2[j]=Instance.missingValue();

}

}

m_ClusterStdDevs.add(new Instance(1.0, vals2));//1.0代表权值, vals2代表属性值

m_ClusterSizes[i] =tempI[i].numInstances();

}

}/**

* clusters an instance that has been through the filters

*

* @param instance the instance to assign a cluster to

* @param updateErrors if true, update the within clusters sum of errors

* @return a cluster number

* 聚类一个实例, 返回实例应属于哪一个簇的编号

* 首先计算输入的实例到所有聚类中心的距离, 哪里距离最小

* 这个实例就属于哪一个聚类中心所在簇*/

private intclusterProcessedInstance(Instance instance, boolean updateErrors) {double minDist =Integer.MAX_VALUE;int bestCluster = 0;for (int i = 0; i < m_NumClusters; i++) {double dist =distance(instance, m_ClusterCentroids.instance(i));if (dist

minDist=dist;

bestCluster=i;

}

}if(updateErrors) {

m_squaredErrors[bestCluster]+=minDist;

}returnbestCluster;

}/**

* Classifies a given instance.

*

* @param instance the instance to be assigned to a cluster

* @return the number of the assigned cluster as an interger

* if the class is enumerated, otherwise the predicted value

* @throws Exception if instance could not be classified

* successfully

* 分类一个实例, 调用clusterProcessedInstance()函数*/

public intclusterInstance(Instance instance) throws Exception {

m_ReplaceMissingFilter.input(instance);

m_ReplaceMissingFilter.batchFinished();

Instance inst=m_ReplaceMissingFilter.output();return clusterProcessedInstance(inst, false);

}/**

* 计算矩阵D, 即 d(i,j)=||c(i)-x(j)||*/

privateMatrix solveD(Instances instances) {int n =instances.numInstances();

Matrix D= newMatrix(m_NumClusters, n);for (int i = 0; i < m_NumClusters; i++) {for (int j = 0; j < n; j++) {

D.set(i, j, distance(instances.instance(j), m_ClusterCentroids.instance(i)));if (D.get(i, j) == 0) {

D.set(i, j, 0.000000000001);

}

}

}returnD;

}/**

* 计算聿属矩阵U, 即U(i,j) = 1 / sum(d(i,j)/ d(k,j))^(2/(m-1)*/

privateMatrix solveU(Instances instances) {int n =instances.numInstances();inti, j;

Matrix U= newMatrix(m_NumClusters, n);for (i = 0; i < m_NumClusters; i++) {for (j = 0; j < n; j++) {double sum = 0;for (int k = 0; k < m_NumClusters; k++) {//d(i,j)/d(k,j)^(2/(m-1)

sum += Math.pow(D.get(i, j) / D.get(k, j), 2 /(getFuzzifier() - 1));

}

U.set(i, j, Math.pow(sum, -1));

}

}returnU;

}/**

* Calculates the distance between two instances

*

* @param first the first instance

* @param second the second instance

* @return the distance between the two given instances

* 计算两个实例之间的距离, 返回欧几里德距离*/

private doubledistance(Instance first, Instance second) {doubleval1;doubleval2;double dist = 0.0;for (int i = 0; i

val1=first.value(i);

val2=second.value(i);

dist+= (val1 - val2) * (val1 -val2);

}

dist=Math.sqrt(dist);returndist;

}/**

* Updates the minimum and maximum values for all the attributes

* based on a new instance.

*

* @param instance the new instance

* 更新所有属性最大最小值, 跟K均值里的函数一样*/

private voidupdateMinMax(Instance instance) {for (int j = 0;j < m_ClusterCentroids.numAttributes(); j++) {if (!instance.isMissing(j)) {if(Double.isNaN(m_Min[j])) {

m_Min[j]=instance.value(j);

m_Max[j]=instance.value(j);

}else{if (instance.value(j)

m_Min[j]=instance.value(j);

}else{if (instance.value(j) >m_Max[j]) {

m_Max[j]=instance.value(j);

}

}

}

}

}

}/**

* Returns the number of clusters.

*

* @return the number of clusters generated for a training dataset.

* @throws Exception if number of clusters could not be returned

* successfully

* 返回聚类个数*/

public intnumberOfClusters() throws Exception {returnm_NumClusters;

}/**

* 返回模糊算子, 即加权指数

*

* @return 加权指数

* @throws Exception 加权指数不能成功返回*/

public doublefuzzifier() throws Exception {returnm_fuzzifier;

}/**

* Returns an enumeration describing the available options.

*

* @return an enumeration of all the available options.

* 返回一个枚举描述的活动选项(菜单)*/

publicEnumeration listOptions () {

Vector result= newVector();

result.addElement(newOption("\tnumber of clusters.\n"

+ "\t(default 2).","N", 1, "-N "));

result.addElement(newOption("\texponent.\n"

+ "\t(default 2.0).","F", 1, "-F "));

Enumeration en=super.listOptions();while(en.hasMoreElements())

result.addElement(en.nextElement());returnresult.elements();

}/**

* Returns the tip text for this property

* @return tip text for this property suitable for

* displaying in the explorer/experimenter gui

* 返回文本信息*/

publicString numClustersTipText() {return "set number of clusters";

}/**

* set the number of clusters to generate

*

* @param n the number of clusters to generate

* @throws Exception if number of clusters is negative

* 设置聚类个数*/

public void setNumClusters(intn) throws Exception {if (n <= 0) {throw new Exception("Number of clusters must be > 0");

}

m_NumClusters=n;

}/**

* gets the number of clusters to generate

*

* @return the number of clusters to generate

* 取聚类个数*/

public intgetNumClusters() {returnm_NumClusters;

}/**

* Returns the tip text for this property

* @return tip text for this property suitable for

* displaying in the explorer/experimenter gui

* 返回文本信息*/

publicString fuzzifierTipText() {return "set fuzzifier";

}/**

* set the fuzzifier

*

* @param f fuzzifier

* @throws Exception if exponent is negative

* 设置模糊算子*/

public void setFuzzifier(doublef) throws Exception {if (f <= 1) {throw new Exception("F must be > 1");

}

m_fuzzifier=f;

}/**

* get the fuzzifier

*

* @return m_fuzzifier

* 取得模糊算子*/

public doublegetFuzzifier() {returnm_fuzzifier;

}/**

* Parses a given list of options.

*

* Valid options are:

*

*

 -N <num>

* number of clusters.

* (default 2).

*

*

 -F <num>

* fuzzifier.

* (default 2.0).

*

*

 -S <num>

* Random number seed.

* (default 10)

*

*

* @param options the list of options as an array of strings

* @throws Exception if an option is not supported

* 设置活动选项*/

public voidsetOptions (String[] options)

throws Exception {

String optionString= Utils.getOption('N', options);if (optionString.length() != 0) {

setNumClusters(Integer.parseInt(optionString));

}

optionString= Utils.getOption('F', options);if (optionString.length() != 0) {

setFuzzifier((newDouble(optionString)).doubleValue());

}

super.setOptions(options);

}/**

* Gets the current settings of FuzzyCMeans

*

* @return an array of strings suitable for passing to setOptions()

* 取得活动选项*/

publicString[] getOptions () {inti;

Vector result;

String[] options;

result= newVector();

result.add("-N");

result.add("" +getNumClusters());

result.add("-F");

result.add("" +getFuzzifier());

options=super.getOptions();for (i = 0; i < options.length; i++)

result.add(options[i]);return (String[]) result.toArray(newString[result.size()]);

}/**

* return a string describing this clusterer

*

* @return a description of the clusterer as a string

* 结果显示*/

publicString toString() {int maxWidth = 0;for (int i = 0; i < m_NumClusters; i++) {for (int j = 0 ;j < m_ClusterCentroids.numAttributes(); j++) {if(m_ClusterCentroids.attribute(j).isNumeric()) {double width = Math.log(Math.abs(m_ClusterCentroids.instance(i).value(j))) /Math.log(10.0);

width+= 1.0;if ((int)width >maxWidth) {

maxWidth= (int)width;

}

}

}

}

StringBuffer temp= newStringBuffer();

String naString= "N/A";for (int i = 0; i < maxWidth+2; i++) {

naString+= " ";

}

temp.append("\nFuzzy C-means\n======\n");

temp.append("\nNumber of iterations:" + m_Iterations+"\n");

temp.append("Within cluster sum of squared errors:" +Utils.sum(m_squaredErrors));

temp.append("\n\nCluster centroids:\n");for (int i = 0; i < m_NumClusters; i++) {

temp.append("\nCluster"+i+"\n\t");

temp.append("\n\tStd Devs:");for (int j = 0; j < m_ClusterStdDevs.numAttributes(); j++) {if(m_ClusterStdDevs.attribute(j).isNumeric()) {

temp.append(" "+Utils.doubleToString(m_ClusterStdDevs.instance(i).value(j),

maxWidth+5, 4));

}else{

temp.append(" "+naString);

}

}

}

temp.append("\n\n");returntemp.toString();

}/**

* Gets the the cluster centroids

*

* @return the cluster centroids

* 取得聚类中心*/

publicInstances getClusterCentroids() {returnm_ClusterCentroids;

}/**

* Gets the standard deviations of the numeric attributes in each cluster

*

* @return the standard deviations of the numeric attributes

* in each cluster

* 聚得标准差*/

publicInstances getClusterStandardDevs() {returnm_ClusterStdDevs;

}/**

* Returns for each cluster the frequency counts for the values of each

* nominal attribute

*

* @return the counts*/

public int[][][] getClusterNominalCounts() {returnm_ClusterNominalCounts;

}/**

* Gets the squared error for all clusters

*

* @return the squared error

* 取得平方差*/

public doublegetSquaredError() {returnUtils.sum(m_squaredErrors);

}/**

* Gets the number of instances in each cluster

*

* @return The number of instances in each cluster

* 取每个簇的实例个数*/

public int[] getClusterSizes() {returnm_ClusterSizes;

}/**

* Main method for testing this class.

*

* @param argv should contain the following arguments:

* -t training file [-N number of clusters]

* 主函数*/

public static voidmain (String[] argv) {

runClusterer(newFuzzyCMeans (), argv);

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值