WEKA学习笔记

最新推荐文章于 2022-06-13 14:50:04 发布

wintersense

最新推荐文章于 2022-06-13 14:50:04 发布

阅读量1.3k

点赞数

本文链接：https://blog.csdn.net/wintersense/article/details/41624671

版权

最近项目要用机器学习算法，老师推荐我使用WEKA来做。于是又重新捡起Java，开始WEKA的学习。下面是WEKA中EM聚类的一个例子：

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;
 
/**
 * Classes-to-clusters evaluation of Weka's EM clusterer.
 *
 * <p>Background (from the reference below): EM is a model-based clustering algorithm. It assumes
 * the samples follow a Gaussian mixture model and estimates the parameters of each Gaussian
 * component so that the mixture fits the given data. The result is a fuzzy clustering: every
 * sample belongs to every component with some probability, and those probabilities are derived
 * from the estimated parameters.
 *
 * <p>Reference: http://irwenqiang.iteye.com/blog/1601902
 */
public class ClassesToClusters {
  /** Data set used when no path is supplied on the command line. */
  private static final String DEFAULT_DATA_FILE =
      "C:\\Program Files\\Weka-3-7\\data\\breast-cancer.arff";

  /**
   * Clusters a data set with EM (class attribute removed) and prints how the resulting clusters
   * line up with the real class labels.
   *
   * @param args optional; {@code args[0]} is the path of the data file to load. When absent, the
   *     breast-cancer sample from the default Weka 3.7 install location is used.
   * @throws Exception if the data cannot be read or Weka fails while filtering, clustering or
   *     evaluating
   */
  public static void main(String[] args) throws Exception {
    String dataFile = args.length > 0 ? args[0] : DEFAULT_DATA_FILE;

    // Load the data; the last attribute is treated as the class label.
    Instances data = DataSource.read(dataFile);
    data.setClassIndex(data.numAttributes() - 1);

    // Build the clusterer's input: the same data without the class attribute.
    // classIndex() is 0-based while the Remove filter takes 1-based indices, hence the + 1.
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (data.classIndex() + 1));
    filter.setInputFormat(data);
    Instances dataClusterer = Filter.useFilter(data, filter);

    // Train the clusterer (set further EM options here if necessary).
    EM clusterer = new EM();
    clusterer.buildClusterer(dataClusterer);

    // Evaluate on the ORIGINAL data (class attribute still set) so the evaluation can compare
    // the clusters against the class labels.
    ClusterEvaluation eval = new ClusterEvaluation();
    eval.setClusterer(clusterer);
    eval.evaluateClusterer(data);

    // Print the results.
    System.out.println(eval.clusterResultsToString());
  }
}

运行结果：

///

EM
==

Number of clusters selected by cross validation: 3
Number of iterations performed: 82

Cluster
Attribute 0 1 2
(0.41) (0.23) (0.37)
=========================================
age
10-19 1 1 1
20-29 1.9894 1.0066 1.004
30-39 25.8716 11.6952 1.4332
40-49 66.7647 21.7491 4.4862
50-59 25.8874 24.5642 48.5484
60-69 1.039 10.736 48.225
70-79 1.0231 1.007 6.97
80-89 1 1 1
90-99 1 1 1
[total] 125.5752 73.7581 113.6667
menopause
lt40 1.9215 1.0119 7.0666
ge40 3.2754 30.0609 98.6637
premeno 114.3783 36.6853 1.9364
[total] 119.5752 67.7581 107.6667
tumor-size
0-4 5.0406 1.0038 4.9556
5-9 3.0222 1.0045 2.9734
10-14 14.8642 2.0109 14.1249
15-19 11.7759 4.3034 16.9207
20-24 21.0871 11.3648 20.548
25-29 29.3577 12.2918 15.3504
30-34 21.1749 22.1385 19.6865
35-39 8.1272 8.3987 5.4741
40-44 8.0648 8.3031 8.6321
45-49 1.9784 2.0065 2.0151
50-54 3.0822 2.932 4.9857
55-59 1 1 1
[total] 128.5752 76.7581 116.6667
inv-nodes
0-2 109.9892 8.4901 97.5207
3-5 6.4095 25.6041 6.9864
6-8 2.2449 16.6243 1.1307
9-11 1.2779 9.2861 2.436
12-14 1.6263 3.3732 1.0005
15-17 1.0274 6.3944 1.5782
18-20 1 1 1
21-23 1 1 1
24-26 1 1.9859 1.0141
27-29 1 1 1
30-32 1 1 1
33-35 1 1 1
36-39 1 1 1
[total] 129.5752 77.7581 117.6667
node-caps
yes 2.8887 53.3958 2.7155
no 115.6865 13.3622 103.9512
[total] 118.5752 66.7581 106.6667
deg-malig
1 33.9859 1.0677 38.9464
2 64.4173 28.4472 40.1355
3 21.1721 38.2431 28.5848
[total] 119.5752 67.7581 107.6667
breast
left 62.467 35.0889 57.444
right 56.1082 31.6691 49.2227
[total] 118.5752 66.7581 106.6667
breast-quad
left_up 35.6186 21.7116 42.6698
left_low 46.0633 27.4039 40.5328
right_up 14.2991 10.6849 11.016
right_low 15.6537 6.4464 4.8999
central 9.9407 3.5112 10.5481
[total] 121.5752 69.7581 109.6667
irradiat
yes 18.9252 37.0251 15.0497
no 99.65 29.733 91.617
[total] 118.5752 66.7581 106.6667
Clustered Instances

0 117 ( 41%)
1 62 ( 22%)
2 107 ( 37%)

Log likelihood: -8.81408

Class attribute: Class
Classes to Clusters:

0 1 2 <-- assigned to cluster
88 27 86 | no-recurrence-events
29 35 21 | recurrence-events

Cluster 0 <-- no-recurrence-events
Cluster 1 <-- recurrence-events
Cluster 2 <-- No class

Incorrectly clustered instances : 163.0 56.993 %
///

下面的小程序说明了如何取出arff文件中的data数据。

import java.io.FileReader;
import weka.core.Instances;

/**
 * desc: tries out Weka's basic data container, the {@code Instances} class, by loading an ARFF
 * file and printing its rows.
 * <code>InstanceTest</code>
 * @version 1.0 2011/12/13
 * @author chenwq
 *
 */
public class InstanceTest {
	/** Data set used when no path is supplied on the command line. */
	private static final String DEFAULT_DATA_FILE =
			"C:\\Program Files\\Weka-3-7\\data\\breast-cancer.arff";

	/**
	 * Loads a data set from an ARFF file.
	 *
	 * @param fileName path of the ARFF file to read
	 * @return the data set built from the file contents
	 * @throws Exception if the file cannot be opened or read, or Weka cannot parse it
	 */
	public static Instances getFileInstances(String fileName) throws Exception {
		// FileReader is a character stream over the file (platform default charset).
		FileReader frData = new FileReader(fileName);
		try {
			return new Instances(frData);
		} finally {
			// Always release the file handle, even when parsing fails.
			frData.close();
		}
	}

	/**
	 * Prints every instance of the data set, one per line.
	 *
	 * @param args optional; {@code args[0]} is the path of the ARFF file to load. When absent,
	 *     the breast-cancer sample from the default Weka 3.7 install location is used.
	 * @throws Exception if the data set cannot be loaded
	 */
	public static void main(String[] args) throws Exception {
		String dataFile = args.length > 0 ? args[0] : DEFAULT_DATA_FILE;
		Instances instances = getFileInstances(dataFile);

		// Printing the Instances object itself would dump the whole data set at once;
		// here each sample is printed individually instead.

		// numInstances() gives the number of samples in the data set.
		for (int i = 0; i < instances.numInstances(); i++) {

			// instance(i) returns the i-th sample.
			System.out.println(instances.instance(i));
		}
	}
}

/**
 * desc:试试Weka的决策树类
 * <code>J48Test</code>
 * @version 1.0 2011/12/13
 * @author chenwq
 *
 */
import java.io.File;
import java.io.IOException;

import weka.classifiers.Classifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class J48Test {
	/** Data set used for both training and testing when no paths are supplied. */
	private static final String DEFAULT_DATA_FILE =
			"C:\\Program Files\\Weka-3-7\\data\\breast-cancer.arff";

	/**
	 * Trains a J48 decision tree and prints the fraction of test instances it labels correctly.
	 *
	 * <p>With the defaults the tree is tested on the same file it was trained on, so the printed
	 * figure is a training-set score, not an estimate of how well the tree generalizes. The
	 * result is only meaningful if the class column of the test file holds the correct answers.
	 *
	 * @param args optional; {@code args[0]} is the training ARFF file and {@code args[1]} the
	 *     test ARFF file. A missing test file defaults to the training file; a missing training
	 *     file defaults to the breast-cancer sample of a default Weka 3.7 install.
	 * @throws Exception if a file cannot be read or Weka fails to train or classify
	 */
	public static void main(String[] args) throws Exception {
		String trainFile = args.length > 0 ? args[0] : DEFAULT_DATA_FILE;
		String testFile = args.length > 1 ? args[1] : trainFile;

		Instances instancesTrain = loadArff(new File(trainFile)); // training data
		Instances instancesTest = loadArff(new File(testFile)); // test data

		// The class label is the LAST attribute (ARFF convention; "Class" in breast-cancer.arff).
		// Index 0 would make the tree predict the first attribute instead of the label.
		instancesTrain.setClassIndex(instancesTrain.numAttributes() - 1);
		instancesTest.setClassIndex(instancesTest.numAttributes() - 1);

		Classifier classifier = new J48();
		classifier.buildClassifier(instancesTrain); // train

		int total = instancesTest.numInstances(); // number of test instances
		int correct = 0;
		for (int i = 0; i < total; i++) {
			// Count the instance when the predicted class equals the recorded class.
			double predicted = classifier.classifyInstance(instancesTest.instance(i));
			if (predicted == instancesTest.instance(i).classValue()) {
				correct++;
			}
		}
		System.out.println("J48 classification precision:" + ((double) correct / total));
	}

	/** Reads a complete data set from the given ARFF file. */
	private static Instances loadArff(File arffFile) throws IOException {
		ArffLoader loader = new ArffLoader();
		loader.setFile(arffFile);
		return loader.getDataSet();
	}
}

（未完待续...）