weka是著名的数据挖掘工具,在这里有详细介绍,IDMer老师的博客里也有比较详细的用法描述。当然,如果直接使用weka自带的工具,自然没有问题,但是如果想在自己的平台框架中使用weka的功能呢?我这里放出一段当初学习weka源码时写的代码,主要演示如何调用weka的api。仅供参考,代码中有什么问题,欢迎邮件联系。
这里简单讲解一下流程。构造方法首先载入一个arff文件,然后调用doCluster()方法进行聚类。本文用到的arff文件(cpu.arff)是weka的data目录里自带的标准数据集,其主要格式见代码之后的示例。代码如下:
1/**
2 *
3 */
4package edu.tju.ikse.mi.util;
5
6import java.io.File;
7import java.io.FileNotFoundException;
8import java.io.IOException;
9import java.io.PrintWriter;
10import java.util.Scanner;
11
12import edu.tju.ikse.mi.anno.util.CfUtil;
13
14
15import weka.clusterers.XMeans;
16import weka.core.Instances;
17import weka.core.converters.ArffLoader;
18
19/**
20 * @author Jia Yu
21 * @date 2010-5-28
22 */
23public class WekaCluster {
24
25 /**
26 * @param args
27 */
28
29 private ArffLoader loader;
30 private Instances dataSet;
31 private weka.clusterers.Clusterer cluster;
32 private int numOfClusters;
33 private String newAttribute;
34 private File arffFile;
35 private int sizeOfDataset;
36
37 public WekaCluster(File arffFile) {
38 this.arffFile = arffFile;
39 doCluster();
40 }
41
42 private void doCluster() {
43 loader = new ArffLoader();
44 newAttribute = "";
45 try {
46 loader.setFile(arffFile);
47 dataSet = loader.getDataSet();
48 cluster = new XMeans();
49 cluster.buildClusterer(dataSet);
50 numOfClusters = cluster.numberOfClusters();
51 StringBuilder sb = new StringBuilder();
52 for (int i = 0; i < numOfClusters; i++) {
53 sb.append("s" + (i + 1) + " ");
54 }
55 newAttribute = sb.toString().trim();
56 sizeOfDataset = dataSet.numInstances();
57 } catch (Exception e) {
58 e.printStackTrace();
59 }
60 }
61
62 public void newArffWriter() {
63 int lineNum = 0;
64 try {
65 Scanner input = new Scanner(arffFile);
66 PrintWriter out = new PrintWriter(CfUtil
67 .GetFileNameNoExtFromFileName(arffFile.getName())
68 + "_classification.arff");
69
70 while (input.hasNext()) {
71 String line = input.nextLine();
72 if (line.startsWith("@relation")) {
73 out.println("@relation" + line.substring(9)
74 + "_classification");
75 } else if (line.startsWith("@data")) {
76 out.println("@attribute shape {" + newAttribute + "}");
77 out.println("@data");
78 } else if (line.startsWith("@attribute")) {
79 out.println(line);
80 } else if (line.isEmpty()) {
81 out.println();
82 } else {
83 line += ",class"
84 + (cluster.clusterInstance(dataSet
85 .instance(lineNum)) + 1);
86 out.println(line);
87 lineNum++;
88 }
89 }
90 out.close();
91 } catch (FileNotFoundException e) {
92 e.printStackTrace();
93 } catch (Exception e) {
94 e.printStackTrace();
95 }
96 }
97
98 public int clusterNewInstance(weka.core.Instance instance) {
99 int indexOfCluster = -1;
100 try {
101 indexOfCluster = cluster.clusterInstance(instance);
102 //System.out.println("cluster " + indexOfCluster);
103 } catch (Exception e) {
104 e.printStackTrace();
105 }
106 return indexOfCluster;
107 }
108
109 public double[] frequencyOfCluster() {
110 int[] sum = new int[this.numOfClusters];
111 try {
112 for (int i = 0; i < this.sizeOfDataset; i++) {
113 sum[cluster.clusterInstance(dataSet.instance(i))]++;
114 }
115 } catch (Exception e) {
116 e.printStackTrace();
117 }
118 double[] fre = new double[sum.length];
119 for (int i = 0; i < sum.length; i++) {
120 fre[i] = (double)sum[i] / (double)this.sizeOfDataset;
121 }
122 return fre;
123 }
124
125 public static void main(String[] args) {
126 File file = new File("cpu.arff");
127 WekaCluster wc = new WekaCluster(file);
128 double[] fre = wc.frequencyOfCluster();
129 for(int i=0;i<fre.length;i++)
130 System.out.println(fre[i]);
131 // wc.newArffWriter(file);
132 double[] feature = { 125,256,6000,256,16,128,199 };
133 weka.core.Instance ins = new weka.core.Instance(7);
134 for (int i = 0; i < ins.numAttributes(); i++) {
135 ins.setValue(i, feature[i]);
136 // System.out.println(ins.attribute(i).getLowerNumericBound());
137 }
138 System.out.println("cluster in : "+wc.clusterNewInstance(ins));
139 }
140
141}
142
@relation 'cpu'
@attribute MYCT real
@attribute MMIN real
@attribute MMAX real
@attribute CACH real
@attribute CHMIN real
@attribute CHMAX real
@attribute class real
@data
125,256,6000,256,16,128,199
29,8000,32000,32,8,32,253
29,8000,32000,32,8,32,253
这里摘取了3项。运行程序执行结果如下:
0.03827751196172249
0.16267942583732056
0.69377990430622
0.10526315789473684
cluster in : 0
表示聚类方法将数据集聚为四类,程序中提供的instance被聚到第一类(下标为0)里。前四行依次是每一类在整个数据集中所占的比例。
具体的数据挖掘的内容就不在这里讲述了。只是为大家提供一个weka的java用法实现。方便在程序中使用weka。