A while ago, when I was just getting started with data mining, my senior asked me to call Weka's classic algorithms from Java so they could later be used for algorithm comparison in experiments. Since I hadn't been learning Java for long at the time and wasn't familiar with this open-source software, I searched online for material on how to call them, but found almost nothing. So I'll try to fill that small gap here (or maybe my search skills were just poor). My technical vocabulary is limited, so please point out any inaccurate descriptions. Also, a few of the results produced by these calls don't quite match Weka's own results; advice on that is welcome too. This post is mainly meant as a starting point for discussion.
Classification algorithms:
1. Calling C4.5
For the classification algorithms we additionally compute the classification accuracy. The implementation code, with comments, is given below; the classification algorithms share most of their implementation, so I won't repeat these points for the later ones:
One thing that must be pointed out: setting the index of the class attribute is mandatory.
import weka.classifiers.*;
import weka.core.Instances;
import weka.core.converters.*;
import weka.classifiers.trees.J48; // J48 is Weka's implementation of C4.5
import java.io.File;
import java.io.IOException;

public class callC45
{
    public callC45()
    {}

    public void Main() throws Exception
    {
        J48 m_classifier = new J48();
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.with.vendor.arff"); // training file
        ArffLoader atf = new ArffLoader();
        atf.setFile(inputFile);
        Instances instancesTrain = atf.getDataSet(); // load the training set
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.with.vendor.arff"); // test file
        atf.setFile(inputFile);
        Instances instancesTest = atf.getDataSet(); // load the test set
        instancesTest.setClassIndex(0); // set the index of the class attribute (the first attribute is index 0); instancesTest.numAttributes() gives the total number of attributes
        double sum = instancesTest.numInstances(), right = 0.0f; // number of test instances
        instancesTrain.setClassIndex(0);
        m_classifier.buildClassifier(instancesTrain);
        System.out.println(m_classifier.toString());
        System.out.println("");
        for(int i = 0; i < sum; i++)
        {
            if(m_classifier.classifyInstance(instancesTest.instance(i)) == instancesTest.instance(i).classValue()) // the prediction matches the label (the class column of the test file must hold the true answers for the result to be meaningful)
            {
                right++;
            }
        }
        System.out.println("J48 classification accuracy:" + (right/sum));
    }

    // public static void main(String[] args) throws Exception
    // {
    //     callC45 a = new callC45();
    //     a.Main();
    // }
}
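One caveat: the listing above trains and tests on the same ARFF file, so it measures accuracy on the training data rather than generalization. As a minimal sketch of a fairer setup, Instances provides trainCV/testCV for fold-based splits (the class name SplitDemo and the path data.arff are placeholders):

import java.util.Random;
import java.io.File;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class SplitDemo
{
    public static void main(String[] args) throws Exception
    {
        ArffLoader atf = new ArffLoader();
        atf.setFile(new File("data.arff")); // placeholder path
        Instances data = atf.getDataSet();
        data.setClassIndex(0);
        data.randomize(new Random(1));        // shuffle before splitting
        Instances train = data.trainCV(3, 0); // 2/3 of the data for training
        Instances test  = data.testCV(3, 0);  // the remaining 1/3 for testing
        J48 c45 = new J48();
        c45.buildClassifier(train);
        double right = 0;
        for(int i = 0; i < test.numInstances(); i++)
            if(c45.classifyInstance(test.instance(i)) == test.instance(i).classValue())
                right++;
        System.out.println("held-out accuracy: " + right / test.numInstances());
    }
}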
2. Calling the AdaBoost algorithm
import weka.classifiers.meta.AdaBoostM1;
import weka.classifiers.*;
import weka.core.converters.*;
import weka.core.Instances;
import java.io.*;

public class callAdaBoostM1
{
    public callAdaBoostM1()
    {
    }

    public void Main() throws Exception
    {
        AdaBoostM1 m_classifier = new AdaBoostM1();
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.with.vendor.arff"); // training file
        ArffLoader arf = new ArffLoader();
        arf.setFile(inputFile);
        Instances instancesTrain = arf.getDataSet();
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.with.vendor.arff"); // test file
        arf.setFile(inputFile);
        Instances instancesTest = arf.getDataSet();
        instancesTest.setClassIndex(0); // set the index of the class attribute
        double sum = instancesTest.numInstances(), right = 0.0f;
        instancesTrain.setClassIndex(0);
        m_classifier.buildClassifier(instancesTrain);
        System.out.println(m_classifier.toString());
        System.out.println("");
        for(int i = 0; i < sum; i++)
        {
            if(m_classifier.classifyInstance(instancesTest.instance(i)) == instancesTest.instance(i).classValue())
            {
                right++;
            }
        }
        System.out.println("AdaBoostM1 classification accuracy:" + (right/sum));
    }

    public static void main(String[] args) throws Exception
    {
        callAdaBoostM1 a = new callAdaBoostM1();
        a.Main();
    }
}
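By default AdaBoostM1 boosts decision stumps. If you want to boost a different base learner, it exposes setClassifier and setNumIterations; a small fragment, reusing instancesTrain from the listing above (J48 and 20 rounds are just example choices):

AdaBoostM1 boosted = new AdaBoostM1();
boosted.setClassifier(new weka.classifiers.trees.J48()); // boost C4.5 trees instead of the default DecisionStump
boosted.setNumIterations(20);                            // number of boosting rounds (example value)
boosted.buildClassifier(instancesTrain);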
3. Calling the Naive Bayes algorithm
import java.io.*;
import weka.classifiers.*;
import weka.classifiers.bayes.*;
import weka.core.Instances;
import weka.core.converters.*;

public class callbayes
{
    public callbayes()
    {}

    public void Main() throws Exception
    {
        NaiveBayes m_classifier = new NaiveBayes(); // build a Naive Bayes classifier
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\labor.arff");
        ArffLoader atf = new ArffLoader();
        atf.setFile(inputFile);
        Instances instancesTrain = atf.getDataSet();
        instancesTrain.setClassIndex(instancesTrain.numAttributes()-1); // here the class attribute is the last one
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\labor.arff");
        atf.setFile(inputFile);
        Instances instancesTest = atf.getDataSet();
        instancesTest.setClassIndex(instancesTest.numAttributes()-1);
        double sum = instancesTest.numInstances(), right = 0.0f;
        m_classifier.buildClassifier(instancesTrain);
        System.out.println(m_classifier.toString());
        System.out.println("");
        for(int i = 0; i < sum; i++)
        {
            if(m_classifier.classifyInstance(instancesTest.instance(i)) == instancesTest.instance(i).classValue())
            {
                right++;
            }
        }
        System.out.println("NaiveBayes classification accuracy:" + (right/sum));
    }

    // public static void main(String[] args) throws Exception
    // {
    //     callbayes a = new callbayes();
    //     a.Main();
    // }
}
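Beyond the hard label returned by classifyInstance, a Naive Bayes model can give the full class distribution via distributionForInstance, which every Weka classifier supports. A small fragment reusing the variables from the listing above:

double[] dist = m_classifier.distributionForInstance(instancesTest.instance(0)); // posterior over all classes for the first test instance
for(int c = 0; c < dist.length; c++)
{
    System.out.println("P(" + instancesTest.classAttribute().value(c) + ") = " + dist[c]);
}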
4. Calling the KNN algorithm:
import weka.classifiers.*;
import weka.classifiers.lazy.IBk;
import weka.core.converters.*;
import weka.core.Instances;
import java.io.*;

public class callKNN
{
    public callKNN()
    {}

    public void Main() throws Exception
    {
        IBk m_classifier = new IBk(3); // k-nearest neighbours with k = 3
        // int k = 3;
        // Classifier m_classifier = new IBk(k);
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.with.vendor.arff");
        ArffLoader arf = new ArffLoader();
        arf.setFile(inputFile);
        Instances instancesTrain = arf.getDataSet();
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.with.vendor.arff");
        arf.setFile(inputFile);
        Instances instancesTest = arf.getDataSet();
        instancesTest.setClassIndex(0);
        double sum = instancesTest.numInstances(), right = 0.0f;
        instancesTrain.setClassIndex(0);
        m_classifier.buildClassifier(instancesTrain);
        System.out.println("The k is : " + m_classifier.getKNN());
        System.out.println("");
        System.out.println(m_classifier.toString());
        System.out.println("");
        for(int i = 0; i < sum; i++)
        {
            if(m_classifier.classifyInstance(instancesTest.instance(i)) == instancesTest.instance(i).classValue())
            {
                right++;
            }
        }
        System.out.println("KNN classification accuracy:" + (right/sum));
    }

    public static void main(String[] args) throws Exception
    {
        callKNN a = new callKNN();
        a.Main();
    }
}
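IBk can also choose k by itself: if I understand the API correctly, setting an upper bound with setKNN and enabling setCrossValidate makes it pick the best k up to that bound by hold-one-out cross-validation. A fragment reusing instancesTrain from above (the bound of 10 is arbitrary):

IBk knn = new IBk();
knn.setKNN(10);             // upper bound on k
knn.setCrossValidate(true); // select the best k <= 10 by hold-one-out cross-validation
knn.buildClassifier(instancesTrain);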
5. Calling the ID3 algorithm
import weka.classifiers.*;
import weka.classifiers.trees.Id3;
import weka.core.Instances;
import weka.core.converters.*;
import java.io.*;

public class callId3
{
    public callId3()
    {}

    public void Main() throws Exception
    {
        Id3 m_classifier = new Id3();
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\contact-lenses.arff");
        ArffLoader atf = new ArffLoader();
        atf.setFile(inputFile);
        Instances instancesTrain = atf.getDataSet();
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\contact-lenses.arff");
        atf.setFile(inputFile);
        Instances instancesTest = atf.getDataSet();
        instancesTest.setClassIndex(0);
        double sum = instancesTest.numInstances(), right = 0.0f;
        instancesTrain.setClassIndex(0);
        m_classifier.buildClassifier(instancesTrain);
        System.out.println(m_classifier.toString());
        System.out.println("");
        for(int i = 0; i < sum; i++)
        {
            if(m_classifier.classifyInstance(instancesTest.instance(i)) == instancesTest.instance(i).classValue())
            {
                right++;
            }
        }
        System.out.println("Id3 classification accuracy:" + (right/sum));
    }

    // public static void main(String[] args) throws Exception
    // {
    //     callId3 a = new callId3();
    //     a.Main();
    // }
}
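Note that Weka's Id3 accepts only nominal attributes, which is presumably why this example uses contact-lenses.arff rather than the cpu data. If your data contains numeric attributes, one option is to discretize them first; a sketch using the unsupervised Discretize filter, reusing instancesTrain from above (imports shown for completeness):

import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Discretize;

Discretize disc = new Discretize();
disc.setInputFormat(instancesTrain);                             // learn the bin boundaries from the data
Instances nominalTrain = Filter.useFilter(instancesTrain, disc); // numeric attributes become nominal bins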
Clustering algorithms:
Here I mainly call the EM and k-means algorithms; SVM is set aside for now since I haven't learned to use libsvm yet. The difference when calling a clustering algorithm, compared with a classification algorithm, is that there is no need to set the class attribute index:
1. Calling the EM algorithm
import weka.clusterers.EM;
import weka.core.Instances;
import weka.core.converters.*;
import weka.clusterers.*;
import java.io.*;

public class callEM
{
    public callEM()
    {
    }

    public void Main() throws Exception
    {
        EM m_cluster = new EM();
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.arff");
        ArffLoader arf = new ArffLoader();
        arf.setFile(inputFile);
        Instances instancesTrain = arf.getDataSet();
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.arff");
        arf.setFile(inputFile);
        Instances instancesTest = arf.getDataSet(); // loaded but not used below
        m_cluster.buildClusterer(instancesTrain);
        System.out.println("The number of cluster : " + m_cluster.numberOfClusters());
        int num = m_cluster.numberOfClusters();
        System.out.println("");
        System.out.println(m_cluster.toString());
        System.out.println("");
        double[] predict = m_cluster.clusterPriors(); // prior probability of each cluster
        for(int i = 0; i < num; i++)
        {
            System.out.println("Prior of cluster " + i + " : " + predict[i]);
        }
    }

    public static void main(String[] args) throws Exception
    {
        callEM a = new callEM();
        a.Main();
    }
}
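Besides the cluster priors, the fitted model can report which cluster each instance most likely belongs to via clusterInstance. A fragment reusing the variables from the listing above:

for(int i = 0; i < instancesTest.numInstances(); i++)
{
    int cluster = m_cluster.clusterInstance(instancesTest.instance(i)); // index of the most likely cluster
    System.out.println("instance " + i + " -> cluster " + cluster);
}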
2. Calling the k-means (SimpleKMeans) algorithm
import weka.clusterers.*;
import weka.core.converters.*;
import weka.core.Instances;
import java.io.*;

public class callKM
{
    public callKM()
    {
    }

    public void Main() throws Exception
    {
        SimpleKMeans m_cluster = new SimpleKMeans();
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.arff");
        ArffLoader arf = new ArffLoader();
        arf.setFile(inputFile);
        Instances instancesTrain = arf.getDataSet();
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\cpu.arff");
        arf.setFile(inputFile);
        Instances instancesTest = arf.getDataSet(); // loaded but not used below
        m_cluster.buildClusterer(instancesTrain);
        System.out.println("The number of cluster : " + m_cluster.numberOfClusters());
        int num = m_cluster.numberOfClusters();
        System.out.println("");
        System.out.println(m_cluster.toString());
        System.out.println("");
        int[] size = m_cluster.getClusterSizes(); // number of instances assigned to each cluster
        int sum = 0;
        for(int i = 0; i < num; i++)
        {
            sum += size[i];
        }
        for(int i = 0; i < num; i++)
        {
            System.out.println("Size of cluster " + i + " : " + size[i] + ", proportion : " + (double)size[i]/(double)sum);
        }
    }

    public static void main(String[] args) throws Exception
    {
        callKM a = new callKM();
        a.Main();
    }
}
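SimpleKMeans defaults to two clusters; to get a different k, set it before building. A fragment reusing instancesTrain from above (k = 3 and the seed are just example values):

SimpleKMeans km = new SimpleKMeans();
km.setNumClusters(3); // number of clusters k (example value)
km.setSeed(10);       // seed for choosing the random initial centroids
km.buildClusterer(instancesTrain);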
Association rule algorithms
I only call the Apriori algorithm here; the calling procedure is basically the same as for the clustering algorithms.
1. Calling the Apriori algorithm
import weka.associations.*;
import weka.core.converters.*;
import weka.core.Instances;
import java.io.*;

public class callApriori
{
    public callApriori()
    {
    }

    public void Main() throws Exception
    {
        Apriori m_association = new Apriori();
        File inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\contact-lenses.arff");
        ArffLoader arf = new ArffLoader();
        arf.setFile(inputFile);
        Instances instancesTrain = arf.getDataSet();
        inputFile = new File("E:\\资料\\数据挖掘\\weka-3-5-8\\data\\contact-lenses.arff");
        arf.setFile(inputFile);
        Instances instancesTest = arf.getDataSet();
        m_association.buildAssociations(instancesTrain);
        System.out.println("The Number of Rules : " + m_association.getNumRules());
        System.out.println(m_association.toString());
        System.out.println("");
    }

    // public static void main(String[] args) throws Exception
    // {
    //     callApriori a = new callApriori();
    //     a.Main();
    // }
}
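Apriori's defaults (number of rules reported, minimum confidence, minimum support) can also be adjusted through its setters before calling buildAssociations; a fragment reusing instancesTrain from above, with example values only:

Apriori apriori = new Apriori();
apriori.setNumRules(20);              // report up to 20 rules (example value)
apriori.setMinMetric(0.8);            // minimum confidence for a rule (example value)
apriori.setLowerBoundMinSupport(0.2); // minimum support (example value)
apriori.buildAssociations(instancesTrain);
System.out.println(apriori.toString());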
One last point: there are many ways to call these algorithms, and the functions you call should be tailored to the task at hand; never apply the same recipe everywhere.
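For instance, instead of the hand-rolled accuracy loop used throughout this post, Weka's built-in weka.classifiers.Evaluation class can evaluate any classifier by cross-validation and print a full report. A minimal sketch with 10 folds (the class name CrossValidateDemo and the path data.arff are placeholders):

import java.util.Random;
import java.io.File;
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class CrossValidateDemo
{
    public static void main(String[] args) throws Exception
    {
        ArffLoader atf = new ArffLoader();
        atf.setFile(new File("data.arff")); // placeholder path
        Instances data = atf.getDataSet();
        data.setClassIndex(data.numAttributes() - 1); // class attribute is often the last one
        Evaluation eval = new Evaluation(data);
        eval.crossValidateModel(new J48(), data, 10, new Random(1)); // 10-fold cross-validation
        System.out.println(eval.toSummaryString()); // accuracy, kappa, error rates, ...
    }
}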