1、weka数据集基本概念
weka.core.Instances - 保存一个完整的数据集。这个数据结构是基于行的;可以通过 instance(int) 方法(索引从 0 开始)获取其中的某一行。列的信息可以通过 attribute(int) 方法获得,该方法返回 weka.core.Attribute 对象。
weka.core.Instance - 封装了单独的一行。它基本上是对一个 double 基本类型数组的包装。因为这个类不包含任何关于列类型的信息,所以它总是需要访问一个 weka.core.Instances 对象。
weka.core.Attribute - 保存数据集中单个列的类型信息。它存储属性的类型,以及标称属性的标签、字符串属性可能的取值或关系属性的数据集(关系属性的数据集同样是 weka.core.Instances 对象)。
2、手动构建weka数据输入Instances
/**
 * Builds a one-row example dataset named "race" entirely in memory.
 *
 * <p>The dataset has three attributes, in this order: "length" and "weight"
 * (both created with the name-only {@code Attribute} constructor) and the
 * nominal attribute "position" with the labels "first", "second" and "third".
 * "position" is made the class attribute.
 *
 * @return the "race" dataset holding a single instance with
 *         length = 5.3, weight = 300 and position = "first"
 */
public Instances produceData(){
    // Labels allowed for the nominal attribute "position"
    FastVector positionLabels = new FastVector(3);
    for (String label : new String[] {"first", "second", "third"}) {
        positionLabels.addElement(label);
    }

    Attribute length = new Attribute("length");
    Attribute weight = new Attribute("weight");
    Attribute position = new Attribute("position", positionLabels);

    // Column layout of the dataset: length, weight, position
    FastVector schema = new FastVector(3);
    schema.addElement(length);
    schema.addElement(weight);
    schema.addElement(position);

    // Empty dataset (initial capacity 0) with "position" as the class attribute
    Instances race = new Instances("race", schema, 0);
    race.setClassIndex(position.index());

    // Fill in one row, attach it to the dataset, then append it
    Instance row = new Instance(3);
    row.setValue(length, 5.3);
    row.setValue(weight, 300);
    row.setValue(position, "first");
    row.setDataset(race);
    race.add(row);

    return race;
}
3、对输入的数据集Instances进行特征选择
// Loader used by the constructor to read the ARFF file.
private ArffLoader loader;
// Dataset read from arffFile by the constructor.
private Instances dataSet;
// The ARFF file passed to the constructor.
private File arffFile;
// Number of instances in dataSet, recorded by the constructor.
private int sizeOfDataset;
// Number of attributes in dataSet as loaded, i.e. before any selection.
private int numOfOldAttributes;
// Number of attributes reported as selected by the last select() call.
private int numOfNewAttributes;
// Index of the class attribute; the constructor sets it to the last attribute.
private int classIndex;
// Attribute indices returned by the last select() call.
private int[] selectedAttributes;
/**
 * Creates a selector backed by the given ARFF file.
 *
 * <p>The file is read through an {@code ArffLoader}; the resulting dataset,
 * its instance count and its attribute count are stored in fields, and the
 * last attribute is set as the class attribute.
 *
 * @param file the ARFF file to load
 * @throws IOException declared by this constructor; propagated from the loader calls
 */
public WekaSelector(File file) throws IOException {
    this.arffFile = file;
    this.loader = new ArffLoader();
    this.loader.setFile(file);

    this.dataSet = this.loader.getDataSet();
    this.sizeOfDataset = this.dataSet.numInstances();
    this.numOfOldAttributes = this.dataSet.numAttributes();

    // The last attribute is treated as the class attribute.
    this.classIndex = this.numOfOldAttributes - 1;
    this.dataSet.setClassIndex(this.classIndex);
}
/**
 * Runs attribute selection with a {@code CfsSubsetEval} evaluator and a
 * {@code BestFirst} search, then prints the outcome to standard output.
 *
 * <p>The number of selected attributes and their indices are stored in
 * {@code numOfNewAttributes} and {@code selectedAttributes}.
 *
 * <p>NOTE(review): selection runs on the dataset returned by
 * {@code produceData()}, not on the {@code dataSet} field loaded by the
 * constructor. Confirm this is intended.
 *
 * @throws Exception declared by this method; propagated from the selection calls
 */
public void select() throws Exception {
    AttributeSelection selection = new AttributeSelection();
    selection.setEvaluator(new CfsSubsetEval());
    selection.setSearch(new BestFirst());

    // Input is the hand-built example dataset rather than the loaded ARFF data.
    Instances input = produceData();
    selection.SelectAttributes(input);

    numOfNewAttributes = selection.numberAttributesSelected();
    selectedAttributes = selection.selectedAttributes();

    System.out.println("result is " + selection.toResultsString());
    System.out.println("new number of Attributes is " + numOfNewAttributes);
    // One selected attribute index per line
    for (int attributeIndex : selectedAttributes) {
        System.out.println(attributeIndex);
    }
}