先从j48.java开始,j48.java继承了抽象类Classifier.java,其中第一个方法buildClassifier()
/**
 * Builds the J48 tree classifier from the given training instances.
 *
 * Chooses a split-model selector (binary vs. multiway splits), constructs
 * either a reduced-error-pruning tree or a C4.5-style pruneable tree, grows
 * it, and finally releases the references held by the selector.
 *
 * @param instances the training data
 * @throws Exception if the tree cannot be built
 */
public void buildClassifier(Instances instances)
     throws Exception {

  // Binary splits use the Bin* variant of the model-selection strategy.
  final ModelSelection selector = m_binarySplits
      ? new BinC45ModelSelection(m_minNumObj, instances)
      : new C45ModelSelection(m_minNumObj, instances);

  if (m_reducedErrorPruning) {
    // Reduced-error pruning: needs fold count and random seed.
    m_root = new PruneableClassifierTree(selector, !m_unpruned, m_numFolds,
                                         !m_noCleanup, m_Seed);
  } else {
    // C4.5-style pruning; m_CF is the pruning confidence factor.
    m_root = new C45PruneableClassifierTree(selector, !m_unpruned, m_CF,
                                            m_subtreeRaising, !m_noCleanup);
  }

  // Grow (and, depending on settings, prune) the tree.
  m_root.buildClassifier(instances);

  // Let the selector drop its reference to the training data.
  if (m_binarySplits) {
    ((BinC45ModelSelection) selector).cleanup();
  } else {
    ((C45ModelSelection) selector).cleanup();
  }
}
怎样进行模型选择的后面来研究,先研究下如何建立分类树
展开C45PruneableClassifierTree.java中的buildClassifier
/**
 * Builds the pruned C4.5 classifier tree for the given data: validates the
 * data against the tree's capabilities, drops instances with a missing
 * class, grows the tree, collapses it, optionally prunes it, and optionally
 * cleans up the stored training data.
 *
 * @param data the training instances
 * @throws Exception if the tree cannot be built
 */
public void buildClassifier(Instances data) throws Exception {
// can classifier tree handle the data? Fails fast on unsupported data.
getCapabilities().testWithFail(data);
// remove instances with missing class (work on a copy of the data)
data = new Instances(data);
data.deleteWithMissingClass();
// keep the data in the tree when subtree raising is on or cleanup is off
buildTree(data, m_subtreeRaising || !m_cleanup);
// collapse step before optional pruning
collapse();
if (m_pruneTheTree) {
prune();
}
if (m_cleanup) {
// pass an empty copy (header only) so structure info is retained
cleanup(new Instances(data, 0));
}
}
发现了buildTree()方法是用来建树的
buildTree()是在基类ClassifierTree.java中,展开buildTree()
/**
 * Builds the tree rooted at this node for the given data.
 *
 * Asks the model-selection strategy for the best local split; if the split
 * produces more than one subset, partitions the data and recursively builds
 * one child tree per subset, otherwise marks this node as a leaf.
 *
 * @param data the instances to build this subtree from
 * @param keepData whether to keep a reference to the training data
 * @throws Exception if the tree cannot be built
 */
public void buildTree(Instances data, boolean keepData) throws Exception {
  if (keepData) {
    m_train = data; // retain the training data on this node
  }

  // Reset node state before (re)building.
  m_test = null;      // no pruning instances yet
  m_isLeaf = false;
  m_isEmpty = false;
  m_sons = null;

  // Select the best local split model for this data.
  m_localModel = m_toSelectModel.selectModel(data);

  if (m_localModel.numSubsets() > 1) {
    // Real split: partition the data and grow one child per subset.
    Instances[] parts = m_localModel.split(data);
    data = null; // allow the full set to be reclaimed
    m_sons = new ClassifierTree[m_localModel.numSubsets()];
    for (int child = 0; child < m_sons.length; child++) {
      m_sons[child] = getNewTree(parts[child]);
      parts[child] = null; // release each partition once consumed
    }
  } else {
    // No useful split: this node becomes a leaf.
    m_isLeaf = true;
    if (Utils.eq(data.sumOfWeights(), 0)) {
      m_isEmpty = true; // leaf with no (weighted) instances
    }
    data = null;
  }
}
调用 m_toSelectModel.selectModel(data); // 选一个划分模型
m_localModel.split(data); // 分割数据
m_sons[i] = getNewTree(localInstances[i]); // 构建子树
将 C45ModelSelection.selectModel(data) 继续展开:
/**
 * Selects the C4.5 split model for the given dataset.
 *
 * Evaluates a C45Split on every non-class attribute, then returns the model
 * with the highest gain ratio among those whose info gain is at least
 * (average info gain - 1E-3). Returns a NoSplit model when the data is too
 * small, pure, or no useful split exists; returns null on internal error.
 *
 * @param data the training instances to select a split for
 * @return the selected split model, a NoSplit model, or null on error
 */
public final ClassifierSplitModel selectModel(Instances data){
double minResult;
double currentResult;
C45Split [] currentModel;
C45Split bestModel = null;
NoSplit noSplitModel = null;
double averageInfoGain = 0;
int validModels = 0;
boolean multiVal = true;
Distribution checkDistribution;
Attribute attribute;
double sumOfWeights;
int i;
try{
// Check if all instances belong to one class, or if there are too few
// instances (total weight below 2 * minimum objects per leaf) to split.
checkDistribution = new Distribution(data);
noSplitModel = new NoSplit(checkDistribution);
if (Utils.sm(checkDistribution.total(),2*m_minNoObj) ||
Utils.eq(checkDistribution.total(),
checkDistribution.perClass(checkDistribution.maxClass())))
return noSplitModel;
// Check whether every attribute is nominal with "many" values: multiVal
// stays true only if no attribute is numeric and none has a value count
// below 30% of the total number of training instances.
if (m_allData != null) {
Enumeration enu = data.enumerateAttributes();
while (enu.hasMoreElements()) {
attribute = (Attribute) enu.nextElement();
if ((attribute.isNumeric()) ||
(Utils.sm((double)attribute.numValues(),
(0.3*(double)m_allData.numInstances())))){ // numeric, or value count below 30% of the full dataset size
multiVal = false; // not treated as multi-valued
break;
}
}
}
currentModel = new C45Split[data.numAttributes()]; // one candidate split per attribute
sumOfWeights = data.sumOfWeights();
// For each attribute...
for (i = 0; i < data.numAttributes(); i++){
// ...apart from the class attribute:
if (i != (data).classIndex()){
// Get the model for this attribute and evaluate it on the training data.
currentModel[i] = new C45Split(i,m_minNoObj,sumOfWeights); // i is the attribute index
currentModel[i].buildClassifier(data); // splits on attribute i, computing info gain and gain ratio
// Check if the model is useful; only useful models contribute to the
// average info gain (multi-valued nominal attributes are filtered).
if (currentModel[i].checkModel())
if (m_allData != null) {
if ((data.attribute(i).isNumeric()) ||
(multiVal || Utils.sm((double)data.attribute(i).numValues(),
(0.3*(double)m_allData.numInstances())))){
averageInfoGain = averageInfoGain+currentModel[i].infoGain();
validModels++;
}
} else {
averageInfoGain = averageInfoGain+currentModel[i].infoGain();
validModels++;
}
}else
currentModel[i] = null; // the class attribute itself gets no split model
}
// Check if any useful split was found at all.
if (validModels == 0)
return noSplitModel;
averageInfoGain = averageInfoGain/(double)validModels;
// Find the best split: highest gain ratio among the models whose info
// gain reaches (average - 1E-3).
minResult = 0;
for (i=0;i<data.numAttributes();i++){
if ((i != (data).classIndex()) &&
(currentModel[i].checkModel()))
// Use 1E-3 here to get a closer approximation to the original
// implementation.
if ((currentModel[i].infoGain() >= (averageInfoGain-1E-3)) && // below-average info gain disqualifies a split regardless of its gain ratio
Utils.gr(currentModel[i].gainRatio(),minResult)){
bestModel = currentModel[i];
minResult = currentModel[i].gainRatio();
}
}
// Check if a usable split was found (best gain ratio still zero means no).
if (Utils.eq(minResult,0))
return noSplitModel;
// Add all Instances with unknown values for the corresponding
// attribute to the distribution for the model, so that
// the complete distribution is stored with the model.
bestModel.distribution().
addInstWithUnknown(data,bestModel.attIndex());
// For numeric attributes, adjust the split point against the full
// dataset (NOTE(review): presumably snaps the threshold to an actual
// data value — confirm against C45Split.setSplitPoint).
if (m_allData != null)
bestModel.setSplitPoint(m_allData);
return bestModel;
}catch(Exception e){
// NOTE(review): the exception is only printed and null is returned;
// callers must be prepared to handle a null model.
e.printStackTrace();
}
return null;
}
关于Instance.java默认设置每个实例的权重为1
/**
 * Creates an instance with the given number of attribute slots: every
 * value is initialized to missing, the weight defaults to 1, and the
 * instance belongs to no dataset yet.
 *
 * @param numAttributes the number of attribute values to allocate
 */
public Instance(int numAttributes) {
  m_AttValues = new double[numAttributes];
  // Start with every attribute value marked as missing.
  for (int index = 0; index < numAttributes; index++) {
    m_AttValues[index] = MISSING_VALUE;
  }
  m_Weight = 1;      // default weight for every new instance
  m_Dataset = null;  // not attached to any dataset yet
}