1、界面
2、精度评价
3、输出规则集
4、部分源码
using System;
using System.Collections.Generic;
using System.Text;
using System.Collections;
using System.Text.RegularExpressions;
namespace BoostTree
{
class TreeClass : PublicClass
{
/// <summary>
/// Entry point for tree construction: prepares the attribute index list and
/// starts the recursive build on a fresh root node.
/// </summary>
/// <param name="buffer">Samples; each element is cast to a string array whose last column is used as the class label.</param>
/// <param name="treeHigh">Requested depth limit; a value of 0 selects the default of 7.</param>
/// <param name="contribute">Passed through unchanged to the recursive builder.</param>
/// <returns>The root node that the recursive builder populated.</returns>
protected Tree TreeRun(ArrayList buffer, int treeHigh, int[] contribute)
{
    // Depth limit handed to the recursion; 0 means "use the default".
    int maxDepth = (treeHigh == 0) ? 7 : treeHigh;

    Tree root = new Tree();

    // The class label sits in the last column of every row.
    String[] firstRow = (String[])buffer[0];
    int classID = firstRow.Length - 1;

    // Candidate attribute indexes are 0 .. classID - 1.
    ArrayList attributeIndexes = new ArrayList();
    int column = 0;
    while (column < classID)
    {
        attributeIndexes.Add(column);
        column++;
    }

    builtTree(buffer, attributeIndexes, root, maxDepth, classID, contribute);
    return root;
}
/// <summary>
/// Recursively grows the subtree rooted at <paramref name="fatherTree"/> from the samples in
/// <paramref name="buffer"/>. The node becomes a leaf when the samples carry at most one class
/// label or when no attribute yields a usable split; otherwise the samples are partitioned on
/// the chosen attribute's cut value and both halves are processed.
/// </summary>
/// <param name="buffer">Samples reaching this node; each element is cast to a string array.</param>
/// <param name="bandIDlist">Candidate attribute indexes. The same list is handed to both children, so an attribute may be reused deeper in the tree.</param>
/// <param name="fatherTree">Node to populate.</param>
/// <param name="depth">Remaining depth budget; it is decremented before the children are built, and children become leaves once it is no longer positive.</param>
/// <param name="classID">Column index of the class label.</param>
/// <param name="contribute">Per-attribute counter; the entry of the attribute chosen for the split is incremented.</param>
private void builtTree(ArrayList buffer, ArrayList bandIDlist, Tree fatherTree, int depth, int classID, int[] contribute)
{
    ArrayList classValue = getclassValue(buffer, classID);

    // Pure node (at most one class label): nothing to split, so the costly
    // discretisation of every attribute is skipped entirely.
    if (classValue.Count <= 1)
    {
        makeLeafNode(fatherTree, buffer, classValue, classID);
        return;
    }

    // Best cut value and best gain ratio for every candidate attribute.
    Double[] Ratio = new Double[bandIDlist.Count];
    Double[] bandDiscrete = discrelizeALL(buffer, bandIDlist, Ratio, classID);

    // A null result means that no attribute produced a non-zero gain ratio.
    ArrayList bestbandID = findbestbandID(Ratio, bandIDlist, bandIDlist.Count);
    if (bestbandID == null)
    {
        makeLeafNode(fatherTree, buffer, classValue, classID);
        return;
    }

    depth--;

    int featureNum = (int)bestbandID[0];
    // bandDiscrete is indexed by position inside bandIDlist, not by attribute index.
    Double attributeValue = bandDiscrete[bandIDlist.IndexOf(featureNum)];

    fatherTree.CutBandID = featureNum;
    contribute[featureNum]++;
    fatherTree.DivisionValue = attributeValue;

    // Values less than or equal to the cut go left, everything else goes right.
    ArrayList leftBuffer = new ArrayList();
    ArrayList rightBuffer = new ArrayList();
    for (int i = 0; i < buffer.Count; i++)
    {
        if (Convert.ToDouble(((string[])buffer[i])[featureNum]) <= attributeValue)
        {
            leftBuffer.Add(buffer[i]);
        }
        else
        {
            rightBuffer.Add(buffer[i]);
        }
    }

    Tree leftChild = new Tree();
    Tree rightChild = new Tree();
    if (depth > 0)
    {
        builtTree(leftBuffer, bandIDlist, leftChild, depth, classID, contribute);
        builtTree(rightBuffer, bandIDlist, rightChild, depth, classID, contribute);
    }
    else
    {
        // Depth budget exhausted: both halves become leaves. The parent's class
        // list is reused when picking each leaf's majority class.
        makeLeafNode(leftChild, leftBuffer, classValue, classID);
        makeLeafNode(rightChild, rightBuffer, classValue, classID);
    }

    fatherTree.LeftChild = leftChild;
    fatherTree.LeftChild.Parent = fatherTree;
    fatherTree.RightChild = rightChild;
    fatherTree.RightChild.Parent = fatherTree;
}

/// <summary>
/// Turns <paramref name="node"/> into a leaf: stores the majority class, keeps the samples
/// that reached it, and records the sample count together with the value returned by getMiss.
/// </summary>
private void makeLeafNode(Tree node, ArrayList samples, ArrayList classValue, int classID)
{
    node.PreClass = setPreClass(samples, classValue, classID);
    // The samples that reached the leaf are kept on the node.
    node.leafSample = samples;
    node.Cover = Convert.ToDouble(samples.Count);
    // NOTE(review): getMiss is defined outside this section (presumably in PublicClass);
    // it looks like a misclassification count for the predicted class - confirm.
    node.Miss = getMiss(samples, node.PreClass, classID);
}
/// <summary>
/// Runs the single-attribute discretisation for every candidate attribute.
/// </summary>
/// <param name="buffer">Samples to discretise.</param>
/// <param name="bandIDlist">Candidate attribute indexes.</param>
/// <param name="Ratio">Output: entry i receives the best score reported for the i-th candidate.</param>
/// <param name="classID">Column index of the class label.</param>
/// <returns>Array whose i-th entry is the cut value chosen for the i-th candidate.</returns>
private Double[] discrelizeALL(ArrayList buffer, ArrayList bandIDlist, double[] Ratio,int classID)
{
    int candidateCount = bandIDlist.Count;
    Double[] cutValues = new Double[candidateCount];

    int slot = 0;
    while (slot < candidateCount)
    {
        // One-element array used as an out-parameter; it starts at 0 for every attribute.
        double[] bestScore = { 0.0 };
        int attributeIndex = (int)bandIDlist[slot];

        cutValues[slot] = discrelize(buffer, attributeIndex, bestScore, classID);
        Ratio[slot] = bestScore[0];
        slot++;
    }

    return cutValues;
}
/// <summary>
/// Collects the distinct class labels found in the samples, in order of first appearance.
/// </summary>
/// <param name="buffer">Samples; each element is cast to a string array.</param>
/// <param name="classID">Column index of the class label.</param>
/// <returns>List of distinct label strings.</returns>
protected ArrayList getclassValue(ArrayList buffer, int classID)
{
    ArrayList distinctLabels = new ArrayList();

    foreach (object row in buffer)
    {
        string label = ((string[])row)[classID];

        // Contains compares strings by value, so a label is added only the first time it is seen.
        if (!distinctLabels.Contains(label))
        {
            distinctLabels.Add(label);
        }
    }

    return distinctLabels;
}
/// <summary>
/// Picks the majority class for a leaf: the label in <paramref name="classValue"/> that occurs
/// most often among the samples. On a tie the label listed first wins.
/// </summary>
/// <param name="Buffer">Samples that reached the leaf.</param>
/// <param name="classValue">Candidate class labels.</param>
/// <param name="classID">Column index of the class label.</param>
/// <returns>The most frequent label.</returns>
protected string setPreClass(ArrayList Buffer, ArrayList classValue, int classID)
{
    int labelCount = classValue.Count;
    double[] tally = new double[labelCount];

    // Count how many samples carry each candidate label.
    foreach (object row in Buffer)
    {
        for (int c = 0; c < labelCount; c++)
        {
            if (((string[])row)[classID] == (string)classValue[c])
            {
                tally[c] += 1.0;
            }
        }
    }

    // Strictly-greater comparison keeps the earliest label when counts are equal.
    int winner = 0;
    for (int c = 1; c < labelCount; c++)
    {
        if (tally[c] > tally[winner])
        {
            winner = c;
        }
    }

    return (string)classValue[winner];
}
// Shared generator for tie-breaking. Creating a new Random on every call can yield
// identical time-based seeds when calls happen in quick succession.
private static readonly Random tieBreakRandom = new Random();

/// <summary>
/// Selects the attribute with the highest gain ratio. When several attributes share the top
/// value, one of them is chosen uniformly at random.
/// </summary>
/// <param name="GainRatio">Gain ratio per candidate, indexed by position in <paramref name="bandIDlist"/>.</param>
/// <param name="bandIDlist">Candidate attribute indexes.</param>
/// <param name="Num">Number of leading entries of <paramref name="GainRatio"/> to examine.</param>
/// <returns>
/// A one-element list holding the chosen attribute index, or null when there is no candidate
/// or the best gain ratio is exactly 0 (the caller treats null as "stop splitting").
/// </returns>
private ArrayList findbestbandID(Double[] GainRatio, ArrayList bandIDlist, int Num)
{
    // No candidates at all: report "no split" instead of indexing an empty array.
    if (Num <= 0)
    {
        return null;
    }

    // Positions of every candidate that shares the current maximum.
    List<int> tiedPositions = new List<int>();
    tiedPositions.Add(0);
    double bestRatio = GainRatio[0];

    for (int i = 1; i < Num; i++)
    {
        double current = GainRatio[i];
        if (current == bestRatio)
        {
            tiedPositions.Add(i);
        }
        else if (current > bestRatio)
        {
            // A strictly better candidate invalidates all earlier ties.
            tiedPositions.Clear();
            tiedPositions.Add(i);
            bestRatio = current;
        }
    }

    // A best gain ratio of exactly 0 means that no attribute separates the samples.
    if (bestRatio == 0.0)
    {
        return null;
    }

    // Random.Next(max) excludes max, so passing Count makes every tied position selectable.
    // (The previous Next(0, Count - 1) could never return the last tied position.)
    int pick;
    lock (tieBreakRandom)
    {
        pick = tieBreakRandom.Next(tiedPositions.Count);
    }

    ArrayList bestbandID = new ArrayList();
    bestbandID.Add(bandIDlist[tiedPositions[pick]]);
    return bestbandID;
}
/// <summary>
/// Finds the cut value for one continuous attribute. Candidate cuts are the midpoints between
/// consecutive distinct values of the attribute; the best one is chosen by getcutPoint.
/// </summary>
/// <param name="buffer">Samples; each element is cast to a string array.</param>
/// <param name="bandID">Column index of the attribute to discretise.</param>
/// <param name="best">One-element output array; it is filled by getcutPoint and left untouched when no candidate cut exists.</param>
/// <param name="classID">Column index of the class label.</param>
/// <returns>The chosen cut value, or 0.0 when the attribute has fewer than two distinct values.</returns>
private Double discrelize(ArrayList buffer, int bandID, double[] best, int classID)
{
    // Parse and sort the attribute values.
    List<double> sortedValues = new List<double>(buffer.Count);
    for (int i = 0; i < buffer.Count; i++)
    {
        sortedValues.Add(Convert.ToDouble(((string[])buffer[i])[bandID]));
    }
    sortedValues.Sort();

    // Midpoints between neighbouring distinct values. Because the input is sorted, the
    // midpoints are already in non-decreasing order and need no second sort.
    ArrayList bandWait = new ArrayList();
    for (int i = 0; i < sortedValues.Count - 1; i++)
    {
        double a = sortedValues[i];
        double b = sortedValues[i + 1];
        if (a != b)
        {
            bandWait.Add((a + b) / 2.0);
        }
    }

    // All values identical (or fewer than two samples): the attribute cannot be split.
    if (bandWait.Count == 0)
    {
        return 0.0;
    }

    // The class labels are only needed once there is something to evaluate.
    ArrayList classValue = getclassValue(buffer, classID);
    return getcutPoint(buffer, classValue, bandWait, bandID, best, classID);
}
/// <summary>
/// Scores every candidate cut with calculateGainRatio and returns the cut chosen by findbestValue.
/// </summary>
/// <param name="buffer">Samples to evaluate.</param>
/// <param name="classValue">Class labels present in the samples.</param>
/// <param name="bandWait">Candidate cut values.</param>
/// <param name="bandID">Column index of the attribute being cut.</param>
/// <param name="best">One-element output array; receives the largest positive score, or 0 when no score is positive.</param>
/// <param name="classID">Column index of the class label.</param>
/// <returns>The first element of the list returned by findbestValue.</returns>
private Double getcutPoint(ArrayList buffer, ArrayList classValue, ArrayList bandWait, int bandID, double[] best,int classID)
{
    ArrayList scores = new ArrayList();
    double topScore = 0.0;

    // Score each candidate and track the running maximum (never below 0).
    foreach (object candidate in bandWait)
    {
        double cutValue = Convert.ToDouble(candidate);
        double score = calculateGainRatio(buffer, cutValue, classValue, bandID, classID);
        scores.Add(score);

        if (score > topScore)
        {
            topScore = score;
        }
    }

    ArrayList winners = findbestValue(scores, bandWait);
    Double chosenCut = Convert.ToDouble(winners[0]);

    best[0] = topScore;
    return chosenCut;
}
/// <summary>
/// Returns the candidate cut with the highest score. When several candidates share the top
/// score, the earliest one is used.
/// </summary>
/// <param name="Gain">Score per candidate (boxed doubles), parallel to <paramref name="bandWait"/>.</param>
/// <param name="bandWait">Candidate cut values.</param>
/// <returns>A one-element list: the winning cut value, or 0.0 when the top score is exactly 0.</returns>
private ArrayList findbestValue(ArrayList Gain, ArrayList bandWait)
{
    // Track only the first position that holds the maximum; later ties never replace it.
    int leader = 0;
    double leaderScore = (double)Gain[0];

    for (int i = 1; i < Gain.Count; i++)
    {
        double score = (double)Gain[i];
        if (score > leaderScore)
        {
            leader = i;
            leaderScore = score;
        }
    }

    ArrayList result = new ArrayList();
    if (leaderScore == 0.0)
    {
        // No candidate scored above zero.
        result.Add(0.0);
        return result;
    }

    result.Add(bandWait[leader]);
    return result;
}
/// <summary>
/// Gain ratio of splitting on <paramref name="bandDiscrete"/>: the value from calculateGain
/// divided by the value from calculateSplitI.
/// </summary>
/// <param name="buffer">Samples to evaluate.</param>
/// <param name="bandDiscrete">Cut value being scored.</param>
/// <param name="classValue">Class labels present in the samples.</param>
/// <param name="bandID">Column index of the attribute being cut.</param>
/// <param name="classID">Column index of the class label.</param>
/// <returns>The ratio, or 0.0 when the split information is exactly 0.</returns>
private double calculateGainRatio(ArrayList buffer, Double bandDiscrete, ArrayList classValue, int bandID,int classID)
{
    double gain = calculateGain(buffer, classValue, bandDiscrete, bandID, classID);
    double splitInfo = calculateSplitI(buffer, bandDiscrete, bandID);

    // A split information of zero would mean dividing by zero, so the ratio is defined as 0.
    return (splitInfo == 0.0) ? 0.0 : gain / splitInfo;
}
/// <summary>
/// Information gain of a binary split: entropy of all samples minus the size-weighted entropy
/// of the two partitions produced by the cut.
/// </summary>
/// <param name="buffer">Samples to evaluate.</param>
/// <param name="classValue">Class labels present in the samples.</param>
/// <param name="attributeValue">Cut value; samples less than or equal to it form the first partition.</param>
/// <param name="bandID">Column index of the attribute being cut.</param>
/// <param name="classID">Column index of the class label.</param>
/// <returns>Entropy before the split minus the weighted entropy after it.</returns>
private double calculateGain(ArrayList buffer, ArrayList classValue, Double attributeValue, int bandID,int classID)
{
    double entropyBefore = calculateES(buffer, classValue, classID);

    // Partition the samples around the cut value.
    ArrayList atOrBelow = new ArrayList();
    ArrayList above = new ArrayList();
    foreach (object row in buffer)
    {
        double value = Convert.ToDouble(((string[])row)[bandID]);
        ArrayList target = (value <= attributeValue) ? atOrBelow : above;
        target.Add(row);
    }

    // Each partition's entropy is weighted by its share of the samples.
    double total = (double)buffer.Count;
    double weightedLow = calculateES(atOrBelow, classValue, classID) * (((double)atOrBelow.Count) / total);
    double weightedHigh = calculateES(above, classValue, classID) * (((double)above.Count) / total);
    double entropyAfter = weightedLow + weightedHigh;

    return entropyBefore - entropyAfter;
}
/// <summary>
/// Split information of a binary cut: the base-2 entropy of the two partition sizes.
/// </summary>
/// <param name="buffer">Samples; each element is cast to a string array.</param>
/// <param name="attributeValue">Cut value; samples less than or equal to it form the first partition.</param>
/// <param name="index">Column index of the attribute being cut.</param>
/// <returns>The negated sum of p * log2(p) over the non-empty partitions.</returns>
private double calculateSplitI(ArrayList buffer, Double attributeValue, int index)
{
    // Count the samples on the "less than or equal" side; the rest fall on the other side.
    int atOrBelow = 0;
    foreach (object row in buffer)
    {
        if (Convert.ToDouble(((string[])row)[index]) <= attributeValue)
        {
            atOrBelow++;
        }
    }
    int above = buffer.Count - atOrBelow;

    double total = (double)buffer.Count;
    double accumulated = 0.0;
    int[] partitionSizes = { atOrBelow, above };
    foreach (int size in partitionSizes)
    {
        // Empty partitions contribute nothing.
        if (size == 0)
        {
            continue;
        }
        double pi = size / total;
        accumulated += Math.Log(pi) / Math.Log(2.0) * pi;
    }

    return -accumulated;
}
/// <summary>
/// Base-2 entropy of the values found in column <paramref name="index"/>, computed over the
/// labels listed in <paramref name="attributeValue"/>.
/// </summary>
/// <param name="buffer">Samples; each element is cast to a string array.</param>
/// <param name="attributeValue">Labels to count.</param>
/// <param name="index">Column whose values are compared against the labels.</param>
/// <returns>The negated sum of p * log2(p) over the labels that occur at least once.</returns>
private double calculateES(ArrayList buffer, ArrayList attributeValue, int index)
{
    int labelCount = attributeValue.Count;
    double[] occurrences = new double[labelCount];

    // Tally how many samples carry each label.
    foreach (object row in buffer)
    {
        for (int k = 0; k < labelCount; k++)
        {
            if (((string[])row)[index] == ((string)attributeValue[k]))
            {
                occurrences[k] += 1.0;
            }
        }
    }

    double total = (double)buffer.Count;
    double accumulated = 0.0;
    foreach (double count in occurrences)
    {
        // Labels that never occur contribute nothing.
        if (count == 0)
        {
            continue;
        }
        double pi = count / total;
        accumulated += Math.Log(pi) / Math.Log(2.0) * pi;
    }

    return -accumulated;
}
}
}
5、完整代码:https://download.csdn.net/download/sailingw/14919831