R语言学习系列(数据挖掘之决策树算法实现--ID3代码篇)

转载自:http://blog.csdn.net/hawksoft/article/details/7760868

1、辅助类,用于计算过程和结果存储

[csharp]  view plain copy print ?
  /// <summary>
  /// A node of the decision tree.
  /// </summary>
  public class DecisionTreeNode
  {
      /// <summary>
      /// Node type: branch or leaf.
      /// </summary>
      public string Type { get; set; }

      /// <summary>
      /// Key of the node; usually holds the current attribute or factor.
      /// </summary>
      public string Key { get; set; }

      /// <summary>
      /// Decision (class) value; only meaningful on leaf nodes.
      /// </summary>
      public string DecisionValue { get; set; }

      /// <summary>
      /// Factor of the preceding attribute; can be read as the branch condition
      /// that leads to this node.
      /// </summary>
      public string ParentFactor { get; set; }

      /// <summary>
      /// Number of samples at this node.
      /// </summary>
      public int CalcCount { get; set; }

      /// <summary>
      /// Indexes of the samples that belong to this node.
      /// </summary>
      public List<int> DataIndexes { get; set; }

      /// <summary>
      /// Child nodes, keyed by string. The dictionary itself can only be
      /// replaced from inside this class.
      /// </summary>
      public Dictionary<string, DecisionTreeNode> Children { get; private set; }

      /// <summary>
      /// Parent node.
      /// </summary>
      public DecisionTreeNode Parent { get; set; }

      /// <summary>
      /// Creates a node with empty sample-index and child collections.
      /// </summary>
      public DecisionTreeNode()
      {
          DataIndexes = new List<int>();
          Children = new Dictionary<string, DecisionTreeNode>();
      }
  }
  /// <summary>
  /// Holds intermediate counts during the calculation. A dictionary is used
  /// instead of arrays because it is more convenient and reduces looping.
  /// </summary>
  public class CalcNode
  {
      /// <summary>
      /// Key of this counting node.
      /// </summary>
      public string Key { get; set; }

      /// <summary>
      /// Free-form type label of this counting node.
      /// </summary>
      public string Type { get; set; }

      /// <summary>
      /// Accumulated count for this node.
      /// </summary>
      public int CalcCount { get; set; }

      /// <summary>
      /// Indexes recorded for this node, one per AddChildren call that hit it.
      /// </summary>
      public List<int> DataIndexes { get; set; }

      /// <summary>
      /// Child counting nodes, keyed by their key.
      /// </summary>
      public Dictionary<string, CalcNode> Children { get; private set; }

      /// <summary>
      /// Creates a counting node with empty index and child collections.
      /// </summary>
      public CalcNode()
      {
          DataIndexes = new List<int>();
          Children = new Dictionary<string, CalcNode>();
      }

      /// <summary>
      /// Adds <paramref name="Count"/> to the child stored under
      /// <paramref name="Key"/>, creating the child first if it does not exist,
      /// and records <paramref name="AIndex"/> in the child's index list.
      /// </summary>
      /// <param name="Key">Key of the child to update.</param>
      /// <param name="AType">Type label written to the child on every call.</param>
      /// <param name="AIndex">Index appended to the child's DataIndexes.</param>
      /// <param name="Count">Amount added to the child's CalcCount.</param>
      public void AddChildren(string Key, string AType, int AIndex, int Count = 1)
      {
          // Single lookup instead of ContainsKey followed by repeated indexer access.
          CalcNode theChild;
          if (Children.TryGetValue(Key, out theChild) == false)
          {
              theChild = new CalcNode();
              Children.Add(Key, theChild);
          }
          theChild.Key = Key;
          theChild.Type = AType;
          theChild.CalcCount += Count;
          theChild.DataIndexes.Add(AIndex);
      }
  }


2、算法类,注释比较详细,有时间再写一篇原理文章

[csharp]  view plain copy print ?
  /// <summary>
  /// Decision-tree (ID3) builder. Not suitable for continuous-valued attributes.
  /// </summary>
  public class DecisionTreeAlg
  {
      /// <summary>
      /// Recursively builds the decision tree below <paramref name="PNode"/>.
      /// By convention the class (decision) attribute is attribute 0, i.e.
      /// <c>Inputs[0]</c>, and index 0 must be present in
      /// <paramref name="TestProperties"/>.
      /// </summary>
      /// <param name="CallLevel">Recursion depth; only used to indent the trace output.</param>
      /// <param name="OutContents">Trace text, appended to for debugging.</param>
      /// <param name="Inputs">One row per attribute, one column per sample; rows must have equal length.</param>
      /// <param name="PNode">Node to fill in; its DataIndexes select the samples to learn from.</param>
      /// <param name="PropertyNames">Attribute names, indexed like <paramref name="Inputs"/>.</param>
      /// <param name="TestProperties">Indexes of the attributes still available, including the class attribute 0.</param>
      /// <param name="DefaultClassFactor">Class value to use when the node has no samples.</param>
      /// <param name="PropertyFactors">Known factors per attribute name; created when null and extended while counting.</param>
      /// <exception cref="ArgumentNullException">A required argument is null.</exception>
      /// <exception cref="ArgumentException">Index 0 is missing from <paramref name="TestProperties"/>.</exception>
      public void BuildDecisionTree(int CallLevel, ref string OutContents, string[][] Inputs, DecisionTreeNode PNode, string[] PropertyNames, List<int> TestProperties, string DefaultClassFactor, Dictionary<string, List<string>> PropertyFactors)
      {
          if (Inputs == null)
          {
              throw new ArgumentNullException("Inputs");
          }
          if (PNode == null)
          {
              throw new ArgumentNullException("PNode");
          }
          if (PropertyNames == null)
          {
              throw new ArgumentNullException("PropertyNames");
          }
          if (TestProperties == null)
          {
              throw new ArgumentNullException("TestProperties");
          }

          // Two spaces of trace indentation per level, built on demand so that
          // deep recursion cannot run past a fixed-width padding string.
          string thePrefix = new string(' ', Math.Max(0, CallLevel) * 2);
          CallLevel++;

          // No learning samples: leaf labelled with the class handed down by the caller.
          if (PNode.DataIndexes.Count <= 0)
          {
              PNode.Type = "叶子";
              PNode.DecisionValue = DefaultClassFactor;
              return;
          }
          // Only the class attribute is left: leaf labelled with the most frequent
          // class among this node's own samples.
          if (TestProperties.Count <= 1)
          {
              PNode.Type = "叶子";
              PNode.DecisionValue = FindMajorityClass(Inputs, PNode.DataIndexes, DefaultClassFactor);
              return;
          }
          if (TestProperties.Contains(0) == false)
          {
              throw new ArgumentException("TestProperties must contain the class attribute index 0.", "TestProperties");
          }

          if (PropertyFactors == null)
          {
              PropertyFactors = new Dictionary<string, List<string>>();
          }

          // One counting node per available attribute.
          Dictionary<string, CalcNode> thePropertyCount = new Dictionary<string, CalcNode>();
          foreach (int theProIndex in TestProperties)
          {
              string theProName = PropertyNames[theProIndex];
              thePropertyCount.Add(theProName, new CalcNode() { Key = theProName });
              if (PropertyFactors.ContainsKey(theProName) == false)
              {
                  PropertyFactors.Add(theProName, new List<string>());
              }
          }

          // Single pass over the samples: per attribute, count each factor and,
          // below each factor, each class value.
          foreach (int theRow in PNode.DataIndexes)
          {
              string theClassFactor = Inputs[0][theRow];
              foreach (int theColumn in TestProperties)
              {
                  string theColumnName = PropertyNames[theColumn];
                  string theFactor = Inputs[theColumn][theRow];
                  CalcNode thePropertyCalcNode = thePropertyCount[theColumnName];
                  thePropertyCalcNode.CalcCount++;
                  thePropertyCalcNode.AddChildren(theFactor, "测试属性因子", theRow, 1);
                  thePropertyCalcNode.Children[theFactor].AddChildren(theClassFactor, "主分类因子", theRow, 1);
                  // Collect the distinct factors of each attribute in the same pass.
                  List<string> theKnownFactors = PropertyFactors[theColumnName];
                  if (theKnownFactors.Contains(theFactor) == false)
                  {
                      theKnownFactors.Add(theFactor);
                  }
              }
          }

          // Trace is buffered locally and flushed before returning or recursing,
          // so the order of the emitted text is unchanged.
          System.Text.StringBuilder theTrace = new System.Text.StringBuilder();

          // Entropy of the class attribute, sum of -Pj*log2(Pj). Computed before
          // the gain loop so the result does not depend on where index 0 sits
          // inside TestProperties.
          CalcNode theClassNode = thePropertyCount[PropertyNames[0]];
          double theTotalSimple = theClassNode.CalcCount;
          double theTotalEA = 0.0;
          double theMaxSubCount = -1;
          string theDefaultClassFactor = DefaultClassFactor;
          foreach (KeyValuePair<string, CalcNode> theSubNode in theClassNode.Children)
          {
              int theSubCount = theSubNode.Value.CalcCount;
              if (theSubCount > 0)
              {
                  double thePj = theSubCount / theTotalSimple;
                  theTotalEA -= thePj * Math.Log(thePj, 2);
              }
              // Remember the most frequent class; it becomes the default for children.
              if (theMaxSubCount < theSubCount)
              {
                  theMaxSubCount = theSubCount;
                  theDefaultClassFactor = theSubNode.Key;
              }
              theTrace.Append("\r\n").Append(thePrefix).Append(theClassNode.CalcCount)
                      .Append(":: ").Append(PropertyNames[0])
                      .Append(":: ").Append(theSubNode.Value.Type)
                      .Append(" :: ").Append(theSubNode.Key)
                      .Append(" :: ").Append(theSubCount);
          }

          // Information gain of every test attribute; keep the largest.
          double theMaxEA = double.MinValue;
          int theMaxPropertyIndex = -1;
          foreach (int thePIndex in TestProperties)
          {
              if (thePIndex == 0)
              {
                  continue;
              }
              CalcNode theCalcNode = thePropertyCount[PropertyNames[thePIndex]];
              double theExpected = 0.0;
              foreach (KeyValuePair<string, CalcNode> theFactorNode in theCalcNode.Children)
              {
                  if (theFactorNode.Value.CalcCount <= 0)
                  {
                      continue;
                  }
                  double theSjCount = theFactorNode.Value.CalcCount;
                  double theWeight = theSjCount / theTotalSimple;
                  double theFactorEntropy = 0.0;
                  foreach (KeyValuePair<string, CalcNode> theClassSub in theFactorNode.Value.Children)
                  {
                      if (theClassSub.Value.CalcCount > 0)
                      {
                          double thePj = Convert.ToDouble(theClassSub.Value.CalcCount) / theSjCount;
                          theFactorEntropy -= thePj * Math.Log(thePj, 2);
                      }
                      // The " :: " between the factor count and the class-factor
                      // type was missing, which fused the two fields in the trace.
                      theTrace.Append("\r\n").Append(thePrefix).Append(theCalcNode.CalcCount)
                              .Append(":: ").Append(PropertyNames[thePIndex])
                              .Append(" :: ").Append(theFactorNode.Value.Type)
                              .Append(" :: ").Append(theFactorNode.Key)
                              .Append(" :: ").Append(theFactorNode.Value.CalcCount)
                              .Append(" :: ").Append(theClassSub.Value.Type)
                              .Append(" :: ").Append(theClassSub.Key)
                              .Append(" :: ").Append(theClassSub.Value.CalcCount);
                  }
                  theExpected += theWeight * theFactorEntropy;
              }
              double theGain = theTotalEA - theExpected;
              if (theMaxPropertyIndex < 0 || theMaxEA < theGain)
              {
                  theMaxEA = theGain;
                  theMaxPropertyIndex = thePIndex;
              }
          }

          if (theTrace.Length > 0)
          {
              OutContents += theTrace.ToString();
          }

          // All samples share one class: leaf labelled with that class.
          if (theClassNode.Children.Count <= 1)
          {
              PNode.Type = "叶子";
              PNode.DecisionValue = theDefaultClassFactor;
              return;
          }

          // Several classes and test attributes remain: branch on the attribute
          // with the largest gain. An attribute is used at most once per path,
          // so it is removed from the list handed to the children.
          PNode.Type = "分支";
          PNode.Key = PropertyNames[theMaxPropertyIndex];

          CalcNode theSplitNode = thePropertyCount[PropertyNames[theMaxPropertyIndex]];
          List<string> theFactors = PropertyFactors[PropertyNames[theMaxPropertyIndex]];
          List<int> theAvailableTestPs = new List<int>();
          foreach (int theIndex in TestProperties)
          {
              if (theIndex != theMaxPropertyIndex)
              {
                  theAvailableTestPs.Add(theIndex);
              }
          }

          foreach (string theFactor in theFactors)
          {
              DecisionTreeNode theChild = new DecisionTreeNode();
              theChild.ParentFactor = theFactor;
              theChild.Parent = PNode;
              theChild.Key = theFactor;

              CalcNode theFactorStats;
              bool theHasSamples = theSplitNode.Children.TryGetValue(theFactor, out theFactorStats)
                                   && theFactorStats.CalcCount > 0;
              if (theHasSamples == false)
              {
                  // Known factor with no samples at this node: leaf labelled
                  // with the most frequent class of this node.
                  theChild.CalcCount = 0;
                  theChild.DecisionValue = theDefaultClassFactor;
                  theChild.Type = "叶子";
                  PNode.Children.Add(theFactor, theChild);
                  continue;
              }

              // Factor with samples: attach the child and recurse into it.
              theChild.CalcCount = theFactorStats.CalcCount;
              theChild.DataIndexes.AddRange(theFactorStats.DataIndexes);
              PNode.Children.Add(theFactor, theChild);
              BuildDecisionTree(CallLevel, ref OutContents, Inputs, theChild, PropertyNames, theAvailableTestPs, theDefaultClassFactor, PropertyFactors);
          }
      }

      /// <summary>
      /// Returns the most frequent class value (row 0 of <paramref name="Inputs"/>)
      /// among the given samples, or <paramref name="Fallback"/> when there are none.
      /// </summary>
      private static string FindMajorityClass(string[][] Inputs, List<int> DataIndexes, string Fallback)
      {
          Dictionary<string, int> theCounts = new Dictionary<string, int>();
          foreach (int theRow in DataIndexes)
          {
              string theClass = Inputs[0][theRow];
              int theSeen;
              theCounts.TryGetValue(theClass, out theSeen);
              theCounts[theClass] = theSeen + 1;
          }

          string theBest = Fallback;
          int theBestCount = 0;
          foreach (KeyValuePair<string, int> theEntry in theCounts)
          {
              if (theBestCount < theEntry.Value)
              {
                  theBestCount = theEntry.Value;
                  theBest = theEntry.Key;
              }
          }
          return theBest;
      }
  }


3、测试代码:

[csharp]  view plain copy print ?
  /// <summary>
  /// Demo handler: builds a decision tree from a fixed 10-sample data set,
  /// shows it in treeView1 and writes the builder's trace to textBox1.
  /// </summary>
  private void button1_Click(object sender, EventArgs e)
  {
      DecisionTreeAlg theAlg = new DecisionTreeAlg();

      // One row per attribute, one column per sample; row 0 is the class attribute.
      string[][] theInputs = new string[4][];
      theInputs[0] = new string[] { "no", "yes", "yes", "yes", "yes", "yes", "no", "yes", "yes", "no" };
      theInputs[1] = new string[] { "s", "s", "l", "m", "l", "m", "m", "l", "m", "s" };
      theInputs[2] = new string[] { "s", "l", "m", "m", "m", "l", "s", "m", "s", "s" };
      theInputs[3] = new string[] { "no", "yes", "yes", "yes", "no", "no", "no", "no", "no", "yes" };

      string[] thePropertyName = new string[] { "是否真实帐号", "日志密度", "好友密度", "是否真实头像" };

      // The root node learns from all ten samples.
      DecisionTreeNode theRootNode = new DecisionTreeNode();
      theRootNode.DataIndexes.AddRange(new List<int>() { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });

      List<int> theTestPs = new List<int>() { 0, 1, 2, 3 };
      string theOuts = "";
      theAlg.BuildDecisionTree(0, ref theOuts, theInputs, theRootNode, thePropertyName, theTestPs, "", null);

      this.treeView1.Nodes.Clear();
      TreeNode theRoot = new TreeNode();
      this.treeView1.Nodes.Add(theRoot);
      VisitTree(theRoot, theRootNode);
      this.textBox1.Text = theOuts;
  }
  /// <summary>
  /// Copies the decision tree rooted at <paramref name="PDNode"/> into the
  /// tree-view node <paramref name="PNode"/>, creating one tree-view child per
  /// decision-tree child.
  /// </summary>
  /// <param name="PNode">Tree-view node that receives the label and the children.</param>
  /// <param name="PDNode">Decision-tree node to display.</param>
  private void VisitTree(TreeNode PNode, DecisionTreeNode PDNode)
  {
      // Label format: key(type)[decision value].
      PNode.Text = string.Concat(PDNode.Key, "(", PDNode.Type, ")[判定:", PDNode.DecisionValue, "]");

      foreach (KeyValuePair<string, DecisionTreeNode> theEntry in PDNode.Children)
      {
          TreeNode theViewChild = new TreeNode();
          PNode.Nodes.Add(theViewChild);
          VisitTree(theViewChild, theEntry.Value);
      }
  }

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值