Training a Kaggle Titanic prediction model with the decision tree from "Programming Collective Intelligence"


Programming Collective Intelligence, Chapter 7: decision tree modeling

# Toy data from the book: referrer, country, read the FAQ?, pages viewed, service chosen
my_data=[['slashdot','USA','yes',18,'None'],
        ['google','France','yes',23,'Premium'],
        ['digg','USA','yes',24,'Basic'],
        ['kiwitobes','France','yes',23,'Basic'],
        ['google','UK','no',21,'Premium'],
        ['(direct)','New Zealand','no',12,'None'],
        ['(direct)','UK','no',21,'Basic'],
        ['google','USA','no',24,'Premium'],
        ['slashdot','France','yes',19,'None'],
        ['digg','USA','no',18,'None'],
        ['google','UK','no',18,'None'],
        ['kiwitobes','UK','no',19,'None'],
        ['digg','New Zealand','yes',12,'Basic'],
        ['slashdot','UK','no',21,'None'],
        ['google','UK','yes',18,'Basic'],
        ['kiwitobes','France','yes',19,'Basic']]


class decisionnode:
    def __init__(self,col=-1,value=None,result=None,tb=None,fb=None):
        self.col=col        # index of the column this node tests
        self.value=value    # value the column must match to follow the true branch
        self.result=result  # dict of outcomes for a leaf node; None for branch nodes
        self.tb=tb          # true branch
        self.fb=fb          # false branch
    
def divideset(rows,column,value):
    # Decide whether a row belongs to the first group (true) or the second (false)
    split_function=None
    if isinstance(value,int) or isinstance(value,float):
        split_function=lambda row: row[column]>=value
    else:
        split_function=lambda row: row[column]==value

    # Divide the rows into two sets and return them
    set1=[row for row in rows if split_function(row)]
    set2=[row for row in rows if not split_function(row)]
    return (set1,set2)
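
For example, splitting the toy data on column 2 (whether the visitor read the FAQ) puts 8 rows in each group (a quick check added here, not from the book):

set1,set2=divideset(my_data,2,'yes')
print(len(set1),len(set2))  # 8 8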

def uniquecounts(rows):
    results={}
    for row in rows:
        # The outcome to count is in the last column
        r=row[len(row)-1]
        if r not in results:
            results[r]=0
        results[r]+=1
    return results

# Gini impurity: the chance that a randomly placed item ends up in the wrong category
def giniimpurity(rows):
    total=len(rows)
    counts=uniquecounts(rows)
    imp=0
    for k1 in counts:
        p1=float(counts[k1])/total
        for k2 in counts:
            if k1==k2: continue
            p2=float(counts[k2])/total
            imp+=p1*p2
    return imp

# Entropy: the sum of -p(x)*log2(p(x)) over all outcomes
def entropy(rows):
    from math import log
    log2=lambda x: log(x)/log(2)
    results=uniquecounts(rows)
    ent=0.0
    for r in results:
        p=float(results[r])/len(rows)
        ent=ent-p*log2(p)
    return ent
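
As a quick sanity check (added here), both metrics drop on the purer 'yes' subset from the split above:

print(giniimpurity(my_data),entropy(my_data))  # about 0.633 and 1.505
set1,set2=divideset(my_data,2,'yes')
print(giniimpurity(set1),entropy(set1))        # about 0.531 and 1.299, both lower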

def buildtree(rows,scoref=entropy):
    if len(rows)==0:
        return decisionnode()
    current_score=scoref(rows)
    # Track the best split found so far
    best_gain=0
    best_criteria=None
    best_sets=None

    column_count=len(rows[0])-1
    for col in range(0,column_count):
        # Generate the set of distinct values in this column
        column_values={}
        for row in rows:
            column_values[row[col]]=1
        # Try dividing the rows on each value in this column
        for value in column_values:
            (set1,set2)=divideset(rows,col,value)
            # Information gain
            p=float(len(set1))/len(rows)
            gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
            if gain>best_gain and len(set1)>0 and len(set2)>0:
                best_gain=gain
                best_criteria=(col,value)
                best_sets=(set1,set2)
    # Create the subbranches
    if best_gain>0:
        trueBranch=buildtree(best_sets[0],scoref)
        falseBranch=buildtree(best_sets[1],scoref)
        return decisionnode(col=best_criteria[0],value=best_criteria[1],\
                            tb=trueBranch,fb=falseBranch)
    else:
        return decisionnode(result=uniquecounts(rows))
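
Because scoref is now passed down the recursion, switching the whole tree to Gini impurity is a one-line change (an added variant, not in the original post):

gtree=buildtree(my_data,scoref=giniimpurity)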
    

def printtree(tree,indent=''):
    # Is this a leaf node?
    if tree.result!=None:
        print(str(tree.result))
    else:
        # Print the split criteria
        print(str(tree.col)+':'+str(tree.value)+'? ')
        # Print the branches
        print(indent+'T->',end='')
        printtree(tree.tb,indent+"  ")
        print(indent+'F->',end='')
        printtree(tree.fb,indent+"  ")
tree=buildtree(my_data)
printtree(tree)
0:google? 
T->3:21? 
  T->{'Premium': 3}
  F->2:no? 
    T->{'None': 1}
    F->{'Basic': 1}
F->0:slashdot? 
  T->{'None': 3}
  F->2:yes? 
    T->{'Basic': 4}
    F->3:21? 
      T->{'Basic': 1}
      F->{'None': 3}

Classifying new observations

def classify(observation,tree):
    if tree.result!=None:
        return tree.result
    else:
        v=observation[tree.col]
        branch=None
        if isinstance(v,int) or isinstance(v,float):
            if v >= tree.value:
                branch=tree.tb
            else:
                branch=tree.fb
        else:
            if v== tree.value: branch=tree.tb
            else: branch=tree.fb
        return classify(observation,branch)
classify(['(direct)','USA','yes',5],tree)
{'Basic': 4}
def prune(tree,mingain):
    # If a branch is not a leaf, prune it recursively
    if tree.tb.result==None:
        prune(tree.tb,mingain)
    if tree.fb.result==None:
        prune(tree.fb,mingain)

    # If both subbranches are now leaves, see if they should be merged
    if tree.tb.result!=None and tree.fb.result!=None:
        # Build a combined data set
        tb,fb=[],[]
        for v,c in tree.tb.result.items():
            tb+=[[v]]*c
        for v,c in tree.fb.result.items():
            fb+=[[v]]*c
        # Compare the entropy of the merged set with the mean entropy of the two branches
        delta=entropy(tb+fb)-(entropy(tb)+entropy(fb))/2
        if delta<mingain:
            # Merge the branches
            tree.tb,tree.fb=None,None
            tree.result=uniquecounts(tb+fb)
prune(tree,1.0)
printtree(tree)
0:google? 
T->3:21? 
  T->{'Premium': 3}
  F->2:no? 
    T->{'None': 1}
    F->{'Basic': 1}
F->{'None': 6, 'Basic': 5}

Predicting on observations with missing data

def mdclassify(observation,tree):
    if tree.result!=None:
        return tree.result
    else:
        v=observation[tree.col]
        if v==None:
            tr,fr=mdclassify(observation,tree.tb),mdclassify(observation,\
                                                            tree.fb)
            tcount=sum(tr.values())
            fcount=sum(fr.values())
            tw=float(tcount)/(tcount+fcount)
            fw=float(fcount)/(tcount+fcount)
            result={}
            for k,v in tr.items(): result[k]=v*tw
            for k,v in fr.items():
                if k not in result:
                    result[k]=0
                result[k]+=v*fw
            return result
        else:
            if isinstance(v,int) or isinstance(v,float):
                if v>=tree.value: branch=tree.tb
                else: branch=tree.fb
            else:
                if v==tree.value: branch=tree.tb
                else: branch=tree.fb
            return mdclassify(observation,branch)
        
mdclassify(['google','None','yes',None],tree)
{'Premium': 2.25, 'Basic': 0.25}
mdclassify(['google','France',None,None],tree)
{'Premium': 2.25, 'None': 0.125, 'Basic': 0.125}
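
The second result can be traced by hand through the pruned tree above: 'google' sends the observation down the true branch to the 3:21 test. The age is missing, so both subbranches are followed. On the false side, the 2:no test also sees a missing value, so its leaves {'None': 1} and {'Basic': 1} are averaged with equal weights into {'None': 0.5, 'Basic': 0.5}. Back at the 3:21 node the true side holds 3 items and the false side 1, giving weights 0.75 and 0.25: Premium = 3 * 0.75 = 2.25, and None = Basic = 0.5 * 0.25 = 0.125.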

Titanic

Reading in the data, plus helper functions for preprocessing

import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split

trains=pd.read_csv(r"D:\%Learning\PYTHON\Titanic\train.csv")
tests=pd.read_csv(r"D:\%Learning\PYTHON\Titanic\test.csv")

def sex(row):
    if row.Sex=='male':
        row.Sex=0
    else:
        row.Sex=1
    return row

def Pclass(row):
    if row.Pclass==1:
        row.Pclass=1
    else:
        row.Pclass=0
    return row

def PclassSex(row):
    if row.Sex=='male':
        row.Pclass+=3
    return row

# Flag children: Age below 15 becomes 1, everything else 0
def age(row):
    if row.Age<15:
        row.Age=1
    else:
        row.Age=0
    return row

# Flag whether the ticket's first character is in a group with higher survival
def Tickk(row):
    if row.Ticket in ['P','1','2']:
        row.Ticket=1
    else:
        row.Ticket=0
    return row

def Fare(row):
    if row.Fare>31.0:
        row.Fare=1
    elif row.Fare>14.45:
        row.Fare=0.75
    elif row.Fare>7.91:
        row.Fare=0.5
    else:
        row.Fare=0
    return row


def cabin(row):
    if row.Cabin != 0:
        row.Cabin=1
    return row

def family(row):
    if row.Family>=4:
        row.Family=1
    else:
        row.Family=0
    return row

def married(row):
    name=row.Name
    for i in name:
        if i == '(':
            row.Name=0
            return row
    row.Name=1
    return row

def embark(row):
    if row.Sex=='female':
        if row.Embarked=='C':
            row.Embarked=5
        elif row.Embarked=='Q':
            row.Embarked=4
        elif row.Embarked=='S':
            row.Embarked=3
    elif row.Sex=='male':
        if row.Embarked=='C':
            row.Embarked=2
        elif row.Embarked=='S':
            row.Embarked=1
        elif row.Embarked=='Q':
            row.Embarked=0
    return row
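
All of these helpers are written to be applied row by row via DataFrame.apply with axis='columns', as done below. For a simple recoding like sex, a vectorized map is equivalent (an added illustration using a hypothetical demo frame):

demo=pd.DataFrame({'Sex':['male','female'],'Pclass':[1,3]})
print(demo.Sex.map({'male':0,'female':1}))  # vectorized recoding
print(demo.apply(sex,axis='columns'))       # row-wise helper, same effect on Sex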


use_features=['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch'\
    ,'Fare','Ticket','Cabin','Embarked']

Train=trains[use_features]
  • A first look at the data
Train.head()
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch     Fare            Ticket Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0   7.2500         A/5 21171   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0  71.2833          PC 17599   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0   7.9250  STON/O2. 3101282   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0  53.1000            113803  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0   8.0500            373450   NaN        S

Data cleaning and feature engineering

# Fill missing Embarked values with 'S', the most common port
Train.loc[:,'Embarked']=Train.Embarked.fillna('S')
# Replace Ticket with its first character
Train['Ticket']=Train.Ticket.map(lambda x: x[0])

# Fill missing ages with the mean (about 30)
Train.loc[:,'Age']=Train.Age.fillna(30.0)
# Age feature scaling (an unused alternative)
#Train.loc[:,'Age']=Train.Age.map(lambda x: (x-30.0)/80)
# Set Age to 1 for children and 0 for everyone else
Train=Train.apply(age,axis='columns')

# Fare feature scaling
Train=Train.apply(Fare,axis='columns')

# Add the Cabin feature: 1 if a cabin is recorded, 0 otherwise
Train.loc[:,'Cabin']=Train.Cabin.fillna(0)
Train=Train.apply(cabin,axis='columns')
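
Every column should now be complete; a quick check added here:

print(Train.isnull().sum())  # all counts should be 0 after the fills above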

The features used are Pclass, Sex, Age, Fare, Cabin and Embarked.

set_features=['Pclass','Sex','Age',\
    'Fare','Cabin','Embarked','Survived']
Train=Train[set_features]

Convert the DataFrame to a plain Python list of rows

tlist=Train.values.tolist()
  • Building the decision tree
tree=buildtree(tlist)
printtree(tree)
1:female? 
T->0:3? 
  T->5:S? 
    T->3:1.0? 
      T->2:1? 
        T->{0: 5, 1: 1}
        F->{1: 1, 0: 9}
      F->3:0.75? 
        T->4:1? 
          T->{1: 2}
          F->2:1? 
            T->{0: 5, 1: 1}
            F->{0: 13, 1: 7}
        F->2:1? 
          T->3:0.5? 
            T->4:1? 
              T->{0: 1}
              F->{1: 3}
            F->{0: 1}
          F->3:0.5? 
            T->4:1? 
              T->{0: 1, 1: 1}
              F->{1: 9, 0: 14}
            F->{0: 6, 1: 8}
    F->3:0.75? 
      T->4:1? 
        T->{1: 1}
        F->5:Q? 
          T->{1: 5, 0: 2}
          F->2:1? 
            T->{0: 2, 1: 4}
            F->{0: 6, 1: 4}
      F->5:C? 
        T->{1: 6}
        F->3:0.5? 
          T->{1: 1, 0: 1}
          F->{1: 18, 0: 6}
  F->3:1.0? 
    T->2:1? 
      T->0:2? 
        T->{1: 3}
        F->{0: 1, 1: 1}
      F->5:S? 
        T->4:1? 
          T->{1: 37, 0: 1}
          F->{1: 10}
        F->{1: 42}
    F->2:1? 
      T->{1: 7}
      F->5:Q? 
        T->{1: 2}
        F->4:1? 
          T->5:S? 
            T->3:0.75? 
              T->{1: 5}
              F->{1: 6, 0: 1}
            F->3:0.75? 
              T->{0: 1, 1: 1}
              F->{1: 1}
          F->5:S? 
            T->3:0.75? 
              T->{1: 27, 0: 3}
              F->{1: 15, 0: 2}
            F->{1: 4}
F->4:1? 
  T->2:1? 
    T->{1: 7}
    F->3:0.5? 
      T->3:1.0? 
        T->5:C? 
          T->{0: 11, 1: 12}
          F->5:S? 
            T->{0: 27, 1: 9}
            F->{0: 1}
        F->0:3? 
          T->{1: 1}
          F->0:2? 
            T->5:S? 
              T->3:0.75? 
                T->{0: 1}
                F->{1: 1}
              F->{0: 1}
            F->5:C? 
              T->{0: 4, 1: 4}
              F->{0: 9, 1: 11}
      F->{0: 8}
  F->2:1? 
    T->0:3? 
      T->3:0.75? 
        T->5:S? 
          T->3:1.0? 
            T->{0: 10, 1: 1}
            F->{0: 3, 1: 4}
          F->{0: 5}
        F->{1: 3}
      F->{1: 6}
    F->3:0.5? 
      T->0:2? 
        T->5:C? 
          T->0:3? 
            T->3:0.75? 
              T->{0: 4, 1: 4}
              F->{0: 2}
            F->3:0.75? 
              T->{0: 7}
              F->{1: 1}
          F->3:1.0? 
            T->0:3? 
              T->{0: 9, 1: 5}
              F->{0: 8}
            F->3:0.75? 
              T->5:S? 
                T->0:3? 
                  T->{0: 29}
                  F->{0: 19, 1: 1}
                F->{0: 6, 1: 1}
              F->0:3? 
                T->5:S? 
                  T->{0: 77, 1: 13}
                  F->{0: 1}
                F->5:S? 
                  T->{1: 5, 0: 48}
                  F->{0: 1}
        F->5:C? 
          T->3:1.0? 
            T->{0: 6}
            F->{0: 4, 1: 1}
          F->3:1.0? 
            T->{0: 3, 1: 1}
            F->{0: 6, 1: 4}
      F->0:3? 
        T->5:C? 
          T->{0: 26, 1: 4}
          F->5:S? 
            T->{0: 100, 1: 8}
            F->{0: 24, 1: 2}
        F->{0: 8}

Cleaning and processing the test data

tests_features=['Pclass','Name','Sex','Age','SibSp','Parch'\
    ,'Fare','Ticket','Cabin','Embarked']
Test=tests[tests_features]
# Fill missing Embarked values with 'S'
Test.loc[:,'Embarked']=Test.Embarked.fillna('S')
# Replace Ticket with its first character
Test['Ticket']=Test.Ticket.map(lambda x: x[0])

# Fill missing ages with the mean
Test.loc[:,'Age']=Test.Age.fillna(30.0)

# Set Age to 1 for children and 0 for everyone else
Test=Test.apply(age,axis='columns')

# Fare feature scaling (a missing Fare fails every comparison and falls to 0)
Test=Test.apply(Fare,axis='columns')

# Add the Cabin feature
Test.loc[:,'Cabin']=Test.Cabin.fillna(0)
Test=Test.apply(cabin,axis='columns')
set_features=['Pclass','Sex','Age',\
    'Fare','Cabin','Embarked']
Test=Test[set_features]
testlist=Test.values.tolist()
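
A consistency check added here: the test columns must be in the same order the tree was trained on, and no NaNs should remain:

assert Test.columns.tolist()==['Pclass','Sex','Age','Fare','Cabin','Embarked']
assert not Test.isnull().any().any()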

Classify each row and record the prediction in result

result=[]
for i in range(len(testlist)):
    res=classify(testlist[i],tree)
    if 1 in res and 0 in res:
        if res[0]>res[1]:
            result.append(0)
        else:
            result.append(1)
    else:
        result.append(list(res.keys())[0])
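
Since classify returns a dict of leaf counts, the vote can also be written with max. Note that the loop above breaks ties in favor of survival, while max falls back on dict order (an equivalent sketch using a hypothetical result_alt):

result_alt=[]
for row in testlist:
    res=classify(row,tree)
    result_alt.append(max(res,key=res.get))
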
output=np.asarray(result)
output
array([0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])

Save the results and submit

output = pd.DataFrame({'PassengerId': tests.PassengerId,
                       'Survived': result})
output.Survived=output.Survived.astype('int64')
output.to_csv(r'D:\%Learning\PYTHON\Titanic\self_made_decisiontree.csv', index=False)

The final Kaggle score is 0.75119, roughly on par with sklearn's LogisticRegression on the same data.
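
Incidentally, the train_test_split imported at the top is never used; a rough local hold-out estimate for the custom tree could look like this (a sketch added here, not part of the submission pipeline):

train_rows,valid_rows=train_test_split(tlist,test_size=0.2,random_state=0)
vtree=buildtree(train_rows)
correct=0
for row in valid_rows:
    res=classify(row,vtree)       # the trailing Survived column is simply ignored
    pred=max(res,key=res.get)
    correct+=int(pred==row[-1])
print('hold-out accuracy:',correct/len(valid_rows))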

Pruning the tree with prune

prune(tree,1.0)
printtree(tree)
1:female? 
T->{0: 81, 1: 233}
F->4:1? 
  T->2:1? 
    T->{1: 7}
    F->3:0.5? 
      T->3:1.0? 
        T->{0: 39, 1: 21}
        F->0:3? 
          T->{1: 1}
          F->0:2? 
            T->5:S? 
              T->3:0.75? 
                T->{0: 1}
                F->{1: 1}
              F->{0: 1}
            F->{0: 13, 1: 15}
      F->{0: 8}
  F->{0: 406, 1: 64}

The tree has been trimmed back substantially: female passengers are now classified directly as very likely survivors. Pruning like this helps when a model overfits, but since this model does not overfit much, the pruning is not really needed here.
