使用Collective Intelligence 中的决策树建模训练kaggle titanic预测模型
Collective Intelligence 第七章,决策树建模
# Toy dataset ("Programming Collective Intelligence", ch. 7):
# columns are [referrer, country, read-FAQ, pages viewed, service chosen],
# the last column being the class label.
my_data=[['slashdot','USA','yes',18,'None'],
['google','France','yes',23,'Premium'],
['digg','USA','yes',24,'Basic'],
['kiwitobes','France','yes',23,'Basic'],
['google','UK','no',21,'Premium'],
['(direct)','New Zealand','no',12,'None'],
['(direct)','UK','no',21,'Basic'],
['google','USA','no',24,'Premium'],
['slashdot','France','yes',19,'None'],
['digg','USA','no',18,'None'],
['google','UK','no',18,'None'],
['kiwitobes','UK','no',19,'None'],
['digg','New Zealand','yes',12,'Basic'],
['slashdot','UK','no',21,'None'],
['google','UK','yes',18,'Basic'],
['kiwitobes','France','yes',19,'Basic']]
class decisionnode:
    """One node of a decision tree.

    Internal nodes carry a split (col, value) plus true/false subtrees;
    leaves carry `result`, a dict mapping class label -> count.
    """
    # NOTE(review): the reference implementation defaults col to -1;
    # confirm the default of 1 here is intentional (it is only reached
    # for the empty-rows leaf, where col is never read).
    def __init__(self,col=1,value=None,result=None,tb=None,fb=None):
        self.col=col        # index of the column tested at this node
        self.value=value    # split value (>= for numbers, == otherwise)
        self.result=result  # leaf only: label -> count dict, else None
        self.tb=tb          # branch followed when the test is true
        self.fb=fb          # branch followed when the test is false
def divideset(rows, column, value):
    """Split rows on one column.

    Numeric values split by `>= value`, anything else by equality.
    Returns a (matching, non_matching) pair of row lists, preserving
    the original row order within each group.
    """
    if isinstance(value, int) or isinstance(value, float):
        matches = lambda row: row[column] >= value
    else:
        matches = lambda row: row[column] == value
    true_rows, false_rows = [], []
    for row in rows:
        (true_rows if matches(row) else false_rows).append(row)
    return (true_rows, false_rows)
def uniquecounts(rows):
    """Tally the class labels (last column) in rows.

    Returns a dict mapping label -> number of occurrences.
    """
    tally = {}
    for row in rows:
        label = row[-1]  # the label lives in the final column
        tally[label] = tally.get(label, 0) + 1
    return tally
# Gini impurity
def giniimpurity(rows):
    """Probability that two rows drawn at random carry different labels.

    Sums p(k1)*p(k2) over all ordered pairs of distinct labels; 0 for
    an empty or single-class row set. (Label counting is done inline,
    equivalent to uniquecounts.)
    """
    total = len(rows)
    counts = {}
    for row in rows:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    imp = 0
    for k1, c1 in counts.items():
        p1 = c1 / total
        for k2, c2 in counts.items():
            if k1 == k2:
                continue
            imp += p1 * (c2 / total)
    return imp
# Shannon entropy
def entropy(rows):
    """Base-2 entropy of the class labels (last column) in rows.

    Returns 0.0 for an empty row set. (Label counting is done inline,
    equivalent to uniquecounts.)
    """
    from math import log
    counts = {}
    for row in rows:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    total = len(rows)
    ent = 0.0
    for c in counts.values():
        p = c / total
        ent -= p * (log(p) / log(2))
    return ent
def buildtree(rows, scoref=entropy):
    """Recursively grow a decision tree over `rows`.

    `scoref` measures impurity (entropy by default; giniimpurity also
    works). Returns a decisionnode: internal nodes hold the best split,
    leaves hold the label counts for the remaining rows.

    Bug fix: the original recursed with `buildtree(best_sets[i])`,
    dropping `scoref`, so any non-default score function silently
    reverted to entropy below the root. `scoref` is now propagated.
    """
    if len(rows) == 0:
        return decisionnode()
    current_score = scoref(rows)
    # Track the best split found so far.
    best_gain = 0
    best_criteria = None
    best_sets = None
    column_count = len(rows[0]) - 1  # last column is the class label
    for col in range(0, column_count):
        # Collect the distinct values appearing in this column
        # (dict keeps first-seen order, matching the original).
        column_values = {}
        for row in rows:
            column_values[row[col]] = 1
        # Try splitting on each distinct value of this column.
        for value in list(column_values.keys()):
            (set1, set2) = divideset(rows, col, value)
            # Information gain of this candidate split.
            p = float(len(set1)) / len(rows)
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)
    # Recurse on the winning split, or emit a leaf if nothing helps.
    if best_gain > 0:
        trueBranch = buildtree(best_sets[0], scoref)
        falseBranch = buildtree(best_sets[1], scoref)
        return decisionnode(col=best_criteria[0], value=best_criteria[1],
                            tb=trueBranch, fb=falseBranch)
    else:
        return decisionnode(result=uniquecounts(rows))
def printtree(tree, indent=''):
    """Pretty-print a tree: leaves show their label-count dict,
    internal nodes show 'col:value? ' followed by T->/F-> branches.

    Fix: compare `result` to None with `is not None` (PEP 8 / E711)
    instead of `!=`.
    """
    # Leaf node: result holds the label counts.
    if tree.result is not None:
        print(str(tree.result))
    else:
        # Print the split condition.
        print(str(tree.col) + ':' + str(tree.value) + '? ')
        # Print both branches, one indent level deeper.
        print(indent + 'T->', end='')
        printtree(tree.tb, indent + " ")
        print(indent + 'F->', end='')
        printtree(tree.fb, indent + " ")
# Train a demo tree on the toy referrer dataset and display it.
tree=buildtree(my_data)
printtree(tree)
0:google?
T->3:21?
T->{'Premium': 3}
F->2:no?
T->{'None': 1}
F->{'Basic': 1}
F->0:slashdot?
T->{'None': 3}
F->2:yes?
T->{'Basic': 4}
F->3:21?
T->{'Basic': 1}
F->{'None': 3}
对新的观测数据进行分类
def classify(observation, tree):
    """Route one observation down the tree.

    Returns the reached leaf's label-count dict (e.g. {'Basic': 4}).
    Numeric split values branch on >=, all other values on equality.

    Fix: compare `result` to None with `is not None` (PEP 8 / E711).
    """
    if tree.result is not None:
        return tree.result
    v = observation[tree.col]
    if isinstance(v, (int, float)):
        branch = tree.tb if v >= tree.value else tree.fb
    else:
        branch = tree.tb if v == tree.value else tree.fb
    return classify(observation, branch)
# Example: a direct visitor from the USA who read the FAQ, 5 pages viewed.
classify(['(direct)','USA','yes',5],tree)
{'Basic': 4}
def prune(tree, mingain):
    """Bottom-up pruning: merge sibling leaves whose split buys less
    than `mingain` of entropy reduction. Mutates the tree in place.

    Bug fix: the original computed
        entropy(tb+fb) - (entropy(tb) + entropy(fb)/2)
    which halves only the false branch's entropy. The intended delta is
    versus the *average* of the two branch entropies:
        entropy(tb+fb) - (entropy(tb) + entropy(fb)) / 2
    (None comparisons also switched to `is`/`is not` per PEP 8.)
    """
    # First recurse into any branch that is not a leaf.
    if tree.tb.result is None:
        prune(tree.tb, mingain)
    if tree.fb.result is None:
        prune(tree.fb, mingain)
    # If both branches are now leaves, consider merging them.
    if tree.tb.result is not None and tree.fb.result is not None:
        # Rebuild label-only rows from each leaf's counts.
        tb, fb = [], []
        for v, c in tree.tb.result.items():
            tb += [[v]] * c
        for v, c in tree.fb.result.items():
            fb += [[v]] * c
        # Entropy reduction achieved by keeping the split.
        delta = entropy(tb + fb) - (entropy(tb) + entropy(fb)) / 2
        if delta < mingain:
            # Merge the branches: this node becomes a leaf.
            tree.tb, tree.fb = None, None
            tree.result = uniquecounts(tb + fb)
# Prune aggressively (mingain=1.0) and display the reduced tree.
prune(tree,1.0)
printtree(tree)
0:google?
T->3:21?
T->{'Premium': 3}
F->2:no?
T->{'None': 1}
F->{'Basic': 1}
F->{'None': 6, 'Basic': 5}
对有缺失数据的样本进行预测
def mdclassify(observation, tree):
    """Classify an observation that may contain missing (None) fields.

    When the field tested at a node is None, both branches are followed
    and their predictions combined, each weighted by that branch's share
    of training rows; otherwise behaves like classify().

    Fixes: `v == None` -> `v is None` (PEP 8 / E711), and the merge
    loops no longer shadow the observation value `v`.
    """
    if tree.result is not None:
        return tree.result
    v = observation[tree.col]
    if v is None:
        # Missing value: evaluate both subtrees and blend the results.
        tr = mdclassify(observation, tree.tb)
        fr = mdclassify(observation, tree.fb)
        tcount = sum(tr.values())
        fcount = sum(fr.values())
        tw = float(tcount) / (tcount + fcount)
        fw = float(fcount) / (tcount + fcount)
        result = {}
        for k, cnt in tr.items():
            result[k] = cnt * tw
        for k, cnt in fr.items():
            if k not in result:
                result[k] = 0
            result[k] += cnt * fw
        return result
    # Value present: take a single branch, as in classify().
    if isinstance(v, (int, float)):
        branch = tree.tb if v >= tree.value else tree.fb
    else:
        branch = tree.tb if v == tree.value else tree.fb
    return mdclassify(observation, branch)
# Predictions with missing fields: both branches are followed and
# their counts blended by branch weight.
mdclassify(['google','None','yes',None],tree)
{'Premium': 2.25, 'Basic': 0.25}
mdclassify(['google','France',None,None],tree)
{'Premium': 2.25, 'None': 0.125, 'Basic': 0.125}
Titanic
读入数据与对数据进行处理的自定义函数
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the Kaggle Titanic data. Raw strings keep every backslash in the
# Windows paths literal — the original mixed "\\" and bare "\" and only
# worked because none of the bare escapes (\%, \P, \T) are recognized
# escape sequences.
trains=pd.read_csv(r"D:\%Learning\PYTHON\Titanic\train.csv")
tests=pd.read_csv(r"D:\%Learning\PYTHON\Titanic\test.csv")
def sex(row):
    """Encode Sex in place: 'male' -> 0, anything else -> 1."""
    row.Sex = 0 if row.Sex == 'male' else 1
    return row
def Pclass(row):
    """Binarize Pclass in place: first class -> 1, any other -> 0."""
    row.Pclass = 1 if row.Pclass == 1 else 0
    return row
def PclassSex(row):
    """Shift male passengers' Pclass up by 3, producing a combined
    class/sex code; female rows pass through unchanged."""
    if row.Sex != 'male':
        return row
    row.Pclass += 3
    return row
# Flag whether the passenger is a child.
def age(row):
    """Replace Age in place with a child flag: Age < 15 -> 1, else 0.
    (NaN compares False, so missing ages fall into the 0 bucket.)"""
    row.Age = 1 if row.Age < 15 else 0
    return row
# Flag whether the ticket is in the higher-survival-rate set.
def Tickk(row):
    """Replace Ticket in place: 1 when its (already-truncated) code is
    one of 'P', '1', '2', otherwise 0."""
    row.Ticket = 1 if row.Ticket in ('P', '1', '2') else 0
    return row
def Fare(row):
    """Bucket Fare in place into ordinal scores.

    > 31.0 -> 1, > 14.45 -> 0.75, > 7.91 -> 0.5, otherwise 0
    (NaN fares compare False everywhere and land in the 0 bucket).
    """
    for limit, score in ((31.0, 1), (14.45, 0.75), (7.91, 0.5)):
        if row.Fare > limit:
            row.Fare = score
            return row
    row.Fare = 0
    return row
def cabin(row):
    """Binarize Cabin in place: 0 stays 0 (no cabin recorded after the
    fillna(0) step), any other value (a cabin code) becomes 1."""
    row.Cabin = 0 if row.Cabin == 0 else 1
    return row
def family(row):
    """Flag large families in place: Family >= 4 -> 1, else 0.

    NOTE(review): no 'Family' column is created anywhere in this
    script — confirm this helper is unused or that the column is
    derived (e.g. SibSp + Parch) elsewhere.
    """
    row.Family = 1 if row.Family >= 4 else 0
    return row
def married(row):
    """Replace Name in place with a marriage flag: 0 when the name
    contains '(' (married women's entries carry the maiden name in
    parentheses), otherwise 1."""
    row.Name = 0 if '(' in row.Name else 1
    return row
def embark(row):
    """Encode Embarked in place as a sex-dependent ordinal.

    female: C -> 5, Q -> 4, S -> 3;  male: C -> 2, S -> 1, Q -> 0.
    Rows with any other Sex value or an unrecognized/missing port are
    left unchanged, matching the original if/elif chain.
    """
    codes = {
        'female': {'C': 5, 'Q': 4, 'S': 3},
        'male': {'C': 2, 'S': 1, 'Q': 0},
    }
    table = codes.get(row.Sex)
    if table is not None and row.Embarked in table:
        row.Embarked = table[row.Embarked]
    return row
# Columns kept from the raw training frame for cleaning / feature work.
use_features=['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch'\
,'Fare','Ticket','Cabin','Embarked']
Train=trains[use_features]
- 数据一览
Train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Fare | Ticket | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | 7.2500 | A/5 21171 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | 71.2833 | PC 17599 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | 7.9250 | STON/O2. 3101282 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 53.1000 | 113803 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 8.0500 | 373450 | NaN | S |
数据清洗与特征处理
# Data cleaning / feature engineering for the training frame.
# Train is a column selection from `trains`, so chained calls like
# Train.Embarked.fillna(..., inplace=True) can act on a temporary copy
# (SettingWithCopyWarning) and silently change nothing; every write
# below therefore goes through explicit column assignment or .loc.
# Missing embarkation port -> 'S', the most common value.
Train['Embarked'] = Train.Embarked.fillna('S')
# Reduce Ticket to its first character.
Train['Ticket'] = Train.Ticket.map(lambda x: x[0])
# Fill missing ages with (roughly) the mean age.
Train.loc[:,'Age'] = Train.Age.fillna(30.0)
# Child flag: Age -> 1 for under-15s, 0 otherwise.
Train = Train.apply(age, axis='columns')
# Bucket fares into 0 / 0.5 / 0.75 / 1.
Train = Train.apply(Fare, axis='columns')
# Cabin presence flag: fill missing with 0, then binarize.
Train.loc[:,'Cabin'] = Train.Cabin.fillna(0)
Train = Train.apply(cabin, axis='columns')
选择了Pclass, Sex, Age, Fare, Cabin, Embarked特征使用
# Final training feature set; 'Survived' is last because buildtree
# expects the class label in the final column.
set_features=['Pclass','Sex','Age',\
'Fare','Cabin','Embarked','Survived']
Train=Train[set_features]
将 DataFrame 转换为 Python 内置的 list
# Convert the cleaned frame to plain Python row lists for buildtree.
tlist=Train.values.tolist()
- Decision Tree 的建立
# Train the decision tree on the Titanic feature rows and display it
# (rebinds `tree`, replacing the earlier toy-data tree).
tree=buildtree(tlist)
printtree(tree)
1:female?
T->0:3?
T->5:S?
T->3:1.0?
T->2:1?
T->{0: 5, 1: 1}
F->{1: 1, 0: 9}
F->3:0.75?
T->4:1?
T->{1: 2}
F->2:1?
T->{0: 5, 1: 1}
F->{0: 13, 1: 7}
F->2:1?
T->3:0.5?
T->4:1?
T->{0: 1}
F->{1: 3}
F->{0: 1}
F->3:0.5?
T->4:1?
T->{0: 1, 1: 1}
F->{1: 9, 0: 14}
F->{0: 6, 1: 8}
F->3:0.75?
T->4:1?
T->{1: 1}
F->5:Q?
T->{1: 5, 0: 2}
F->2:1?
T->{0: 2, 1: 4}
F->{0: 6, 1: 4}
F->5:C?
T->{1: 6}
F->3:0.5?
T->{1: 1, 0: 1}
F->{1: 18, 0: 6}
F->3:1.0?
T->2:1?
T->0:2?
T->{1: 3}
F->{0: 1, 1: 1}
F->5:S?
T->4:1?
T->{1: 37, 0: 1}
F->{1: 10}
F->{1: 42}
F->2:1?
T->{1: 7}
F->5:Q?
T->{1: 2}
F->4:1?
T->5:S?
T->3:0.75?
T->{1: 5}
F->{1: 6, 0: 1}
F->3:0.75?
T->{0: 1, 1: 1}
F->{1: 1}
F->5:S?
T->3:0.75?
T->{1: 27, 0: 3}
F->{1: 15, 0: 2}
F->{1: 4}
F->4:1?
T->2:1?
T->{1: 7}
F->3:0.5?
T->3:1.0?
T->5:C?
T->{0: 11, 1: 12}
F->5:S?
T->{0: 27, 1: 9}
F->{0: 1}
F->0:3?
T->{1: 1}
F->0:2?
T->5:S?
T->3:0.75?
T->{0: 1}
F->{1: 1}
F->{0: 1}
F->5:C?
T->{0: 4, 1: 4}
F->{0: 9, 1: 11}
F->{0: 8}
F->2:1?
T->0:3?
T->3:0.75?
T->5:S?
T->3:1.0?
T->{0: 10, 1: 1}
F->{0: 3, 1: 4}
F->{0: 5}
F->{1: 3}
F->{1: 6}
F->3:0.5?
T->0:2?
T->5:C?
T->0:3?
T->3:0.75?
T->{0: 4, 1: 4}
F->{0: 2}
F->3:0.75?
T->{0: 7}
F->{1: 1}
F->3:1.0?
T->0:3?
T->{0: 9, 1: 5}
F->{0: 8}
F->3:0.75?
T->5:S?
T->0:3?
T->{0: 29}
F->{0: 19, 1: 1}
F->{0: 6, 1: 1}
F->0:3?
T->5:S?
T->{0: 77, 1: 13}
F->{0: 1}
F->5:S?
T->{1: 5, 0: 48}
F->{0: 1}
F->5:C?
T->3:1.0?
T->{0: 6}
F->{0: 4, 1: 1}
F->3:1.0?
T->{0: 3, 1: 1}
F->{0: 6, 1: 4}
F->0:3?
T->5:C?
T->{0: 26, 1: 4}
F->5:S?
T->{0: 100, 1: 8}
F->{0: 24, 1: 2}
F->{0: 8}
对test数据进行清洗和处理
# Columns taken from the raw test frame.
tests_features=['Pclass','Name','Sex','Age','SibSp','Parch'\
,'Fare','Ticket','Cabin','Embarked']
Test=tests[tests_features]
# Same cleaning as for Train. Explicit column assignment replaces the
# chained inplace fillna, which can act on a temporary copy of a
# sliced frame (SettingWithCopyWarning) and silently change nothing.
# Missing embarkation port -> 'S'.
Test['Embarked'] = Test.Embarked.fillna('S')
# Reduce Ticket to its first character.
Test['Ticket'] = Test.Ticket.map(lambda x: x[0])
# Fill missing ages with (roughly) the mean age.
Test.loc[:,'Age'] = Test.Age.fillna(30.0)
# Child flag: Age -> 1 for under-15s, 0 otherwise.
Test = Test.apply(age, axis='columns')
# Bucket fares into 0 / 0.5 / 0.75 / 1.
Test = Test.apply(Fare, axis='columns')
# Cabin presence flag.
Test.loc[:,'Cabin'] = Test.Cabin.fillna(0)
Test = Test.apply(cabin, axis='columns')
# Keep the same feature order used in training (minus Survived).
set_features=['Pclass','Sex','Age',\
'Fare','Cabin','Embarked']
Test=Test[set_features]
testlist=Test.values.tolist()
对每一行进行判断并输入result中
# Classify every test row. When the reached leaf contains both classes,
# take the majority label (ties resolve to 1, matching the original
# `if res[0] > res[1]` logic); single-class leaves yield their only key.
# Iterates rows directly instead of the range(len(...)) anti-idiom.
result=[]
for observation in testlist:
    res = classify(observation, tree)
    if 1 in res and 0 in res:
        result.append(0 if res[0] > res[1] else 1)
    else:
        result.append(list(res.keys())[0])
output=np.asarray(result)
output
output
array([0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
保存结果并上交
# Assemble the Kaggle submission frame and write it as CSV.
output = pd.DataFrame({'PassengerId': tests.PassengerId,
                       'Survived': result})
# Kaggle expects integer labels.
output.Survived = output.Survived.astype('int64')
# Raw string keeps every backslash in the Windows path literal
# (the original relied on \%, \P, \T not being escape sequences).
output.to_csv(r'D:\%Learning\PYTHON\Titanic\self_made_decisiontree.csv', index=False)
最终kaggle评分为0.75119,这与用sklearn的LogisticRegression的效果差不多。
使用prune对树进行“修剪”
prune(tree,1.0)
printtree(tree)
1:female?
T->{0: 81, 1: 233}
F->4:1?
T->2:1?
T->{1: 7}
F->3:0.5?
T->3:1.0?
T->{0: 39, 1: 21}
F->0:3?
T->{1: 1}
F->0:2?
T->5:S?
T->3:0.75?
T->{0: 1}
F->{1: 1}
F->{0: 1}
F->{0: 13, 1: 15}
F->{0: 8}
F->{0: 406, 1: 64}
可见整个决策树被删减了很多,女性乘客直接被判断为大概率生还。这个方法在模型overfit的时候有用,但由于该模型并没有overfit所以不需要prune