I. Data Source
1. Source: Kaggle competitions
2. Overview of the task
The competition provides two files: one training set and one test set.
A model is trained on the training set and then used to predict whether each passenger in the test set survived; the submission is matched against the actual outcomes and scored.
3. Data layout and a quick look at the data
1) Training set
The target column is 'Survived': 0 means the passenger died, 1 means they survived.
Cabin: too many missing values, so the column is dropped.
Ticket: the ticket number; the class information is already captured by Pclass, so it is dropped as well.
Name: this post only does a simple analysis, so it is dropped too.
The columns with missing values (Age and Embarked) are forward-filled later.
2) Test set
The test set contains the same feature columns. After the model is trained on the training set, survival is predicted for every passenger, so a 'Survived' column has to be added.
II. Method
CART classification decision tree
About the method:
A thorough explanation would take considerable space, so it is not given in this article; local explanations appear in the implementation below.
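In short, CART grows a binary tree by repeatedly picking the feature/value split that minimises the Gini impurity, Gini(g) = 1 - sum_k p_k^2, weighted by group size. A minimal illustration with toy labels (independent of the code below):
# Minimal illustration of the Gini impurity that CART minimises (toy labels)
def gini(labels):
    n = float(len(labels))
    return 1.0 - sum((labels.count(c) / n) ** 2 for c in set(labels))

print(gini(['0', '0', '1', '1']))  # 0.5 : maximally impure two-class group
print(gini(['1', '1', '1', '1']))  # 0.0 : pure group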
III. Implementation
Everything starts from the raw CSV files and is derived by hand; no machine-learning libraries are called (pandas is used only for preprocessing and for writing the submission file).
1. Import basic libraries
#1. Import basic libraries
from random import seed
from random import randrange
from csv import reader
import copy
import pandas as pd
2. Data preprocessing
After preprocessing, the data is saved to new CSV files; later the model simply reads those files.
# Preprocessing the training data
train_file='./download_datas/predict_servival_on_Titanic/train.csv'
df_train=pd.read_csv(train_file,index_col='PassengerId')
df_train['Survived_col']=df_train['Survived'] # move the label to the last column for the model's convenience
df_train.drop(['Name','Cabin','Survived','Ticket'],axis=1,inplace=True) # drop unused columns
df_train.ffill(inplace=True) # forward-fill missing values (fillna(method='ffill') is deprecated)
df_train.to_csv('./download_datas/predict_servival_on_Titanic/train_2.csv') # save the file
# Preprocessing the test data
test_file='./download_datas/predict_servival_on_Titanic/test.csv'
df_test=pd.read_csv(test_file,index_col='PassengerId')
df_test['Survived']=None # placeholder for the predictions
df_test.drop(['Name','Cabin','Ticket'],axis=1,inplace=True) # drop unused columns
df_test.ffill(inplace=True) # forward-fill missing values
df_test.to_csv('./download_datas/predict_servival_on_Titanic/test_2.csv') # save the file
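An optional sanity check, not part of the original post, that the saved training file no longer contains missing values:
# Optional: verify that no missing values remain in the processed training file
print(pd.read_csv('./download_datas/predict_servival_on_Titanic/train_2.csv').isna().sum())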
3. Loading the CSV and converting data types
When building the model, Age and Fare are split with a "less than" test, while every other column uses an "equal / not equal" test;
so only Age and Fare are converted to float here.
#3. Load the CSV file and convert data types
def csv_loader(file):
dataset=list()
with open(file,'r') as f:
csv_reader=reader(f)
for row in csv_reader:
if not row:
continue
dataset.append(row)
return dataset
# Later split conditions choose between '<' and '== / !=', so only Age and Fare are converted to float
def str_to_float_converter(dataset):
    dataset=dataset[1:] # skip the header row; the row lists themselves are mutated in place
    for row in dataset:
        row[2]=float(row[2].strip()) # Age (after PassengerId has been removed)
        row[5]=float(row[5].strip()) # Fare
4. Splitting the data for k-fold cross-validation
#4. Split the data for k-fold cross-validation
def k_fold_split(dataset,n_folds):
    dataset_split=list()
    folds_size=int(len(dataset)/n_folds) # integer division: any leftover rows are dropped
    dataset_copy=list(dataset)
    for _ in range(n_folds): # the loop index is unused
        fold=list()
        while len(fold) < folds_size:
            index= randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
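A quick usage check (assuming k_fold_split above is defined): ten rows split into three folds of three, with the leftover row dropped.
# Quick check: 10 rows -> 3 folds of size 3; the leftover row is dropped
folds = k_fold_split(list(range(10)), 3)
print([len(f) for f in folds])  # [3, 3, 3]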
5. Computing accuracy
#5. Compute accuracy
def accuracy_model(actual,predicted):
correct=0
for i in range(len(actual)):
if actual[i]== predicted[i]:
correct +=1
accuracy = correct/float(len(actual)) * 100.0
return accuracy
6. Evaluating the model with k-fold cross-validation
#6. Evaluate the model's accuracy with k-fold cross-validation
def test_model(dataset,algo,n_folds,*args):
    dataset_split=k_fold_split(dataset,n_folds)
    scores = list()
    for fold in dataset_split:
        train =copy.deepcopy(dataset_split)
        train.remove(fold) # hold out the current fold
        train=sum(train,[]) # flatten the remaining folds into one training list
        test= copy.deepcopy(fold) # deep copy: prediction consumes columns of each row
        predicted=algo(train,test,*args)
        actual =[row[-1] for row in fold]
        accuracy = accuracy_model(actual,predicted)
        scores.append(accuracy)
    return scores
7.判断条件
先判断是否是字符串,然后选择判断条件,符合条件的数据放在左边的列表,不符合的放在右边的列表。
#7.先判断是否是字符串,然后选择判断条件
#符合条件的数据放在左边的列表,不符合的放在右边的列表
def split_group(index,value,dataset):
left, right =list(), list()
for row in dataset:
if isinstance(row[index],str):
if row[index] == value:
left.append(row)
else:
right.append(row)
else:
if row[index] < value:
left.append(row)
else:
right.append(row)
groups=(left, right)
return groups
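A quick check of split_group on a toy dataset (assuming the function above is defined): string columns use the '==' test, numeric columns the '<' test.
# split_group on toy rows: column 0 is a string, column 1 is numeric
rows = [['male', 22.0, '0'], ['female', 38.0, '1'], ['female', 26.0, '1'], ['male', 35.0, '0']]
left, right = split_group(0, 'male', rows)  # '==' test on a string column
print(len(left), len(right))                # 2 2
left, right = split_group(1, 30.0, rows)    # '<' test on a numeric column
print(len(left), len(right))                # 2 2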
8. Gini index of a pair of groups
Used when searching for the split with the smallest Gini index.
#8. Compute the Gini index of a pair of groups (used when searching for the smallest-Gini split)
def gini_index_groups(groups,class_value):
    n_instances = float(sum([len(group) for group in groups])) # total number of rows across both groups
    # weighted sum of each group's Gini
    gini = 0.0
for group in groups:
size = float(len(group))
if size == 0 :
continue
score = 0.0
for value in class_value:
p = [row[-1] for row in group].count(value)/size
score += p**2
gini +=(1 - score)*(size/n_instances)
return gini
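A worked example with toy labels (assuming gini_index_groups above is defined): a perfect split scores 0.0 and a useless fifty-fifty split scores 0.5.
# Worked example: a perfect split scores 0.0, a useless one 0.5
groups_perfect = ([['a', '0'], ['b', '0']], [['c', '1'], ['d', '1']])
groups_useless = ([['a', '0'], ['b', '1']], [['c', '0'], ['d', '1']])
print(gini_index_groups(groups_perfect, ['0', '1']))  # 0.0
print(gini_index_groups(groups_useless, ['0', '1']))  # 0.5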
9. Gini index of a single group
Used to decide whether a branch has been split all the way down.
#9. Compute the Gini index of a single group, used to decide whether a branch is fully split
def gini_index_group(group):
    size = float(len(group))
    if size == 0 :
        return # an empty group yields None
    class_value=list(set([row[-1] for row in group]))
    gini = 0.0
    score = 0.0
    for value in class_value:
        p = [row[-1] for row in group].count(value)/size
        score += p**2
    gini += 1- score
    return 'T' if gini == 0.0 else gini # 'T' marks a pure group; returning 0.0 would be falsy and read as "empty"
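Why the string 'T' rather than 0.0: split() below tests `if not gini_left`, and 0.0 is falsy in Python, so a pure group needs a truthy marker. Assuming the function above is defined:
# A pure group returns 'T' (truthy), a mixed group its Gini, an empty group None
print(gini_index_group([['a', '1'], ['b', '1']]))  # T
print(gini_index_group([['a', '0'], ['b', '1']]))  # 0.5
print(gini_index_group([]))                        # None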
10. Finding the smallest Gini index and the corresponding node
#10. Find the smallest Gini index and obtain the corresponding node
def get_node(dataset):
    if len(dataset[0])==1: # only the label column is left; nothing to split on
        return
    class_value=list(set([row[-1] for row in dataset]))
    posi_index, posi_value, posi_gini, posi_groups =888, 888, 888, None # 888 is a safe sentinel: the Gini index never exceeds 1
for index in range(len(dataset[0])-1):
for row in dataset:
value = row[index]
groups = split_group(index,value,dataset)
gini = gini_index_groups(groups,class_value)
if gini < posi_gini :
posi_index, posi_value, posi_gini, posi_groups = index , value, gini, groups
node = {'index':posi_index, 'value':posi_value, 'groups':posi_groups}
return node
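A tiny demo (assuming the functions above are defined): on a toy dataset where sex perfectly separates the classes, the chosen node splits on column 0.
# get_node on a toy dataset: splitting on sex (column 0) separates the classes perfectly
toy = [['male', 22.0, '0'], ['male', 35.0, '0'], ['female', 38.0, '1'], ['female', 26.0, '1']]
node = get_node(toy)
print(node['index'], node['value'])  # 0 male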
11. Setting the value of a terminal node
#11. Set the value at the end of a branch: the majority class of the group
def determined_terminal_value(group):
    outcomes=[row[-1] for row in group]
    terminal_value = max(set(outcomes), key =outcomes.count) # majority vote
    return terminal_value
12. Recursive splitting of the tree
#12. Recursively split the tree
def split(node,max_depth,min_size,depth):
    if not node: # nothing to split (get_node returned None)
        return
    left, right =node['groups']
    del node['groups'] # the groups have been consumed and are no longer needed
    for row_l in left:
        del row_l[node['index']] # remove the already-used column from every row
    for row_r in right:
        del row_r[node['index']] # remove the already-used column from every row
    gini_left = gini_index_group(left)
    gini_right = gini_index_group(right)
    if not gini_left: # the left group is empty; use the other side's majority class so the key is never missing
        node['left'] = determined_terminal_value(right)
    elif gini_left == 'T': # the left group is pure (Gini == 0)
        node['left'] = determined_terminal_value(left)
    elif depth == max_depth: # the maximum depth has been reached
        node['left'] = determined_terminal_value(left)
    elif len(left) <= min_size: # the group is at or below the minimum size
        node['left'] = determined_terminal_value(left)
    else:
        node['left'] = get_node(left) # keep splitting
        split(node['left'],max_depth,min_size,depth+1)
        if not node['left']: # get_node returned None: no feature columns remain
            node['left'] = determined_terminal_value(left)
    if not gini_right: # the right group is empty; use the other side's majority class
        node['right'] = determined_terminal_value(left)
    elif gini_right == 'T':
        node['right'] = determined_terminal_value(right)
    elif depth == max_depth:
        node['right'] = determined_terminal_value(right)
    elif len(right) <= min_size:
        node['right'] = determined_terminal_value(right)
    else:
        node['right'] = get_node(right)
        split(node['right'],max_depth,min_size,depth+1)
        if not node['right']:
            node['right'] = determined_terminal_value(right)
13. Building the tree
#13. Build the decision tree
def build_tree(train,max_depth,min_size):
node = get_node(train)
split(node,max_depth,min_size,1)
return node
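To inspect the learned tree, a small helper can be handy. This print_tree sketch is not part of the original post; it just walks the nested dict produced by build_tree:
# Hypothetical helper (not in the original code): pretty-print the nested dict tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[column %s, split value %s]' % ('  ' * depth, node['index'], node['value']))
        print_tree(node.get('left'), depth + 1)
        print_tree(node.get('right'), depth + 1)
    else:
        print('%s-> %s' % ('  ' * depth, node))
Calling it with a small max_depth, e.g. print_tree(build_tree(dataset, 3, 20)), keeps the output readable.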
14. Predicting a single row
#14. Predict the class of a single row
def predict_one_row(row,model):
    # choose the branch: '==' for string columns, '<' for numeric columns
    if isinstance(row[model['index']],str):
        go_left = row[model['index']] == model['value']
    else:
        go_left = row[model['index']] < model['value']
    predict = model['left'] if go_left else model['right']
    del row[model['index']] # consume the column, mirroring how split() shrinks the training rows
    if isinstance(predict,dict): # an inner node: keep descending
        return predict_one_row(row,predict)
    return predict # a terminal value: this is the prediction
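Note that predict_one_row deletes each column it consumes, so a given row can be fed through the model only once. If the row is needed afterwards, pass a shallow copy (here `model` is assumed to come from build_tree above, and the row values are toy data):
# Predict on a copy so the original row survives (hypothetical usage)
row = ['male', 22.0, 1, 0, 7.25, 'S']
prediction = predict_one_row(list(row), model)  # list(row) is a shallow copy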
15. Training and testing on the full data
#15. Train on the training folds and predict the test fold
def predicted(train,test,max_depth,min_size):
predicted=list()
model = build_tree(train,max_depth,min_size)
for row in test:
predict=predict_one_row(row,model)
predicted.append(predict)
return predicted
16. Predicting the target data
#16. Predict the target data
def predict_target(dataset,target,max_depth,min_size):
predictions=list()
model = build_tree(dataset,max_depth,min_size)
for row in target:
predict=predict_one_row(row,model)
predictions.append(predict)
return predictions
17. Running the model and tuning the parameters
# Load the training data for the model
file='./download_datas/predict_servival_on_Titanic/train_2.csv'
dataset=csv_loader(file)
for row in dataset: # PassengerId is not a feature; remove it from every row (header included)
    del row[0]
str_to_float_converter(dataset)
dataset=dataset[1:] # drop the header row
# The same parameters are used for the model test and for the target prediction
seed(3)
max_depth=100
min_size=20
n_folds=3
# Model testing
algo=predicted
scores=test_model(dataset,algo,n_folds,max_depth,min_size)
print('The scores of our model are : %s' % scores)
print('The average score of our model is : %.3f%%' % (sum(scores)/float(len(scores))))
Model test results
#Output
The scores of our model are : [81.14478114478115, 80.8080808080808, 78.78787878787878]
The average score of our model is : 80.247%
Predicting the target data
Predict on the test set provided by the platform.
# Load the target (test) data
file_target='./download_datas/predict_servival_on_Titanic/test_2.csv'
target=csv_loader(file_target)
for row in target:
del row[0]
str_to_float_converter(target)
target=target[1:]
# Run the prediction model
predictions=predict_target(dataset,target,max_depth,min_size)
df_sub=pd.read_csv('./download_datas/predict_servival_on_Titanic/test_2.csv')
df_sub['Survived']= predictions # the predictions go in the last column
df_sub.drop(['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'],axis=1,inplace=True) # drop the feature columns, keeping only PassengerId and Survived
df_sub.set_index('PassengerId',inplace=True)
df_sub.to_csv('./download_datas/predict_servival_on_Titanic/gender_submission.csv') # save the CSV file to submit
After submitting the file, the accuracy was 76.076%.
IV. Complete Code
#1. Import basic libraries
from random import seed
from random import randrange
from csv import reader
import copy
import pandas as pd
#2. Data preprocessing
# Preprocessing the training data
train_file='./download_datas/predict_servival_on_Titanic/train.csv'
df_train=pd.read_csv(train_file,index_col='PassengerId')
df_train['Survived_col']=df_train['Survived'] # move the label to the last column for the model's convenience
df_train.drop(['Name','Cabin','Survived','Ticket'],axis=1,inplace=True) # drop unused columns
df_train.ffill(inplace=True) # forward-fill missing values
df_train.to_csv('./download_datas/predict_servival_on_Titanic/train_2.csv') # save the file
# Preprocessing the test data
test_file='./download_datas/predict_servival_on_Titanic/test.csv'
df_test=pd.read_csv(test_file,index_col='PassengerId')
df_test['Survived']=None # placeholder for the predictions
df_test.drop(['Name','Cabin','Ticket'],axis=1,inplace=True) # drop unused columns
df_test.ffill(inplace=True) # forward-fill missing values
df_test.to_csv('./download_datas/predict_servival_on_Titanic/test_2.csv') # save the file
#3. Load the CSV file and convert data types
def csv_loader(file):
dataset=list()
with open(file,'r') as f:
csv_reader=reader(f)
for row in csv_reader:
if not row:
continue
dataset.append(row)
return dataset
# Later split conditions choose between '<' and '== / !=', so only Age and Fare are converted to float
def str_to_float_converter(dataset):
    dataset=dataset[1:] # skip the header row; the row lists themselves are mutated in place
    for row in dataset:
        row[2]=float(row[2].strip()) # Age (after PassengerId has been removed)
        row[5]=float(row[5].strip()) # Fare
#4. Split the data for k-fold cross-validation
def k_fold_split(dataset,n_folds):
    dataset_split=list()
    folds_size=int(len(dataset)/n_folds) # integer division: any leftover rows are dropped
    dataset_copy=list(dataset)
    for _ in range(n_folds): # the loop index is unused
        fold=list()
        while len(fold) < folds_size:
            index= randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
#5. Compute accuracy
def accuracy_model(actual,predicted):
correct=0
for i in range(len(actual)):
if actual[i]== predicted[i]:
correct +=1
accuracy = correct/float(len(actual)) * 100.0
return accuracy
#6. Evaluate the model's accuracy with k-fold cross-validation
def test_model(dataset,algo,n_folds,*args):
    dataset_split=k_fold_split(dataset,n_folds)
    scores = list()
    for fold in dataset_split:
        train =copy.deepcopy(dataset_split)
        train.remove(fold) # hold out the current fold
        train=sum(train,[]) # flatten the remaining folds into one training list
        test= copy.deepcopy(fold) # deep copy: prediction consumes columns of each row
        predicted=algo(train,test,*args)
        actual =[row[-1] for row in fold]
        accuracy = accuracy_model(actual,predicted)
        scores.append(accuracy)
    return scores
#7. First check whether the value is a string, then choose the comparison
# Rows that satisfy the condition go to the left list, the rest to the right list
def split_group(index,value,dataset):
left, right =list(), list()
for row in dataset:
if isinstance(row[index],str):
if row[index] == value:
left.append(row)
else:
right.append(row)
else:
if row[index] < value:
left.append(row)
else:
right.append(row)
groups=(left, right)
return groups
#8. Compute the Gini index of a pair of groups (used when searching for the smallest-Gini split)
def gini_index_groups(groups,class_value):
    n_instances = float(sum([len(group) for group in groups])) # total number of rows across both groups
    # weighted sum of each group's Gini
    gini = 0.0
for group in groups:
size = float(len(group))
if size == 0 :
continue
score = 0.0
for value in class_value:
p = [row[-1] for row in group].count(value)/size
score += p**2
gini +=(1 - score)*(size/n_instances)
return gini
#9. Compute the Gini index of a single group, used to decide whether a branch is fully split
def gini_index_group(group):
    size = float(len(group))
    if size == 0 :
        return # an empty group yields None
    class_value=list(set([row[-1] for row in group]))
    gini = 0.0
    score = 0.0
    for value in class_value:
        p = [row[-1] for row in group].count(value)/size
        score += p**2
    gini += 1- score
    return 'T' if gini == 0.0 else gini # 'T' marks a pure group; returning 0.0 would be falsy and read as "empty"
#10. Find the smallest Gini index and obtain the corresponding node
def get_node(dataset):
    if len(dataset[0])==1: # only the label column is left; nothing to split on
        return
    class_value=list(set([row[-1] for row in dataset]))
    posi_index, posi_value, posi_gini, posi_groups =888, 888, 888, None # 888 is a safe sentinel: the Gini index never exceeds 1
for index in range(len(dataset[0])-1):
for row in dataset:
value = row[index]
groups = split_group(index,value,dataset)
gini = gini_index_groups(groups,class_value)
if gini < posi_gini :
posi_index, posi_value, posi_gini, posi_groups = index , value, gini, groups
node = {'index':posi_index, 'value':posi_value, 'groups':posi_groups}
return node
#11. Set the value at the end of a branch: the majority class of the group
def determined_terminal_value(group):
    outcomes=[row[-1] for row in group]
    terminal_value = max(set(outcomes), key =outcomes.count) # majority vote
    return terminal_value
#12. Recursively split the tree
def split(node,max_depth,min_size,depth):
    if not node: # nothing to split (get_node returned None)
        return
    left, right =node['groups']
    del node['groups'] # the groups have been consumed and are no longer needed
    for row_l in left:
        del row_l[node['index']] # remove the already-used column from every row
    for row_r in right:
        del row_r[node['index']] # remove the already-used column from every row
    gini_left = gini_index_group(left)
    gini_right = gini_index_group(right)
    if not gini_left: # the left group is empty; use the other side's majority class so the key is never missing
        node['left'] = determined_terminal_value(right)
    elif gini_left == 'T': # the left group is pure (Gini == 0)
        node['left'] = determined_terminal_value(left)
    elif depth == max_depth: # the maximum depth has been reached
        node['left'] = determined_terminal_value(left)
    elif len(left) <= min_size: # the group is at or below the minimum size
        node['left'] = determined_terminal_value(left)
    else:
        node['left'] = get_node(left) # keep splitting
        split(node['left'],max_depth,min_size,depth+1)
        if not node['left']: # get_node returned None: no feature columns remain
            node['left'] = determined_terminal_value(left)
    if not gini_right: # the right group is empty; use the other side's majority class
        node['right'] = determined_terminal_value(left)
    elif gini_right == 'T':
        node['right'] = determined_terminal_value(right)
    elif depth == max_depth:
        node['right'] = determined_terminal_value(right)
    elif len(right) <= min_size:
        node['right'] = determined_terminal_value(right)
    else:
        node['right'] = get_node(right)
        split(node['right'],max_depth,min_size,depth+1)
        if not node['right']:
            node['right'] = determined_terminal_value(right)
#13. Build the decision tree
def build_tree(train,max_depth,min_size):
node = get_node(train)
split(node,max_depth,min_size,1)
return node
#14. Predict the class of a single row
def predict_one_row(row,model):
    # choose the branch: '==' for string columns, '<' for numeric columns
    if isinstance(row[model['index']],str):
        go_left = row[model['index']] == model['value']
    else:
        go_left = row[model['index']] < model['value']
    predict = model['left'] if go_left else model['right']
    del row[model['index']] # consume the column, mirroring how split() shrinks the training rows
    if isinstance(predict,dict): # an inner node: keep descending
        return predict_one_row(row,predict)
    return predict # a terminal value: this is the prediction
#15. Train on the training folds and predict the test fold
def predicted(train,test,max_depth,min_size):
predicted=list()
model = build_tree(train,max_depth,min_size)
for row in test:
predict=predict_one_row(row,model)
predicted.append(predict)
return predicted
#16. Predict the target data
def predict_target(dataset,target,max_depth,min_size):
predictions=list()
model = build_tree(dataset,max_depth,min_size)
for row in target:
predict=predict_one_row(row,model)
predictions.append(predict)
return predictions
# Load the training data for the model
file='./download_datas/predict_servival_on_Titanic/train_2.csv'
dataset=csv_loader(file)
for row in dataset: # PassengerId is not a feature; remove it from every row (header included)
    del row[0]
str_to_float_converter(dataset)
dataset=dataset[1:] # drop the header row
# The same parameters are used for the model test and for the target prediction
seed(3)
max_depth=100
min_size=20
n_folds=3
# Model testing
algo=predicted
scores=test_model(dataset,algo,n_folds,max_depth,min_size)
print('The scores of our model are : %s' % scores)
print('The average score of our model is : %.3f%%' % (sum(scores)/float(len(scores))))
print('------------------------------------------------------')
# Load the target (test) data
file_target='./download_datas/predict_servival_on_Titanic/test_2.csv'
target=csv_loader(file_target)
for row in target:
del row[0]
str_to_float_converter(target)
target=target[1:]
# Run the prediction model
predictions=predict_target(dataset,target,max_depth,min_size)
df_sub=pd.read_csv('./download_datas/predict_servival_on_Titanic/test_2.csv')
df_sub['Survived']= predictions # the predictions go in the last column
df_sub.drop(['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'],axis=1,inplace=True) # drop the feature columns, keeping only PassengerId and Survived
df_sub.set_index('PassengerId',inplace=True)
df_sub.to_csv('./download_datas/predict_servival_on_Titanic/gender_submission.csv') # save the CSV file to submit