python创建决策树_python 决策树建立泰坦尼克号

最新推荐文章于 2023-02-28 23:50:18 发布

weixin_39750854

最新推荐文章于 2023-02-28 23:50:18 发布

阅读量197

点赞数

文章标签： python创建决策树

### 泰坦尼克号海难生存人员预测

# 导入需要的库

import pandas as pd

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_predict

import matplotlib.pyplot as plt

# Load the Titanic training set; the first CSV column becomes the row index.
data = pd.read_csv(r'file:///E:/学习/python/机器学习课件 9.20-9.21/titanic/train.csv', index_col=0)

# Inspect the basic characteristics of the data set.
data.head()
data.info()  # info is a method; it has to be called to print the column summary
data.shape

# Preprocessing that, per the original author, does not involve any
# interaction between the training and test sets.
# Drop the columns with too many missing values and those judged to be
# unrelated to the prediction target.
data.drop(["Cabin", "Name", "Ticket"], inplace=True, axis=1)
data.info()
data.head()

# Fill the missing ages with the mean age.
data.loc[:, 'Age'] = data.loc[:, 'Age'].fillna(data.loc[:, 'Age'].mean())

# Only a few rows still contain missing values, so drop those rows outright.
data = data.dropna()
data.info()

# Convert the categorical columns to integer codes.
#
# Alternative for the two-valued column kept from the original for reference
# (astype converts the boolean comparison into 0/1):
#   data['Sex'] = (data['Sex'] == 'male').astype("int")
#
# Each distinct value is replaced by its position in the list of unique
# values (order of first appearance), which discretises the column:
# 0/1 for Sex and 0/1/2 for the three-valued Embarked column.
for column in ("Sex", "Embarked"):
    labels = data[column].unique().tolist()
    code_of = {label: code for code, label in enumerate(labels)}
    data[column] = data[column].map(code_of)

# Separate the label from the feature matrix, then split into train/test sets.
# Column 0 is taken as the prediction target and every remaining column as a
# feature, so the target is no longer one of the feature columns.
# NOTE(review): assumes the standard Kaggle layout, in which "Survived" is the
# first column once PassengerId has become the index — confirm against the CSV.
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# Import the splitting helper.
from sklearn.model_selection import train_test_split

# The order X_train, X_test, y_train, y_test must match the order returned
# by train_test_split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=420)

# The split keeps the original row labels; renumber each part from 0.
for part in [X_train, y_train, X_test, y_test]:
    part.index = range(part.shape[0])

# Alternative kept from the original for reference: impute the missing ages
# separately for each split, using that split's own mean.
#   X_train.loc[:, 'Age'] = X_train.Age.fillna(X_train.loc[:, 'Age'].mean())
#   X_test.loc[:, 'Age'] = X_test.Age.fillna(X_test.loc[:, 'Age'].mean())

# Build the prediction model: instantiate it, then fit it on the training data.
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(X_train, y_train)

# Accuracy of the fitted model on the training set and on the test set.
clf.score(X_train, y_train)
clf.score(X_test, y_test)

clf.feature_importances_

# Pair every feature name with its importance. The names come from the
# feature matrix the model was fitted on, so they line up with the importances
# (data.columns would also contain the label column and shift every name).
[*zip(X_train.columns, clf.feature_importances_)]

from sklearn.model_selection import cross_val_score

# Variance of the 10-fold cross-validation scores on the training split.
var = cross_val_score(clf, X_train, y_train, cv=10).var()
var

# Vary the tree depth and observe how well the model fits.
tr = []    # accuracy on the training split, one entry per depth
te = []    # mean 10-fold cross-validation accuracy on the training split
test = []  # accuracy on the held-out test split

# Try tree depths 1 through 9.
for i in range(1, 10):
    clf = DecisionTreeClassifier(max_depth=i, random_state=666)
    clf = clf.fit(X_train, y_train)
    score_train = clf.score(X_train, y_train)
    score_te = cross_val_score(clf, X_train, y_train, cv=10).mean()
    tr.append(score_train)
    te.append(score_te)
    # Accuracy of the model on the test set.
    score_test = clf.score(X_test, y_test)
    test.append(score_test)

print(len(test))
print(max(te))
len(tr)

# Switch the plotting style, then draw the three accuracy curves against the
# tree depths 1..9 on a shared axis.
plt.style.use("ggplot")

depth_axis = range(1, 10)
curves = (
    (tr, "red", "train"),
    (te, "blue", "cross_val_score"),
    (test, "green", "test"),
)
for scores, line_color, legend_name in curves:
    plt.plot(depth_axis, scores, color=line_color, label=legend_name)

plt.xticks(depth_axis)
plt.legend()
plt.show()

# Tune the hyper-parameters with a grid search.
from sklearn.model_selection import GridSearchCV
import numpy as np

# 20 evenly spaced candidate thresholds between 0 and 0.5, used below as the
# candidates for min_impurity_decrease.
gini_thresholds = np.linspace(0, 0.5, 20)
gini_thresholds

# Dictionary holding every parameter to search and its candidate values.
parameters = {
    'splitter': ('best', 'random'),
    'criterion': ("gini", "entropy"),
    "max_depth": range(1, 10),
    'min_samples_leaf': range(1, 50, 5),
    'min_impurity_decrease': gini_thresholds,
}

# Instantiate the model without the parameters that are being searched.
clf = DecisionTreeClassifier(random_state=25)

# Instantiate the grid search with 5-fold cross-validation.
GS = GridSearchCV(clf, parameters, cv=5, verbose=1)

# Run the search on the training split.
GS.fit(X_train, y_train)

# best_params_ holds the best parameter combination that was found.
GS.best_params_

# best_score_ holds the score achieved by that combination.
GS.best_score_

weixin_39750854

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python创建决策树_python 决策树建立泰坦尼克号

### 泰坦尼克号海难生存人员预测# 导入需要的库import pandas as pdfrom sklearn.tree import DecisionTreeClassifierfrom sklearn.model_selection import train_test_splitfrom sklearn.model_selection import GridSearchCVfrom skl...
复制链接

扫一扫