编写的软件:jupyter lab
编程语言:python
代码展示
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
# Step 1: load the data
data = pd.read_csv(r"D:\a_file\titanic\train.csv")  # raw string keeps the backslashes literal
data.info()   # column dtypes and non-null counts
data.head()   # preview the first rows (5 by default)
# data.head(10)  # preview the first 10 rows instead
# Step 2: feature selection / cleaning
# Drop columns with little predictive value (axis=1 means columns; inplace edits `data` directly)
data.drop(['Cabin', "Name", "Ticket"], inplace=True, axis=1)
# Age has missing values: impute them with the column mean
data["Age"] = data["Age"].fillna(data["Age"].mean())
# Embarked is missing in only 2 rows, so dropping those rows is harmless
data = data.dropna()
# A decision tree needs numeric inputs, so encode the two text columns:
# Embarked -> the position of each value in its list of unique values
codes = data["Embarked"].unique().tolist()
data["Embarked"] = data["Embarked"].apply(lambda v: codes.index(v))
# Sex -> 1 for "male", 0 otherwise
data["Sex"] = (data["Sex"] == "male").astype("int")
# Step 3: build the decision tree
# Features are every column except the target. Extract y with .loc as a
# 1-D Series (the original boolean mask through .iloc produced a one-column
# DataFrame, which makes sklearn emit a DataConversionWarning on fit).
x = data.loc[:, data.columns != "Survived"]
y = data.loc[:, "Survived"]
# Pin random_state so the split is reproducible, consistent with the
# fixed random_state on the classifier below.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.3, random_state=25)
# train_test_split shuffles the rows, so the carried-over indices are
# scrambled; reset each part to a clean 0..n-1 index.
for part in (Xtrain, Xtest, Ytrain, Ytest):
    part.index = range(part.shape[0])
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)  # accuracy on the held-out 30%
# Step 4: plot training accuracy vs. 10-fold cross-validated accuracy
# for max_depth = 1..10 (entropy criterion).
tr = []
te = []
for depth in range(1, 11):
    clf = DecisionTreeClassifier(random_state=25, max_depth=depth, criterion="entropy")
    clf.fit(Xtrain, Ytrain)
    tr.append(clf.score(Xtrain, Ytrain))
    # "test" curve: mean 10-fold CV score over the full data set
    te.append(cross_val_score(clf, x, y, cv=10).mean())
print(max(te))
plt.plot(range(1, 11), tr, color="red", label="train")
plt.plot(range(1, 11), te, color="blue", label="test")
plt.xticks(range(1, 11))
plt.legend()
plt.show()
# Step 5: grid search — find the best combination over several
# hyper-parameters at once (10-fold CV per candidate).
parameters = {  # fixed typo: was "paramters"
    "criterion": ("gini", "entropy"),
    "splitter": ("best", "random"),
    "max_depth": [*range(1, 10)],
    "min_samples_leaf": [*range(1, 50, 5)],
    "min_impurity_decrease": [*np.linspace(0, 0.5, 20)],
}
clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10)
GS = GS.fit(Xtrain, Ytrain)
# Print explicitly: bare expressions are discarded when run as a script,
# and only the last one would display in a notebook cell.
print(GS.best_score_)
print(GS.best_params_)
数据未进行筛选之前
数据筛选之后
第四步,所画的图
第五步,最优参数组合和score