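# Data: Titanic passenger records; the task is to predict whether a passenger survived.
# A classification decision tree is built with sklearn.tree.DecisionTreeClassifier.
# GridSearchCV is used to select the best combination of hyper-parameters.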
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
def read_data(file_path):
data = pd.read_csv(file_path, index_col=0)
print(type(data))
print(data.shape)
print(data.head())
print(data.index)
print(data.columns)
# Drop the unused columns. inplace=True modifies `data` itself; without it, drop() returns a new DataFrame and leaves `data` unchanged.
data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
#性别转换为0 1
data['Sex'] = (data['Sex'] == 'male').astype('int')
# 处理登船港口数据 'S', 'C', 'Q', nan 分别对应 0 1 2 3
labels = data['Embarked'].unique().tolist()#['S', 'C', 'Q', nan]
data['Embarked'] = data['Embarked'].apply(lambda n: labels.index(n))