from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
# 从kaggle获取数据后,查看所有特征。这里只是简单的学习决策树,所以粗略地认为存活率与 Pclass, Sex, Age 相关
df = pd.read_csv("D:/data/titanic/train.csv")
# print(df.head(1))
# print(df.info())
# Prepare the data: keep only the three features we assume matter.
# Take an explicit copy so the fillna below modifies our own frame,
# not a view of df (avoids chained-assignment / SettingWithCopyWarning).
x = df.loc[:, ["Pclass", "Sex", "Age"]].copy()
y = df.loc[:, "Survived"]
# Replace missing ages with the column mean.
# NOTE: assign the result back instead of `inplace=True` on a sliced
# column — inplace fillna on a slice is deprecated and unreliable in pandas 2.x.
x["Age"] = x["Age"].fillna(x["Age"].mean())
# Split into train/test sets; fix random_state so the reported accuracy is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
# One-hot encode categorical features. DictVectorizer consumes dicts,
# so convert each row to a {column: value} record first.
dic = DictVectorizer()
x_train = dic.fit_transform(x_train.to_dict(orient="records"))
x_test = dic.transform(x_test.to_dict(orient="records"))
# Train the decision tree and report accuracy on the held-out set.
dec = DecisionTreeClassifier(criterion="entropy")
dec.fit(x_train, y_train)
print(dec.score(x_test, y_test))
# 最后得到的准确率即上面 score 的输出值(具体数值随 train_test_split 的随机划分而略有不同)