import pandas as pd
import graphviz
# 1. Load the data set from the Excel workbook and echo it.
data = pd.read_excel(r'tietan.xls')
print(data)
# 2. Keep only the columns used as model features: pclass, age, sex.
feature = data.loc[:, ['pclass', 'age', 'sex']]
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
feature.info()
# 3. Fill missing values: no exact age is available for those rows, so the
#    mean of the 'age' column is used instead.
feature = feature.fillna({'age': feature['age'].mean()})
# 4. Turn every row into a {column: value} dict; these records are what gets
#    one-hot encoded afterwards.
X = feature.to_dict(orient='records')
print(X)
# 5. Batch dummy-variable (one-hot) encoding of the record dicts.
from sklearn.feature_extraction import DictVectorizer
# sparse=False makes fit_transform return a dense NumPy array instead of a
# SciPy sparse matrix.
dic = DictVectorizer(sparse=False)
# X is a list of {feature: value} dicts, one per row; the result has one row
# per dict and one column per encoded feature.
x = dic.fit_transform(X)
# Header (feature name) of every column of x.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back for older releases.
# The result is kept as a plain list, matching what the old method returned.
if hasattr(dic, 'get_feature_names_out'):
    feature_name = list(dic.get_feature_names_out())
else:
    feature_name = dic.get_feature_names()
# Debug output:
# print(feature_name)
# print(x)
# 6. Split into a training set (80 %) and a test set (the remainder).
y = data['survived']
from sklearn.model_selection import train_test_split
# NOTE: no random_state is passed, so the split (and therefore the score
# printed below) can differ from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
# 7. Model training.
from sklearn.tree import DecisionTreeClassifier
# Instantiate: entropy as the split criterion, tree depth capped at 5.
dc = DecisionTreeClassifier(criterion='entropy', max_depth=5)
# Fit on the training portion.
dc.fit(x_train, y_train)
# Predict the test portion; the result is kept in y_pred so it can be
# inspected instead of being computed and thrown away.
y_pred = dc.predict(x_test)
# Accuracy of the fitted tree on the held-out test portion.
print(dc.score(x_test, y_test))
# 8. Export the fitted tree to a Graphviz DOT file.
from sklearn import tree
# Pass the encoded column names so nodes are labelled with feature names
# rather than positional X[i] placeholders.
tree.export_graphviz(dc, 'tree.dot', feature_names=feature_name)
# 9. Visualization.
# graphviz.Source(...) expects DOT source text, not a file name, so the
# exported file has to be loaded with Source.from_file().
a = graphviz.Source.from_file('tree.dot')
print(a)
# Classic case study: the Titanic.
# Page footer from the source article: "latest recommended article published on 2024-10-01 08:31:06".