决策树
信息熵
决策树的划分依据之一 - 信息增益
sklearn决策树API
举例:
泰坦尼克号乘客生存分类模型:
决策树的结构 本地保存
用工具将.dot转换为.png
命令运行: dot -Tpng tree.dot -o tree.png
优缺点:
用随机森林改进
集成学习方法
随机森林概念:
举例:
算法说明:
集成学习API
随机森林的优点:
code(包含决策树和随机森林):
def decision():
    """Predict Titanic passenger survival with a grid-searched random forest.

    Downloads the Titanic passenger table, uses the ``pclass``, ``age`` and
    ``sex`` columns as features and ``survived`` as the target, fills missing
    ages with the mean age, one-hot encodes the categorical features with
    ``DictVectorizer`` and tunes a ``RandomForestClassifier`` through
    ``GridSearchCV``.  The feature table, the encoded feature names, the
    test-set score and the selected hyper-parameters are printed.

    Returns:
        None.  All results are reported on standard output.
    """
    # Load the data set.
    titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Select the feature columns and the target column.  The explicit copy
    # makes ``x`` an independent frame, so the imputation below really
    # modifies it instead of acting on a temporary view of ``titan``.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']
    print(x)  # feature values

    # Missing-value handling: replace missing ages with the mean age.
    # Assigning the result back (rather than ``inplace=True`` on the selected
    # column) avoids chained assignment, which is silently ineffective under
    # pandas Copy-on-Write.
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split the data into a training set and a test set.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))

    # ``get_feature_names`` was removed from recent scikit-learn releases;
    # prefer ``get_feature_names_out`` and fall back for old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)

    # The test set is encoded with the vocabulary learned on the training set.
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))
    # print(x_train)

    # Alternative kept for reference: predict with a single decision tree.
    # dec = DecisionTreeClassifier()
    # dec.fit(x_train,y_train)
    #
    # # Prediction accuracy
    # print('预测的准确率:',dec.score(x_test,y_test))
    # # Export the structure of the tree
    # export_graphviz(dec,out_file="/Users/guosihan/Desktop/tree.dot",feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male'])

    # Predict with a random forest (hyper-parameter tuning).
    rf = RandomForestClassifier()
    param = {"n_estimators": [120, 200, 300, 500, 800, 1200], "max_depth": [5, 8, 15, 25, 30]}

    # Grid search with 2-fold cross-validation over the grid above.
    gc = GridSearchCV(rf, param_grid=param, cv=2)
    gc.fit(x_train, y_train)

    print("准确率: ", gc.score(x_test, y_test))
    # Original author's note: the search can end up selecting "5 120"
    # (presumably max_depth=5, n_estimators=120).
    print("查看选择的参数模型:", gc.best_params_)
if __name__ == "__main__":
    # Script entry point: only decision() is executed.
    # The calls below are left disabled, as in the original script:
    # knncls()
    # naviebayes()
    decision()