Machine Learning Code Lab 2

import pandas as pd
import sklearn.tree as tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import graphviz

# Load the data and display the first rows
titanic_data = pd.read_csv("train.csv")
print(titanic_data.head(5))

# Count missing values per column
print(titanic_data.isnull().sum())

# Clean the data
titanic_data.drop(["Name", "Ticket", "Cabin"], inplace=True, axis=1)  # Drop columns with too many missing values or no relationship to the target
titanic_data["Age"] = titanic_data["Age"].fillna(titanic_data["Age"].mean())  # Fill missing ages with the overall mean age
titanic_data = titanic_data.dropna()  # Drop the remaining rows that still contain missing values

# Explore the relationship between Sex and survival
print(titanic_data.groupby(['Sex', 'Survived'])['Survived'].count())  # Group by sex and survival outcome, then count
titanic_data[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar()
plt.xlabel('Sex')
plt.ylabel('Survival Rate')
plt.xticks([0, 1], ['Female', 'Male'], rotation=0)
plt.show()

# Explore the relationship between Pclass (passenger class) and survival
titanic_data[['Pclass', 'Survived']].groupby(['Pclass']).mean().plot.bar()
plt.xlabel('Pclass')
plt.ylabel('Survival Rate')
plt.xticks(rotation=0)
plt.show()

# Explore the relationship between Age and survival
bins = [0, 12, 18, 65, 100]
titanic_data['Age_group'] = pd.cut(titanic_data['Age'], bins)
by_age = titanic_data.groupby('Age_group', observed=False)['Survived'].mean()
by_age.plot(kind='bar')
plt.xlabel('Age Group')
plt.ylabel('Survival Rate')
plt.xticks(rotation=45)
plt.show()

# Explore the relationship between the number of relatives aboard (Parch, SibSp) and survival
fig, ax = plt.subplots(1, 2, figsize=(13, 5))
titanic_data[['Parch', 'Survived']].groupby(['Parch']).mean().plot.bar(ax=ax[0])
ax[0].set_title('Parch and Survived')
titanic_data[['SibSp', 'Survived']].groupby(['SibSp']).mean().plot.bar(ax=ax[1])
ax[1].set_title('SibSp and Survived')
plt.show()

# Encode categorical features as integers
embarked_labels = titanic_data["Embarked"].unique().tolist()
titanic_data["Embarked"] = titanic_data["Embarked"].apply(lambda x: embarked_labels.index(x))  # Map each embarkation port to its index in embarked_labels
titanic_data["Sex"] = (titanic_data["Sex"] == "male").astype("int")  # male -> 1, female -> 0

# Build a decision tree model, train it, and evaluate it
features = titanic_data[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
labels = titanic_data["Survived"]
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=1)
classifier = tree.DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=25, max_depth=5)
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))  # Accuracy on the held-out test set
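
A single train/test split can give a noisy accuracy estimate; the sketch below cross-validates a tree with the same hyperparameters on the full feature/label frames. This is an illustrative addition, assuming cross_val_score from sklearn.model_selection:

from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of a tree with the same hyperparameters, for a more stable estimate
cv_scores = cross_val_score(DecisionTreeClassifier(criterion="entropy", random_state=25, max_depth=5),
                            features, labels, cv=10)
print(cv_scores.mean())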

from sklearn.model_selection import GridSearchCV
# Grid search over decision tree hyperparameters
gini_thresholds = np.linspace(0, 0.5, 20)
parameters = {
    "max_depth": [*range(1, 10)],
    "min_samples_leaf": [*range(1, 50, 5)],
    "min_impurity_decrease": [*gini_thresholds]
}
model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, parameters, cv=10)
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
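
GridSearchCV refits the best parameter combination on the training data by default (refit=True), so the tuned tree can also be scored on the held-out test set; a minimal sketch:

# Evaluate the refitted best estimator on the held-out test set
best_model = grid_search.best_estimator_
print(best_model.score(X_test, y_test))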

# Visualize the decision tree
feature_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
dot_data = tree.export_graphviz(classifier, out_file=None, feature_names=feature_names, class_names=["0", "1"],
                                filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph.render('decision_tree')  # Render the tree to a file (PDF by default)
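
Besides the rendered tree, the fitted classifier exposes impurity-based feature importances; a minimal sketch plotting them against the same feature_names:

# Bar chart of the trained tree's feature importances
plt.bar(feature_names, classifier.feature_importances_)
plt.ylabel('Feature Importance')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()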
