![a61e61bd2d98c82abe0fda092a31e0c5.png](https://img-blog.csdnimg.cn/img_convert/a61e61bd2d98c82abe0fda092a31e0c5.png)
matplotlib圆饼图、直方图、箱型图、柱状图, seaborn柱状图、小提琴图,常用函数:
# Pie chart of overall survival.
# value_counts() orders by frequency, so sort by the class value instead:
# index 0 (not survived) then 1 (survived). That keeps each count aligned
# with the hard-coded labels below whichever outcome is more common.
cad_draw1 = train_data['Survived'].value_counts().sort_index()
plt.axis('equal')  # equal axis scaling so the pie is a circle
plt.pie(
    cad_draw1,
    explode=[0.1, 0],      # offset the first wedge from the centre
    labels=['死亡', '生存'],
    colors=['r', 'g'],
    autopct='%.2f%%',      # percentage text with two decimals
    pctdistance=0.6,
    labeldistance=1.1,
    shadow=True,
    startangle=0,
    radius=1.5,
    frame=False,
)
n = 1  # sequence number embedded in the output file name
plt.savefig('存活圆饼图_%d.png' % n, dpi=200)
一,导入模块.读取数据
二,x=live占比dead占比 matplotlib圆饼图
三,x=性别, y=live占比 seaborn柱状图
四,x=客舱等级, y=live占比 seaborn柱状图
五,x=各客舱等级(性别), y=live占比 seaborn柱状图
六,x=年龄 matplotlib直方图,箱型图
七,x=年龄 y=live占比 seaborn柱状图
八,x=船舱等级(dead或live), y=年龄 seaborn小提琴图
x=性别 (dead或live), y=年龄 seaborn小提琴图
九,x=有同乘的兄弟姐妹、配偶live占比dead占比 matplotlib圆饼图
x=无同乘的兄弟姐妹、配偶live占比dead占比 matplotlib圆饼图
x=有同乘的父母/小孩 live占比dead占比 matplotlib圆饼图
x=无同乘的父母/小孩live占比dead占比 matplotlib圆饼图
十,x=同乘的兄弟姐妹/配偶数, y=live占比 matplotlib柱状图
x=同乘的父母/小孩数, y=live占比 matplotlib柱状图
x=同乘的兄弟姐妹/配偶/父母/小孩数,y=live占比 matplotlib柱状图
十一, x=票价 matplotlib直方图,箱型图
x=dead/live , y=平均票价 matplotlib 柱状图
十二, 利用KNN分类模型,对结果进行预测
一,导入模块.读取数据
# Standard library
import os
import time
import warnings

# Third-party: numerics, data frames and plotting
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress all warnings.
warnings.filterwarnings('ignore')

# Font configuration: use KaiTi for both sans-serif and serif text
# (presumably so Chinese labels render), and disable the Unicode minus
# so that '-' is not drawn as an empty box in saved figures.
mpl.rcParams.update({
    'font.sans-serif': ['KaiTi'],
    'font.serif': ['KaiTi'],
    'axes.unicode_minus': False,
})

# Load the data: change into the data directory, then read both CSV files.
os.chdir('C:/Users/Administrator/Desktop/train_S1111111111111/')
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)
![ca346ab63e725a614bef979935eadfd8.png](https://img-blog.csdnimg.cn/img_convert/ca346ab63e725a614bef979935eadfd8.png)
二,x=live占比dead占比 matplotlib圆饼图
# Plot appearance setup.
sns.set()
sns.set_style("ticks")
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese font: SimHei (Hei typeface)
plt.rcParams['axes.unicode_minus'] = False    # keep '-' from rendering as a box in saved figures
sns.set(font='SimHei')                        # make seaborn render Chinese text
# NOTE(review): sns.set(font=...) presumably re-applies seaborn's default
# theme and may override the "ticks" style selected above - confirm intent.

# Pie chart of overall survival.
# value_counts() orders by frequency, so sort by the class value instead:
# index 0 (not survived) then 1 (survived). That keeps each count aligned
# with the hard-coded labels below whichever outcome is more common.
cad_draw1 = train_data['Survived'].value_counts().sort_index()
plt.axis('equal')  # equal axis scaling so the pie is a circle
plt.pie(
    cad_draw1,
    explode=[0.1, 0],      # offset the first wedge from the centre
    labels=['死亡', '生存'],
    colors=['r', 'g'],
    autopct='%.2f%%',      # percentage text with two decimals
    pctdistance=0.6,
    labeldistance=1.1,
    shadow=True,
    startangle=0,
    radius=1.5,
    frame=False,
)
n = 1  # sequence number embedded in the output file name
plt.savefig('存活圆饼图_%d.png' % n, dpi=200)
![940e108d12d0f8a45d7c7d197dd6df7e.png](https://img-blog.csdnimg.cn/img_convert/940e108d12d0f8a45d7c7d197dd6df7e.png)
存活比例为38.38%
三,x=性别,y=live占比 Seaborn柱状图
# Survival of male and female passengers.
# The mean of the 0/1 Survived column per sex is that sex's survival rate.
cad_draw2 = train_data.groupby('Sex', as_index=False)['Survived'].mean()
sns.barplot(
    data=cad_draw2,
    x="Sex",
    y="Survived",
    order=['male', 'female'],  # which categories to draw, and in what order
    palette=['r', 'g'],
    saturation=1,              # colour saturation
    capsize=0.05,              # width of the error-bar caps
    errcolor='gray',           # error-bar colour
    errwidth=2,                # error-bar line width
    ci='sd',                   # error interval: a value in 0-100, 'sd', or None
)

# Passenger counts for every (sex, outcome) pair.
survive_sex = train_data.groupby(['Sex', 'Survived'])['Survived'].count()
print(survive_sex)

# Survival percentage per sex = survivors / all passengers of that sex.
female_rate = survive_sex.loc['female', 1] / survive_sex.loc['female'].sum() * 100
male_rate = survive_sex.loc['male', 1] / survive_sex.loc['male'].sum() * 100
print('女性存活率为%.2f%%,男性存活率为%.2f%%' % (female_rate, male_rate))
![1d2bcf03a3c8654c3df9b8754c90a53e.png](https://img-blog.csdnimg.cn/img_convert/1d2bcf03a3c8654c3df9b8754c90a53e.png)
![ae46fcfff5ccf812ae3e62eade6e3aec.png](https://img-blog.csdnimg.cn/img_convert/ae46fcfff5ccf812ae3e62eade6e3aec.png)
女性存活率为74.20%,男性存活率为18.89%
四,x=客舱等级,y=live占比 seaborn柱状图
# Survival rate per cabin class: mean of the 0/1 Survived column by Pclass.
cad_draw3 = train_data.groupby('Pclass', as_index=False)['Survived'].mean()
sns.barplot(
    data=cad_draw3,
    x="Pclass",
    y="Survived",
    palette='hls',
    saturation=1,      # colour saturation
    capsize=0.05,      # width of the error-bar caps
    errcolor='gray',   # error-bar colour
    errwidth=2,        # error-bar line width
    ci='sd',           # error interval: a value in 0-100, 'sd', or None
)

# Passenger counts for every (class, outcome) pair.
survive_Pclass = train_data.groupby(['Pclass', 'Survived'])['Survived'].count()
print(survive_Pclass)

# Survival percentage per class = survivors / all passengers of that class.
class_rates = tuple(
    survive_Pclass.loc[level, 1] / survive_Pclass.loc[level].sum() * 100
    for level in (1, 2, 3)
)
print('一等客舱存活率为%.2f%%,二等存活率为%.2f%%,三等存活率为%.2f%%' % class_rates)
![f952ca7f57262e1d1233987614a85895.png](https://img-blog.csdnimg.cn/img_convert/f952ca7f57262e1d1233987614a85895.png)
![3577d0c7a3ec090d6419695727cd2008.png](https://img-blog.csdnimg.cn/img_convert/3577d0c7a3ec090d6419695727cd2008.png)
一等客舱存活率为62.96%,二等存活率为47.28%,三等存活率为24.24%
五,x=各客舱等级(性别),y=live占比 seaborn柱状图
# Survival rate for every (sex, cabin class) combination.
# Select 'Survived' before aggregating: averaging every column fails on the
# text columns with recent pandas versions, and only this column is needed.
cad_draw5 = train_data.groupby(['Sex', 'Pclass'])['Survived'].mean().reset_index()

# Grouped bar chart: one group per cabin class, one bar per sex.
# The frame plotted is the one computed above (the original referenced an
# undefined name here).
sns.barplot(
    data=cad_draw5,
    x="Pclass",
    y="Survived",
    hue="Sex",
    palette=['g', 'r'],
    saturation=1,  # colour saturation
)
![74677224aaacfa464cf1f78fc15b1fa9.png](https://img-blog.csdnimg.cn/img_convert/74677224aaacfa464cf1f78fc15b1fa9.png)
六,x=年龄 matplotlib直方图,箱型图
# Keep only the rows whose Age is not missing.
has_age = train_data['Age'].notnull()
train_data_age = train_data[has_age]

plt.figure(figsize=(12, 5))

# Left panel: histogram of the known ages.
plt.subplot(1, 2, 1)
train_data_age['Age'].hist(bins=70)
plt.xlabel('Age')
plt.ylabel('Num')

# Right panel: box plot of Age with outliers hidden (drawn from the
# unfiltered frame, as in the original).
plt.subplot(1, 2, 2)
train_data.boxplot(column='Age', showfliers=False)

# Summary statistics of the known ages (shown when run interactively).
train_data_age['Age'].describe()
![bb5159f9e2605d45e4e0bebbde2cd5d3.png](https://img-blog.csdnimg.cn/img_convert/bb5159f9e2605d45e4e0bebbde2cd5d3.png)
![108f2607293add053ba46d499326a3fb.png](https://img-blog.csdnimg.cn/img_convert/108f2607293add053ba46d499326a3fb.png)
总体年龄分布: 去掉缺失值后样本有714,平均年龄约为30岁,标准差14岁,最小年龄0.42,最大年龄80
七,x=年龄,y=live占比 seaborn柱状图
# Add the age truncated to an integer. assign() builds a new frame instead
# of writing a column into a filtered slice of train_data, which avoids the
# chained-assignment (SettingWithCopy) pattern.
train_data_age = train_data_age.assign(Age_int=train_data_age['Age'].astype(int))

# Survival rate for each integer age: mean of the 0/1 Survived column.
average_age = (
    train_data_age[["Age_int", "Survived"]]
    .groupby(['Age_int'], as_index=False)
    .mean()
)

plt.figure(figsize=(18, 4))
sns.barplot(
    data=average_age,
    x="Age_int",
    y="Survived",
    palette='hls',
    capsize=0.05,  # width of the error-bar caps
    saturation=1,  # colour saturation
)
![349e3300f0cb543960400559742189a4.png](https://img-blog.csdnimg.cn/img_convert/349e3300f0cb543960400559742189a4.png)
灾难中,老人和小孩存活率较高
八,x=船舱等级(dead或live),y=年龄 seaborn小提琴图
x=性别 (dead或live),y=年龄 seaborn小提琴图
# Two side-by-side split violin plots of Age, each split by survival.
fig, ax = plt.subplots(1, 2, figsize=(18, 8))

# x / y are passed by keyword: newer seaborn releases accept only `data`
# positionally, and keywords also work on older releases.
# Left: age distribution per cabin class.
sns.violinplot(x="Pclass", y="Age", hue="Survived",
               data=train_data_age, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))  # y ticks every 10 years, 0-100

# Right: age distribution per sex.
sns.violinplot(x="Sex", y="Age", hue="Survived",
               data=train_data_age, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
![f482c2c85eb0a4a6e6ec9a9c6a87580a.png](https://img-blog.csdnimg.cn/img_convert/f482c2c85eb0a4a6e6ec9a9c6a87580a.png)
按照不同船舱等级划分 → 船舱等级越高,存活者年龄越大,船舱等级1存活年龄集中在20-40岁,船舱等级2/3中有较多低龄乘客存活
![e73c85356397ec33b34f06a35068504e.png](https://img-blog.csdnimg.cn/img_convert/e73c85356397ec33b34f06a35068504e.png)
按照性别划分 → 男性女性存活者年龄主要分布在20-40岁,且均有较多低龄乘客,其中女性存活更多
九,x=有同乘的兄弟姐妹、配偶live占比dead占比 matplotlib圆饼图
x=无同乘的兄弟姐妹、配偶live占比dead占比 matplotlib圆饼图
x=有同乘的父母/小孩 live占比dead占比 matplotlib圆饼图
x=无同乘的父母/小孩live占比dead占比 matplotlib圆饼图
# Split passengers by whether they travelled with siblings/spouses (SibSp).
sibsp_df = train_data[train_data['SibSp'] != 0]
no_sibsp_df = train_data[train_data['SibSp'] == 0]
# Split passengers by whether they travelled with parents/children (Parch).
parch_df = train_data[train_data['Parch'] != 0]
no_parch_df = train_data[train_data['Parch'] == 0]

# One pie per subset, drawn left to right: (subset, x-axis caption, colormap).
pie_specs = [
    (sibsp_df, 'sibsp', 'Blues'),
    (no_sibsp_df, 'no_sibsp', 'Blues'),
    (parch_df, 'parch', 'Reds'),
    (no_parch_df, 'no_parch', 'Reds'),
]

plt.figure(figsize=(12, 3))
for position, (subset, caption, cmap) in enumerate(pie_specs, start=1):
    plt.subplot(1, 4, position)
    plt.axis('equal')  # equal axis scaling so the pie is a circle
    # value_counts() orders by frequency, so a subset with more survivors
    # than deaths would get its labels swapped. Sorting by the class value
    # (0 = not survived, 1 = survived) keeps the fixed labels correct.
    outcome_counts = subset['Survived'].value_counts().sort_index()
    outcome_counts.plot.pie(
        labels=['No Survived', 'Survived'],
        autopct='%1.1f%%',
        colormap=cmap,
    )
    plt.xlabel(caption)
![a5fafb9b465079b5428c5686fc6bcb76.png](https://img-blog.csdnimg.cn/img_convert/a5fafb9b465079b5428c5686fc6bcb76.png)
有兄弟姐妹、父母子女的乘客存活率更大
十,x=同乘的兄弟姐妹/配偶数, y=live占比 matplotlib柱状图
x=同乘的父母/小孩数, y=live占比 matplotlib柱状图
x=同乘的兄弟姐妹/配偶/父母/小孩数,y=live占比 matplotlib柱状图
# Survival rate by number of parents/children (left) and by number of
# siblings/spouses (right).
fig, ax = plt.subplots(1, 2, figsize=(15, 4))

parch_rate = train_data.groupby('Parch')[['Survived']].mean()
parch_rate.plot.bar(ax=ax[0])
ax[0].set_title('Parch and Survived')

sibsp_rate = train_data.groupby('SibSp')[['Survived']].mean()
sibsp_rate.plot.bar(ax=ax[1])
ax[1].set_title('SibSp and Survived')

# Family size = parents/children + siblings/spouses + the passenger.
train_data['Family_Size'] = train_data['Parch'].add(train_data['SibSp']).add(1)

# Survival rate by family size, drawn in its own figure.
family_rate = train_data.groupby('Family_Size')[['Survived']].mean()
family_rate.plot.bar(figsize=(15, 4))

print('若独自一人,那么其存活率比较低;但是如果亲友太多的话,存活率也会很低')
![0bdd6fae2923c4b956cf2b3932019a0f.png](https://img-blog.csdnimg.cn/img_convert/0bdd6fae2923c4b956cf2b3932019a0f.png)
若独自一人,那么其存活率比较低;但是如果亲友太多的话,存活率也会很低
十一,x=票价 matplotlib 直方图,matplotlib箱型图
x=dead/live ,y=平均票价 matplotlib柱状图
# Fare distribution and its relation to survival.
fig, ax = plt.subplots(1, 2, figsize=(15, 4))
# Left: histogram of all fares. Right: fare box plot per cabin class,
# with outliers hidden.
train_data['Fare'].hist(bins=70, ax=ax[0])
train_data.boxplot(column='Fare', by='Pclass', showfliers=False, ax=ax[1])

# Fares of the passengers who did not survive / who survived.
died_mask = train_data['Survived'] == 0
lived_mask = train_data['Survived'] == 1
fare_not_survived = train_data.loc[died_mask, 'Fare']
fare_survived = train_data.loc[lived_mask, 'Fare']

# Mean fare per outcome, with the standard deviation as the error bar
# (row 0 = not survived, row 1 = survived).
average_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])
average_fare.plot(
    kind='bar',
    yerr=std_fare,
    legend=False,
    grid=True,
    figsize=(15, 4),
)

print('生还者的平均票价要大于未生还者的平均票价')
![9c4173263cc9873bb994996088bfe876.png](https://img-blog.csdnimg.cn/img_convert/9c4173263cc9873bb994996088bfe876.png)
十二,利用KNN分类模型,对结果进行预测
# 5. Predict survival with a KNN classifier.
# Requirements:
#   1) training columns: 'Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Family_Size'
#   2) predict survival for the samples in test.csv
# Hint:
#   1) encode sex numerically in the data -> 1 = male, 0 = female

# Numeric encoding for the Sex column.
sex_codes = {'male': 1, 'female': 0}

# Data cleaning: keep the training columns, drop rows with missing values,
# then encode Sex. map() inside assign() replaces the chained-indexing
# assignment, which is not guaranteed to modify the frame (it is a no-op
# under pandas copy-on-write).
knn_train = (
    train_data[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Family_Size']]
    .dropna()
    .assign(Sex=lambda frame: frame['Sex'].map(sex_codes))
)

# Same preparation for the test set, which first needs Family_Size
# (parents/children + siblings/spouses + the passenger).
test_data['Family_Size'] = test_data['Parch'] + test_data['SibSp'] + 1
knn_test = (
    test_data[['Pclass', 'Sex', 'Age', 'Fare', 'Family_Size']]
    .dropna()
    .assign(Sex=lambda frame: frame['Sex'].map(sex_codes))
)

print('清洗后训练集样本数据量为%i个' % len(knn_train))
knn_train.head()
print('清洗后测试集样本数据量为%i个' % len(knn_test))
knn_test.head()
# Predict survival for the test.csv samples with a KNN classifier.
from sklearn import neighbors

# Feature columns shared by fitting and prediction. Selecting them
# explicitly keeps predict() working when this code is run again after
# the 'predict' column has been added to knn_test.
feature_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'Family_Size']

# Build and fit the model.
knn = neighbors.KNeighborsClassifier()
knn.fit(knn_train[feature_cols], knn_train['Survived'])

# Predict the outcome of every cleaned test sample.
knn_test['predict'] = knn.predict(knn_test[feature_cols])

# Passengers predicted to survive, renumbered from 0 with the old index
# discarded.
pre_survived = knn_test[knn_test['predict'] == 1].reset_index(drop=True)
print('finished!')
pre_survived