Titanic: Machine Learning from Disaster_01

一、数据预处理

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

1 导入数据

# Kaggle input paths for the Titanic train/test CSV files.
trainfile = '../input/titanic/train.csv'
testfile = '../input/titanic/test.csv'

train = pd.read_csv(trainfile)
test = pd.read_csv(testfile)
# Stack train and test so the cleaning steps are applied uniformly to both.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without
# it each half keeps its own 0-based labels, so the labels of the shorter
# half occur twice and label-based selection/alignment becomes ambiguous.
# Row order is unchanged: the training rows still come first.
data_all = pd.concat([train, test], ignore_index=True)
data_all.info()
data_all.isna().sum()

2 数据探索

2.1 定类、定序数据分析

# One count plot per nominal/ordinal feature, bars split by Survived.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
categorical_panels = (
    (ax1, 'Pclass', 'Feature analysis-Pclass '),
    (ax2, 'Sex', 'Feature analysis-Sex'),
    (ax3, 'Embarked', 'Feature analysis-Embarked'),
)
for axis, column, heading in categorical_panels:
    sns.countplot(x=column, hue='Survived', data=data_all, ax=axis)
    axis.set_title(heading)

# Figure-level title, placed slightly above the row of subplots.
f.suptitle('Analysis of classified/ordered data', size=20, y=1.1)

2.2 定距数据分析

# Four panels: count plots for the discrete counts (SibSp, Parch) and violin
# plots for the continuous features (Fare, Age), all against Survived.
f, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1, ncols=4, figsize=(20, 4))

for axis, column in ((ax1, 'SibSp'), (ax2, 'Parch')):
    sns.countplot(x=column, hue='Survived', data=data_all, ax=axis)
    axis.set_title(f'Feature analysis-{column}')

for axis, column in ((ax3, 'Fare'), (ax4, 'Age')):
    sns.violinplot(x='Survived', y=column, data=data_all, ax=axis)
    axis.set_title(f'Feature analysis-{column}')

# Figure-level title, placed slightly above the row of subplots.
f.suptitle('Analysis of distance data', size=20, y=1.1)

3 数据清洗

# Re-inspect the combined frame before cleaning: column dtypes and non-null
# counts, then the number of missing values per column.
data_all.info()
data_all.isna().sum()

3.1 Name、Age的处理

def _title_from_name(full_name):
    """Return the honorific embedded in a passenger name.

    Takes the segment after the first comma, keeps what precedes its first
    period, and strips surrounding whitespace.
    """
    after_comma = full_name.split(',')[1]
    honorific = after_comma.split('.')[0]
    return honorific.strip()


# Derive a Title column from the Name column.
data_all['Title'] = data_all['Name'].apply(_title_from_name)
data_all.Title.value_counts()
# Cross-tabulate Title against Sex (the original author notes that only "Dr"
# occurs for both sexes).
pd.crosstab(data_all.Title, data_all.Sex)
# Locate the female "Dr" row (the original author notes PassengerId 797).
data_all[(data_all.Title == 'Dr') & (data_all.Sex == 'female')]

 

# Collapse the raw titles into a handful of groups. The grouping is written
# per target group and then inverted into the raw-title -> group lookup `m`.
title_groups = {
    'Rareman': ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Major', 'Sir'],
    'Rarewoman': ['Dona', 'Lady', 'Mlle', 'Mme', 'Ms', 'the Countess'],
    'Master': ['Master'],
    'Miss': ['Miss'],
    'Mr': ['Mr', 'Rev'],
    'Mrs': ['Mrs'],
}
m = {raw: group for group, raws in title_groups.items() for raw in raws}
data_all['Title'] = data_all['Title'].map(m)

# "Dr" maps to 'Rareman' above; the passenger with PassengerId 797 is the
# female "Dr", so move that row to 'Rarewoman'.
is_female_dr = data_all.PassengerId == 797
data_all.loc[is_female_dr, 'Title'] = 'Rarewoman'

data_all.Title.value_counts()
# Print the Age summary statistics of every title group. Original author's
# notes on the output: "Miss" includes young girls and "Master" is mainly boys.
for title in ('Mr', 'Miss', 'Mrs', 'Master', 'Rareman', 'Rarewoman'):
    print(data_all[data_all.Title == title]['Age'].describe())

# Children tend to have a higher survival rate, so a 'Girl' title is derived
# next. Missing ages are first marked with the sentinel 999 so later steps can
# tell "unknown age" apart from a recorded one.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on `data_all.Age` operates on an intermediate
# object and does not update the frame under pandas Copy-on-Write.
data_all['Age'] = data_all['Age'].fillna(999)

def girl(data_all):
    """Return 'Girl' for a row that looks like a young girl, else its Title.

    `data_all` is a single passenger row exposing `Title`, `Age` and `Parch`.
    An Age of 999 is the sentinel for "age not recorded".

    Only rows titled 'Miss' can be relabelled:
    - with a recorded age, the row is a girl when Age <= 14;
    - without one, the row is a girl when Parch != 0 (a young girl is likely
      to be travelling with her parents).
    Every other row keeps its existing Title.
    """
    title = data_all.Title
    if title != 'Miss':
        return title

    age = data_all.Age
    if age == 999:
        looks_like_girl = data_all.Parch != 0
    else:
        looks_like_girl = age <= 14

    return 'Girl' if looks_like_girl else title
# Relabel young 'Miss' passengers as 'Girl', row by row.
data_all['Title'] = data_all.apply(girl, axis=1)

# Replace the 999 "age unknown" sentinel with the median age of the
# passenger's title group.
Tit = ['Mr', 'Miss', 'Mrs', 'Master', 'Girl', 'Rareman', 'Rarewoman']
age_unknown = data_all['Age'] == 999
# Fallback for a title group in which no age is recorded at all.
overall_median_age = data_all.loc[~age_unknown, 'Age'].median()
for i in Tit:
    in_group = data_all['Title'] == i
    # The median must be taken over recorded ages only; including the rows
    # that still hold the 999 sentinel would pull the imputed value upward.
    recorded_ages = data_all.loc[in_group & ~age_unknown, 'Age']
    fill_age = overall_median_age if recorded_ages.empty else recorded_ages.median()
    data_all.loc[in_group & age_unknown, 'Age'] = fill_age

3.2 Embarked的处理

# Embarked has only a few missing values, so they are filled with the mode.
# Count rows per port; the original author notes the most frequent value is "S".
data_all.groupby('Embarked')['Embarked'].count()
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on the selected column: the in-place form works on an intermediate object
# and does not update the frame under pandas Copy-on-Write.
data_all['Embarked'] = data_all['Embarked'].fillna('S')

3.3 Fare的处理

# Show the rows with a missing Fare (the original author notes they belong to
# Pclass == 3 passengers).
data_all[data_all.Fare.isnull()]
# Fill the missing fare with the median fare of third-class passengers.
# The filled column is assigned back rather than using
# fillna(..., inplace=True) on `data_all.Fare`, which does not update the
# frame under pandas Copy-on-Write.
third_class_median_fare = data_all.loc[data_all.Pclass == 3, 'Fare'].median()
data_all['Fare'] = data_all['Fare'].fillna(third_class_median_fare)
# Check the shape of the data once more.
data_all.info()
data_all.isna().sum()

4 属性规约

# Drop attributes that are not used as features: PassengerId (judged
# irrelevant by the original author), Ticket, Name, and Cabin (too many
# missing values). Each column is listed once; the original list named
# 'Ticket' twice.
data_cleaned = data_all.drop(columns=['PassengerId', 'Ticket', 'Name', 'Cabin'])
# Persist the cleaned table without the index column.
data_cleaned.to_csv('data_cleaned.csv', index=False)

5 重新分离数据集

# Split the cleaned table back into its train and test parts. The training
# rows were concatenated first, so they occupy the first len(train) positions;
# deriving the boundary from `train` avoids a hard-coded row count.
n_train = len(train)
data_train = data_cleaned.iloc[:n_train]
x_train = data_train.drop(columns=['Survived'])
y_train = data_train['Survived']
# Survived is dropped from the test part as well, leaving features only.
x_test = data_cleaned.iloc[n_train:].drop(columns=['Survived'])

6 数据变换

6.1 one-hot编码

# One-hot encode the categorical columns of the training features.
x_train_s = pd.get_dummies(x_train)
# Encode the test features, then align them to the training columns.
# Encoding the two sets independently yields different column sets whenever a
# category occurs in only one of them; reindexing adds any missing dummy
# column as all zeros, drops dummies unseen in training, and fixes the order.
x_test_s = pd.get_dummies(x_test).reindex(columns=x_train_s.columns, fill_value=0)

6.2 对Age、Fare进行标准化

# Standardize Age and Fare to zero mean / unit variance.
# The mean and standard deviation are taken from the training set only and
# reused for the test set, so both sets go through the same transformation;
# scaling the test set with its own statistics would put the two sets on
# different scales.
for column in ('Age', 'Fare'):
    train_mean = x_train_s[column].mean()
    train_std = x_train_s[column].std()
    x_train_s[column] = (x_train_s[column] - train_mean) / train_std
    x_test_s[column] = (x_test_s[column] - train_mean) / train_std

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值