再刷泰坦尼克号(2)

8 篇文章 0 订阅
2 篇文章 0 订阅

再刷泰坦尼克

特征工程

将训练集和测试集组合处理

#忽略警告提示
import warnings
warnings.filterwarnings('ignore')
#数据处理
import pandas as pd
import numpy as np
import random
#可视化
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Directory that holds the Titanic CSV files
path='C:/Users/Titanic/'
# Context managers guarantee both file handles are closed once the data has been parsed
with open(path+'train.csv') as p1, open(path+'test.csv') as p2:
    train=pd.read_csv(p1)
    test=pd.read_csv(p2)

# Stack the training rows and the test rows (in that order) into a single frame with a
# fresh 0..n-1 index so the feature engineering below is applied to both at once.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
combined=pd.concat([train,test],ignore_index=True)
combined.shape
(1309, 12)

提取乘客头衔

# Maps every raw honorific to one of six broader categories
# (Officer, Royalty, Mrs, Miss, Mr, Master)
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}


def _extract_title(name):
    """Return the text between the first comma and the following period of a name."""
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()


# First pull out the raw honorific, then collapse it into its category
combined['Title'] = combined['Name'].map(_extract_title)
combined['Title'] = combined['Title'].map(Title_Dictionary)

处理缺失值

处理年龄

从可视化分析可知,不同群体的年龄分布存在差异,简单地用平均值或众数来填充是不合适的

# Group the passengers by sex, passenger class and title and take the per-group medians.
# The first 891 rows and the remaining rows are summarised separately.
# numeric_only=True restricts the aggregation to numeric columns; since pandas 2.0 the
# default no longer drops text columns (Name, Ticket, ...) and median() raises on them.
grouped_train=combined.head(891).groupby(['Sex','Pclass','Title'])
grouped_median_train=grouped_train.median(numeric_only=True)
grouped_test=combined.iloc[891:].groupby(['Sex','Pclass','Title'])
grouped_median_test=grouped_test.median(numeric_only=True)
grouped_median_train
AgeFareParchPassengerIdSibSpSurvived
SexPclassTitle
female1Miss30.088.250000.0369.00.01.0
Mrs40.079.200000.0499.01.01.0
Officer49.025.929200.0797.00.01.0
Royalty40.563.050000.0658.50.51.0
2Miss24.013.000000.0437.50.01.0
Mrs31.526.000000.0439.51.01.0
3Miss18.08.756250.0372.00.00.5
Mrs31.015.975001.0405.51.00.5
male1Master4.0120.000002.0446.01.01.0
Mr40.042.400000.0463.00.00.0
Officer51.035.500000.0648.00.00.0
Royalty40.027.720800.0600.00.00.0
2Master1.026.000001.0408.01.01.0
Mr31.013.000000.0440.00.00.0
Officer46.513.000000.0358.50.00.0
3Master4.028.512501.0270.53.50.0
Mr26.07.895800.0472.00.00.0
# Display the per-group medians computed from the rows after the first 891
grouped_median_test
AgeFareParchPassengerIdSibSpSurvived
SexPclassTitle
female1Miss32.0158.208350.01074.00.0NaN
Mrs48.063.358300.01076.01.0NaN
Royalty39.0108.900000.01306.00.0NaN
2Miss19.524.500001.01121.01.0NaN
Mrs29.026.000000.01123.50.0NaN
3Miss22.07.879200.01090.50.0NaN
Mrs28.014.281250.51048.01.0NaN
male1Master9.5198.437502.01022.01.0NaN
Mr42.050.247900.01102.00.0NaN
Officer53.081.858300.01094.01.0NaN
2Master5.027.750001.51033.50.5NaN
Mr28.013.000000.01156.00.0NaN
Officer35.519.500000.51048.50.5NaN
3Master7.015.245801.01173.01.0NaN
Mr25.07.854200.01101.00.0NaN

因此,我们可以按乘客的性别、头衔和舱位等级分组,用各组的年龄中位数来填充缺失的年龄

def fillAges(row,grouped_median):
    """Return the median age of the passenger's (Sex, Pclass, Title) group.

    Args:
        row: Mapping-like passenger record exposing 'Sex', 'Pclass' and 'Title'.
        grouped_median: DataFrame indexed by (Sex, Pclass, Title) that has an
            'Age' column, e.g. the output of ``groupby([...]).median()``.

    Returns:
        The value stored in the 'Age' column for the passenger's group, or None
        when that group does not appear in ``grouped_median``.
    """
    key=(row['Sex'],row['Pclass'],row['Title'])
    # A direct index lookup serves every group present in the table, so no
    # combination has to be enumerated by hand (and none can be forgotten)
    if key not in grouped_median.index:
        return None
    return grouped_median.loc[key,'Age']

# Fill the missing ages with the group medians. The first 891 rows and the remaining
# rows are handled separately, each with the medians computed from its own part.
# Results are written back through combined.loc: assigning to combined.head(...) or
# combined.iloc[...] modifies a temporary slice and is not guaranteed to reach `combined`.
train_idx=combined.index[:891]
test_idx=combined.index[891:]
for part_idx,part_median in ((train_idx,grouped_median_train),(test_idx,grouped_median_test)):
    part=combined.loc[part_idx]
    age_missing=part[part['Age'].isnull()]
    if not age_missing.empty:
        # astype(float) turns a None (group without a median) into NaN
        combined.loc[age_missing.index,'Age']=age_missing.apply(
            fillAges,axis=1,args=(part_median,)).astype(float)
combined.info()
# Replace missing fares with the mean fare of the same part
for part_idx in (train_idx,test_idx):
    part_fare=combined.loc[part_idx,'Fare']
    combined.loc[part_idx,'Fare']=part_fare.fillna(part_fare.mean())
将缺失的 Embarked 填充为出现次数最多的登船地点 S
# Assign the filled column back instead of calling fillna(inplace=True) on a temporary
# slice, which is not guaranteed to modify `combined`. Both parts receive the same 'S'.
combined['Embarked']=combined['Embarked'].fillna('S')
填充缺失的Cabin
# Mark missing cabins with 'U'; the column is reassigned rather than filled in place
# through attribute access, which is not guaranteed to write through to `combined`
combined['Cabin']=combined['Cabin'].fillna('U')
# Keep only the first character of every cabin value
combined['Cabin'] = combined['Cabin'].map(lambda c : c[0])
combined.info()
# One-hot encode the title categories and attach the indicator columns
titleDf = pd.get_dummies(combined.Title, prefix='Title')
combined = pd.concat((combined, titleDf), axis='columns')
Parch&SibSp
# Same treatment as in the previous pass: derive the family size
# (Parch + SibSp + the passenger) and bucket it into three 0/1 indicators
family_size = combined['Parch'] + combined['SibSp'] + 1
familyDf = pd.DataFrame({'FamilySize': family_size})
familyDf['Family_Single'] = (family_size == 1).astype('int64')
familyDf['Family_Small'] = family_size.between(2, 4).astype('int64')
familyDf['Family_Large'] = (family_size >= 5).astype('int64')
combined = pd.concat((combined, familyDf), axis='columns')
Embarked
# One-hot encode the Embarked column and attach the indicator columns
embarkedDf = pd.get_dummies(combined.Embarked, prefix='Embarked')
combined = pd.concat((combined, embarkedDf), axis='columns')
Sex
# Encode sex as 1 (male) / 0 (female)
sex_mapDict = dict(male=1, female=0)
# Series.map looks every value of the column up in the dictionary
combined['Sex'] = combined.Sex.map(sex_mapDict)
Cabin
# One-hot encode the Cabin column and attach the indicator columns
cabinDf = pd.get_dummies(combined.Cabin, prefix='Cabin')
combined = pd.concat((combined, cabinDf), axis='columns')
Pclass
# One-hot encode the passenger class and attach the indicator columns
pclassDf = pd.get_dummies(combined.Pclass, prefix='Pclass')
combined = pd.concat((combined, pclassDf), axis='columns')
Ticket
# Extract the ticket prefix; a ticket without a prefix (purely numeric) yields 'XXX'
def cleanTicket(ticket):
    """Return the leading token of a ticket string with '.' and '/' removed.

    Args:
        ticket: Raw ticket string, e.g. 'A/5 21171' or '113803'.

    Returns:
        The first whitespace-separated token after stripping '.' and '/',
        or 'XXX' when that token is all digits or the ticket has no tokens.
    """
    parts=ticket.replace('.','').replace('/','').split()
    # An empty or punctuation-only ticket has no tokens; treat it like a ticket
    # without a prefix instead of failing on parts[0]
    if not parts or parts[0].isdigit():
        return 'XXX'
    return parts[0]

# Reduce every ticket to its prefix, then one-hot encode the prefixes
combined['Ticket'] = combined.Ticket.map(cleanTicket)
ticketsDf = pd.get_dummies(combined.Ticket, prefix='Ticket')
combined = pd.concat((combined, ticketsDf), axis='columns')
combined.head(3)
AgeCabinEmbarkedFareNameParchPassengerIdPclassSexSibSpTicket_SOTONO2Ticket_SOTONOQTicket_SPTicket_STONOTicket_STONO2Ticket_STONOQTicket_SWPPTicket_WCTicket_WEPTicket_XXX
022.0US7.2500Braund, Mr. Owen Harris013110000000000
138.0CC71.2833Cumings, Mrs. John Bradley (Florence Briggs Th…021010000000000
226.0US7.9250Heikkinen, Miss. Laina033000000100000

3 rows × 75 columns

# Remove the remaining columns that are not used as features
unused_columns = ['PassengerId', 'Cabin', 'Embarked', 'Name', 'Pclass', 'Ticket', 'Title']
combined.drop(labels=unused_columns, axis='columns', inplace=True)
combined.head(3)
AgeFareParchSexSibSpSurvivedTitle_MasterTitle_MissTitle_MrTitle_MrsTicket_SOTONO2Ticket_SOTONOQTicket_SPTicket_STONOTicket_STONO2Ticket_STONOQTicket_SWPPTicket_WCTicket_WEPTicket_XXX
022.07.25000110.000100000000000
138.071.28330011.000010000000000
226.07.92500001.001000000100000

3 rows × 68 columns

建立模型和预测

  • 1.将数据集拆分为训练集和测试集
  • 2.使用训练集建立预测模型
  • 3.使用训练集评估模型
  • 4.使用模型得到测试集预测结果
# Split the combined frame back into training features, training labels and test features
train_rows = combined.iloc[:891]
test_rows = combined.iloc[891:]
train_X = train_rows.drop('Survived', axis=1)
target_Y = train_rows['Survived']
test_X = test_rows.drop('Survived', axis=1)
print('训练集特征:', train_X.shape,
      '训练集标签:', target_Y.shape,
      '测试集特征:', test_X.shape)
训练集特征: (891, 67) 训练集标签: (891,) 测试集特征: (418, 67)
#导入库
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
try:
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV
    from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
    from sklearn.cross_validation import cross_val_score
except ImportError:
    # scikit-learn >= 0.20 removed the cross_validation and grid_search modules;
    # the same names are provided by sklearn.model_selection / sklearn.ensemble
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import cross_val_score
# Scoring helper
def compute_score(clf,X,y,scoring='accuracy',cv=5):
    """Return the mean cross-validated score of ``clf`` on ``(X, y)``.

    Args:
        clf: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        scoring: Scoring name forwarded to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``;
            defaults to 5, the value that used to be hard-coded.

    Returns:
        The mean of the per-fold scores.
    """
    xval=cross_val_score(clf,X,y,cv=cv,scoring=scoring)
    return np.mean(xval)

特征选择

一个好的特征选择可以:
* 1.减少数据之间的冗余
* 2.加速训练过程
* 3.防止过拟合

# Print the column names, non-null counts and dtypes of the training features
train_X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 67 columns):
Age               891 non-null float64
Fare              891 non-null float64
Parch             891 non-null int64
Sex               891 non-null int64
SibSp             891 non-null int64
Title_Master      891 non-null uint8
Title_Miss        891 non-null uint8
Title_Mr          891 non-null uint8
Title_Mrs         891 non-null uint8
Title_Officer     891 non-null uint8
Title_Royalty     891 non-null uint8
FamilySize        891 non-null int64
Family_Single     891 non-null int64
Family_Small      891 non-null int64
Family_Large      891 non-null int64
Embarked_C        891 non-null uint8
Embarked_Q        891 non-null uint8
Embarked_S        891 non-null uint8
Cabin_A           891 non-null uint8
Cabin_B           891 non-null uint8
Cabin_C           891 non-null uint8
Cabin_D           891 non-null uint8
Cabin_E           891 non-null uint8
Cabin_F           891 non-null uint8
Cabin_G           891 non-null uint8
Cabin_T           891 non-null uint8
Cabin_U           891 non-null uint8
Pclass_1          891 non-null uint8
Pclass_2          891 non-null uint8
Pclass_3          891 non-null uint8
Ticket_A          891 non-null uint8
Ticket_A4         891 non-null uint8
Ticket_A5         891 non-null uint8
Ticket_AQ3        891 non-null uint8
Ticket_AQ4        891 non-null uint8
Ticket_AS         891 non-null uint8
Ticket_C          891 non-null uint8
Ticket_CA         891 non-null uint8
Ticket_CASOTON    891 non-null uint8
Ticket_FC         891 non-null uint8
Ticket_FCC        891 non-null uint8
Ticket_Fa         891 non-null uint8
Ticket_LINE       891 non-null uint8
Ticket_LP         891 non-null uint8
Ticket_PC         891 non-null uint8
Ticket_PP         891 non-null uint8
Ticket_PPP        891 non-null uint8
Ticket_SC         891 non-null uint8
Ticket_SCA3       891 non-null uint8
Ticket_SCA4       891 non-null uint8
Ticket_SCAH       891 non-null uint8
Ticket_SCOW       891 non-null uint8
Ticket_SCPARIS    891 non-null uint8
Ticket_SCParis    891 non-null uint8
Ticket_SOC        891 non-null uint8
Ticket_SOP        891 non-null uint8
Ticket_SOPP       891 non-null uint8
Ticket_SOTONO2    891 non-null uint8
Ticket_SOTONOQ    891 non-null uint8
Ticket_SP         891 non-null uint8
Ticket_STONO      891 non-null uint8
Ticket_STONO2     891 non-null uint8
Ticket_STONOQ     891 non-null uint8
Ticket_SWPP       891 non-null uint8
Ticket_WC         891 non-null uint8
Ticket_WEP        891 non-null uint8
Ticket_XXX        891 non-null uint8
dtypes: float64(2), int64(7), uint8(58)
memory usage: 113.2 KB
# Use a random forest to estimate the importance of every input feature
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# n_estimators: number of trees to build; max_features: upper bound on the number of
# features the forest takes into account
clf = RandomForestClassifier(max_features='sqrt', n_estimators=50).fit(train_X, target_Y)
# A larger importance value indicates a more influential feature
features = (
    pd.DataFrame({'feature': train_X.columns,
                  'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=True)
    .set_index('feature')
)
features.plot(kind='barh', figsize=(20, 20))

这里写图片描述

# Select a subset of the feature columns using the forest fitted above
# (prefit=True: the selector does not refit the estimator)
model = SelectFromModel(estimator=clf, prefit=True)
train_reduced = model.transform(train_X)
train_reduced.shape
(891, 13)
# Apply the same feature selection to the test features
test_reduced=model.transform(test_X)
test_reduced.shape
(418, 13)

现在我们得到了13个特征。注意:下文的模型训练和预测仍然使用包含全部 67 个特征的 train_X / test_X,并没有用到筛选后的 train_reduced / test_reduced。

为了得到最佳的预测模型,需要对模型参数进行调整

# Set to True to run the (slow) grid search; otherwise the fixed parameters below are used
run_gs=False

if run_gs:
    # Local import: sklearn.model_selection (scikit-learn >= 0.18) supersedes the
    # cross_validation / grid_search modules that were removed in 0.20, and its
    # StratifiedKFold takes n_splits instead of the label vector
    from sklearn.model_selection import GridSearchCV,StratifiedKFold
    parameter_grid={
                'max_depth' : [4, 6, 8],
                'n_estimators': [50, 10],
                # 'auto' was an alias of 'sqrt' for classifiers (so it only repeated
                # the same candidates) and is rejected by recent scikit-learn releases
                'max_features': ['sqrt', 'log2'],
                'min_samples_split': [2, 3, 10],
                'min_samples_leaf': [1, 3, 10],
                'bootstrap': [True, False],
                }
    forest=RandomForestClassifier()
    cross_validation=StratifiedKFold(n_splits=5)
    # Search the grid for the best parameter combination
    grid_search=GridSearchCV(forest,
                            scoring='accuracy',
                            param_grid=parameter_grid,
                            cv=cross_validation)
    grid_search.fit(train_X,target_Y)
    model=grid_search
    parameters=grid_search.best_params_

    print('Best score:{}'.format(grid_search.best_score_))
    print('Best parameters:{}'.format(grid_search.best_params_))
else:
    parameters={'bootstrap':False,'min_samples_leaf': 3, 'n_estimators': 50, 
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    model=RandomForestClassifier(**parameters)
    model.fit(train_X, target_Y)
# Mean cross-validated accuracy of the chosen model on the training data
compute_score(model, train_X, target_Y, scoring='accuracy')
0.8271904225390074

输出结果

# Predict on the test features and write PassengerId / Survived pairs to pred.csv
output = model.predict(test_X).astype(int)
outputDf = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': output})
outputDf.to_csv(path + 'pred.csv', index=False)
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值