将训练集和测试集组合处理
#忽略警告提示
import warnings
warnings.filterwarnings('ignore')
#数据处理
import pandas as pd
import numpy as np
import random
#可视化
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
# Directory that holds the Titanic train.csv / test.csv files.
path='C:/Users/Titanic/'
# Hand the paths straight to pandas: it opens and closes the files itself,
# so no file handle is left open for the rest of the session.
train=pd.read_csv(path+'train.csv')
test=pd.read_csv(path+'test.csv')
# Stack the training and test sets so every transformation is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
combined=pd.concat([train,test],ignore_index=True)
combined.shape
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
(1309, 12)
- 1
- 2
提取乘客头衔
# Lookup from the raw honorific in a passenger name to a coarser title group.
Title_Dictionary = {
    # officers
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    # nobility
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir": "Royalty",
    "Lady": "Royalty",
    "the Countess": "Royalty",
    # everyday titles (foreign and variant forms folded into the common one)
    "Mme": "Mrs",
    "Ms": "Mrs",
    "Mrs": "Mrs",
    "Mlle": "Miss",
    "Miss": "Miss",
    "Mr": "Mr",
    "Master": "Master",
}
# The honorific is the text between the first comma and the following period
# of the name; it is then translated to its group through the dictionary.
combined['Title'] = (
    combined['Name']
    .map(lambda full_name: full_name.split(',')[1].split('.')[0].strip())
    .map(Title_Dictionary)
)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
处理缺失值
处理年龄
从可视化分析可知,不同群体的年龄分布存在差异,简单地用平均值或者众数填充是不合适的
# Group each split (first 891 rows = training set, the rest = test set) by sex,
# passenger class and title, then take the per-group medians.
# numeric_only=True keeps the text columns out of the aggregation; without it
# pandas >= 2.0 raises a TypeError when it meets a non-numeric column.
grouped_train=combined.head(891).groupby(['Sex','Pclass','Title'])
grouped_median_train=grouped_train.median(numeric_only=True)
grouped_test=combined.iloc[891:].groupby(['Sex','Pclass','Title'])
grouped_median_test=grouped_test.median(numeric_only=True)
- 1
- 2
- 3
- 4
- 5
grouped_median_train
- 1
Age | Fare | Parch | PassengerId | SibSp | Survived | |||
---|---|---|---|---|---|---|---|---|
Sex | Pclass | Title | ||||||
female | 1 | Miss | 30.0 | 88.25000 | 0.0 | 369.0 | 0.0 | 1.0 |
Mrs | 40.0 | 79.20000 | 0.0 | 499.0 | 1.0 | 1.0 | ||
Officer | 49.0 | 25.92920 | 0.0 | 797.0 | 0.0 | 1.0 | ||
Royalty | 40.5 | 63.05000 | 0.0 | 658.5 | 0.5 | 1.0 | ||
2 | Miss | 24.0 | 13.00000 | 0.0 | 437.5 | 0.0 | 1.0 | |
Mrs | 31.5 | 26.00000 | 0.0 | 439.5 | 1.0 | 1.0 | ||
3 | Miss | 18.0 | 8.75625 | 0.0 | 372.0 | 0.0 | 0.5 | |
Mrs | 31.0 | 15.97500 | 1.0 | 405.5 | 1.0 | 0.5 | ||
male | 1 | Master | 4.0 | 120.00000 | 2.0 | 446.0 | 1.0 | 1.0 |
Mr | 40.0 | 42.40000 | 0.0 | 463.0 | 0.0 | 0.0 | ||
Officer | 51.0 | 35.50000 | 0.0 | 648.0 | 0.0 | 0.0 | ||
Royalty | 40.0 | 27.72080 | 0.0 | 600.0 | 0.0 | 0.0 | ||
2 | Master | 1.0 | 26.00000 | 1.0 | 408.0 | 1.0 | 1.0 | |
Mr | 31.0 | 13.00000 | 0.0 | 440.0 | 0.0 | 0.0 | ||
Officer | 46.5 | 13.00000 | 0.0 | 358.5 | 0.0 | 0.0 | ||
3 | Master | 4.0 | 28.51250 | 1.0 | 270.5 | 3.5 | 0.0 | |
Mr | 26.0 | 7.89580 | 0.0 | 472.0 | 0.0 | 0.0 |
grouped_median_test
- 1
Age | Fare | Parch | PassengerId | SibSp | Survived | |||
---|---|---|---|---|---|---|---|---|
Sex | Pclass | Title | ||||||
female | 1 | Miss | 32.0 | 158.20835 | 0.0 | 1074.0 | 0.0 | NaN |
Mrs | 48.0 | 63.35830 | 0.0 | 1076.0 | 1.0 | NaN | ||
Royalty | 39.0 | 108.90000 | 0.0 | 1306.0 | 0.0 | NaN | ||
2 | Miss | 19.5 | 24.50000 | 1.0 | 1121.0 | 1.0 | NaN | |
Mrs | 29.0 | 26.00000 | 0.0 | 1123.5 | 0.0 | NaN | ||
3 | Miss | 22.0 | 7.87920 | 0.0 | 1090.5 | 0.0 | NaN | |
Mrs | 28.0 | 14.28125 | 0.5 | 1048.0 | 1.0 | NaN | ||
male | 1 | Master | 9.5 | 198.43750 | 2.0 | 1022.0 | 1.0 | NaN |
Mr | 42.0 | 50.24790 | 0.0 | 1102.0 | 0.0 | NaN | ||
Officer | 53.0 | 81.85830 | 0.0 | 1094.0 | 1.0 | NaN | ||
2 | Master | 5.0 | 27.75000 | 1.5 | 1033.5 | 0.5 | NaN | |
Mr | 28.0 | 13.00000 | 0.0 | 1156.0 | 0.0 | NaN | ||
Officer | 35.5 | 19.50000 | 0.5 | 1048.5 | 0.5 | NaN | ||
3 | Master | 7.0 | 15.24580 | 1.0 | 1173.0 | 1.0 | NaN | |
Mr | 25.0 | 7.85420 | 0.0 | 1101.0 | 0.0 | NaN |
因此,我们可以按乘客的性别、头衔和舱位等级分组,用各组的年龄中位数来填充缺失的年龄
def fillAges(row, grouped_median):
    """Return the median age of the (Sex, Pclass, Title) group ``row`` belongs to.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Sex', 'Pclass' and 'Title'.
    grouped_median : pandas.DataFrame
        Per-group medians indexed by (Sex, Pclass, Title) with an 'Age' column.

    Returns
    -------
    float
        The group's median age, or NaN when ``grouped_median`` has no entry
        for that combination.
    """
    group_key = (row['Sex'], row['Pclass'], row['Title'])
    try:
        # One direct lookup covers every group present in the table, rather
        # than a fixed, hand-written list of combinations.
        return grouped_median.loc[group_key, 'Age']
    except KeyError:
        # Group not present in this split: leave the age missing instead of
        # aborting the whole fill.
        return float('nan')
# Fill missing ages split by split, using the medians computed from that split.
# The result is written back through combined.iloc: assigning to an attribute
# of combined.head(...) / combined.iloc[...] only changes a temporary object
# and is not guaranteed to reach `combined` itself.
age_col = combined.columns.get_loc('Age')
combined.iloc[:891, age_col] = combined.iloc[:891].apply(
    lambda r: fillAges(r, grouped_median_train) if np.isnan(r['Age']) else r['Age'],
    axis=1,
).values
combined.iloc[891:, age_col] = combined.iloc[891:].apply(
    lambda r: fillAges(r, grouped_median_test) if np.isnan(r['Age']) else r['Age'],
    axis=1,
).values
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
combined.info()
- 1
# Fill missing fares with the mean fare of the same split (first 891 rows are
# the training set, the remainder the test set).
# fillna(inplace=True) on a head()/iloc slice edits a temporary object, so the
# filled values are assigned back into `combined` explicitly.
fare_col = combined.columns.get_loc('Fare')
train_fare = combined.iloc[:891, fare_col]
test_fare = combined.iloc[891:, fare_col]
combined.iloc[:891, fare_col] = train_fare.fillna(train_fare.mean()).values
combined.iloc[891:, fare_col] = test_fare.fillna(test_fare.mean()).values
- 1
- 2
填充缺失Embarked为登船地点最多的S
# Both splits are filled with the same value 'S', so one assignment on the whole
# column is enough. Assigning the result back (instead of fillna(inplace=True)
# on a temporary slice) guarantees that `combined` is actually updated.
combined['Embarked'] = combined['Embarked'].fillna('S')
- 1
- 2
填充缺失的Cabin
# Mark missing cabins with 'U', then keep only the first character of each
# cabin value. The filled column is assigned back rather than relying on an
# in-place fillna through attribute access, which may act on a temporary copy.
combined['Cabin'] = combined['Cabin'].fillna('U').map(lambda c: c[0])
- 1
- 2
combined.info()
- 1
# One-hot encode the title groups and append the indicator columns to `combined`.
titleDf = pd.get_dummies(combined.Title, prefix='Title')
combined = pd.concat((combined, titleDf), axis=1)
- 1
- 2
- 3
Parch&SibSp
# Build FamilySize as Parch + SibSp + 1, then derive three 0/1 indicator columns
# that bucket the size: exactly 1, between 2 and 4, and 5 or more.
familyDf = pd.DataFrame({'FamilySize': combined['Parch'] + combined['SibSp'] + 1})
familyDf['Family_Single'] = familyDf['FamilySize'].map(lambda size: int(size == 1))
familyDf['Family_Small'] = familyDf['FamilySize'].map(lambda size: int(2 <= size <= 4))
familyDf['Family_Large'] = familyDf['FamilySize'].map(lambda size: int(size >= 5))
combined = pd.concat((combined, familyDf), axis=1)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
Embarked
# One-hot encode the embarkation port and append the indicator columns.
embarkedDf = pd.get_dummies(combined.Embarked, prefix='Embarked')
combined = pd.concat((combined, embarkedDf), axis=1)
- 1
- 2
Sex
# Encode sex numerically: male -> 1, female -> 0.
sex_mapDict = dict(male=1, female=0)
# Series.map looks every value of the column up in the dictionary.
combined['Sex'] = combined.Sex.map(sex_mapDict)
- 1
- 2
- 3
- 4
Cabin
# One-hot encode the cabin value and append the indicator columns.
cabinDf = pd.get_dummies(combined.Cabin, prefix='Cabin')
combined = pd.concat((combined, cabinDf), axis=1)
- 1
- 2
Pclass
# One-hot encode the passenger class and append the indicator columns.
pclassDf = pd.get_dummies(combined.Pclass, prefix='Pclass')
combined = pd.concat((combined, pclassDf), axis=1)
- 1
- 2
Ticket
def cleanTicket(ticket):
    """Return the prefix of a ticket string, or 'XXX' when it has none.

    Periods and slashes are removed, the remainder is split on whitespace and
    the first token is inspected: a purely numeric first token means the ticket
    has no prefix. A ticket that is empty once cleaned is treated the same way
    instead of raising IndexError.
    """
    tokens = ticket.replace('.', '').replace('/', '').split()
    if not tokens or tokens[0].isdigit():
        return 'XXX'
    return tokens[0]
# Reduce every ticket to its prefix, then one-hot encode the prefixes and
# append the indicator columns.
combined['Ticket'] = combined.Ticket.apply(cleanTicket)
ticketsDf = pd.get_dummies(combined.Ticket, prefix='Ticket')
combined = pd.concat((combined, ticketsDf), axis=1)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
combined.head(3)
- 1
Age | Cabin | Embarked | Fare | Name | Parch | PassengerId | Pclass | Sex | SibSp | … | Ticket_SOTONO2 | Ticket_SOTONOQ | Ticket_SP | Ticket_STONO | Ticket_STONO2 | Ticket_STONOQ | Ticket_SWPP | Ticket_WC | Ticket_WEP | Ticket_XXX | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22.0 | U | S | 7.2500 | Braund, Mr. Owen Harris | 0 | 1 | 3 | 1 | 1 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 38.0 | C | C | 71.2833 | Cumings, Mrs. John Bradley (Florence Briggs Th… | 0 | 2 | 1 | 0 | 1 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 26.0 | U | S | 7.9250 | Heikkinen, Miss. Laina | 0 | 3 | 3 | 0 | 0 | … | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 rows × 75 columns
# Remove the remaining columns that are not used as features.
combined.drop(
    labels=['PassengerId', 'Cabin', 'Embarked', 'Name', 'Pclass', 'Ticket', 'Title'],
    axis=1,
    inplace=True,
)
- 1
- 2
combined.head(3)
- 1
Age | Fare | Parch | Sex | SibSp | Survived | Title_Master | Title_Miss | Title_Mr | Title_Mrs | … | Ticket_SOTONO2 | Ticket_SOTONOQ | Ticket_SP | Ticket_STONO | Ticket_STONO2 | Ticket_STONOQ | Ticket_SWPP | Ticket_WC | Ticket_WEP | Ticket_XXX | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22.0 | 7.2500 | 0 | 1 | 1 | 0.0 | 0 | 0 | 1 | 0 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 38.0 | 71.2833 | 0 | 0 | 1 | 1.0 | 0 | 0 | 0 | 1 | … | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 26.0 | 7.9250 | 0 | 0 | 0 | 1.0 | 0 | 1 | 0 | 0 | … | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 rows × 68 columns
建立模型和预测
- 1.将数据集拆分为训练集和测试集
- 2.使用训练集建立预测模型
- 3.使用训练集评估模型
- 4.使用模型得到测试集预测结果
# Split `combined` back into its two parts: the first 891 rows carry labels
# (training data), the remaining rows are the test data to predict.
train_X = combined.iloc[:891].drop('Survived', axis=1)
target_Y = combined['Survived'].iloc[:891]
test_X = combined.iloc[891:].drop('Survived', axis=1)
print('训练集特征:', train_X.shape, '训练集标签:', target_Y.shape, '测试集特征:', test_X.shape)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
训练集特征: (891, 67) 训练集标签: (891,) 测试集特征: (418, 67)
- 1
- 2
#导入库
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
def compute_score(clf, X, y, scoring='accuracy'):
    """Return the mean 5-fold cross-validation score of ``clf`` on ``(X, y)``.

    ``scoring`` is forwarded unchanged to ``cross_val_score``.
    """
    fold_scores = cross_val_score(clf, X, y, scoring=scoring, cv=5)
    return np.mean(fold_scores)
- 1
- 2
- 3
- 4
特征选择
一个好的特征选择可以:
* 1.减少数据之间的冗余
* 2.加速训练过程
* 3.防止过拟合
train_X.info()
- 1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 67 columns):
Age 891 non-null float64
Fare 891 non-null float64
Parch 891 non-null int64
Sex 891 non-null int64
SibSp 891 non-null int64
Title_Master 891 non-null uint8
Title_Miss 891 non-null uint8
Title_Mr 891 non-null uint8
Title_Mrs 891 non-null uint8
Title_Officer 891 non-null uint8
Title_Royalty 891 non-null uint8
FamilySize 891 non-null int64
Family_Single 891 non-null int64
Family_Small 891 non-null int64
Family_Large 891 non-null int64
Embarked_C 891 non-null uint8
Embarked_Q 891 non-null uint8
Embarked_S 891 non-null uint8
Cabin_A 891 non-null uint8
Cabin_B 891 non-null uint8
Cabin_C 891 non-null uint8
Cabin_D 891 non-null uint8
Cabin_E 891 non-null uint8
Cabin_F 891 non-null uint8
Cabin_G 891 non-null uint8
Cabin_T 891 non-null uint8
Cabin_U 891 non-null uint8
Pclass_1 891 non-null uint8
Pclass_2 891 non-null uint8
Pclass_3 891 non-null uint8
Ticket_A 891 non-null uint8
Ticket_A4 891 non-null uint8
Ticket_A5 891 non-null uint8
Ticket_AQ3 891 non-null uint8
Ticket_AQ4 891 non-null uint8
Ticket_AS 891 non-null uint8
Ticket_C 891 non-null uint8
Ticket_CA 891 non-null uint8
Ticket_CASOTON 891 non-null uint8
Ticket_FC 891 non-null uint8
Ticket_FCC 891 non-null uint8
Ticket_Fa 891 non-null uint8
Ticket_LINE 891 non-null uint8
Ticket_LP 891 non-null uint8
Ticket_PC 891 non-null uint8
Ticket_PP 891 non-null uint8
Ticket_PPP 891 non-null uint8
Ticket_SC 891 non-null uint8
Ticket_SCA3 891 non-null uint8
Ticket_SCA4 891 non-null uint8
Ticket_SCAH 891 non-null uint8
Ticket_SCOW 891 non-null uint8
Ticket_SCPARIS 891 non-null uint8
Ticket_SCParis 891 non-null uint8
Ticket_SOC 891 non-null uint8
Ticket_SOP 891 non-null uint8
Ticket_SOPP 891 non-null uint8
Ticket_SOTONO2 891 non-null uint8
Ticket_SOTONOQ 891 non-null uint8
Ticket_SP 891 non-null uint8
Ticket_STONO 891 non-null uint8
Ticket_STONO2 891 non-null uint8
Ticket_STONOQ 891 non-null uint8
Ticket_SWPP 891 non-null uint8
Ticket_WC 891 non-null uint8
Ticket_WEP 891 non-null uint8
Ticket_XXX 891 non-null uint8
dtypes: float64(2), int64(7), uint8(58)
memory usage: 113.2 KB
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
# Use a random forest to estimate feature importances
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# n_estimators: number of trees to build; max_features: how many features are
# considered at each split. fit() returns the estimator itself, so construction
# and fitting are chained.
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt').fit(train_X, target_Y)
- 1
- 2
- 3
- 4
- 5
- 6
# Table of feature importances from the fitted forest: one row per training
# column, ordered from least to most important and indexed by feature name.
features = (
    pd.DataFrame({'feature': train_X.columns, 'importance': clf.feature_importances_})
    .sort_values(by=['importance'], ascending=True)
    .set_index('feature')
)
- 1
- 2
- 3
- 4
- 5
features.plot(kind='barh',figsize=(20,20))
- 1
# Select features using the already-fitted forest (prefit=True reuses `clf`
# as is), then reduce the training matrix to the selected columns.
model = SelectFromModel(estimator=clf, prefit=True)
train_reduced = model.transform(train_X)
train_reduced.shape
- 1
- 2
- 3
- 4
(891, 13)
- 1
- 2
# Apply the same selector to the test features.
# NOTE(review): train_reduced / test_reduced are not used by any later code
# visible in this file; the final model is fitted on train_X — confirm intent.
test_reduced=model.transform(test_X)
test_reduced.shape
- 1
- 2
(418, 13)
- 1
- 2
现在我们得到了13个特征
为了得到最佳的预测模型,需要对模型参数进行调整
# Set to True to search the parameter grid; otherwise the fixed parameters in
# the else-branch are used.
run_gs = False
if run_gs:
    parameter_grid = {
        'max_depth': [4, 6, 8],
        'n_estimators': [50, 10],
        # 'auto' is no longer accepted by RandomForestClassifier and only
        # duplicated 'sqrt' for classifiers, so it is left out of the grid.
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False],
    }
    forest = RandomForestClassifier()
    # model_selection API: the splitter is configured with n_splits only and
    # receives the labels later, when GridSearchCV calls its split() method.
    cross_validation = StratifiedKFold(n_splits=5)
    # Search the grid for the best-scoring parameter combination.
    grid_search = GridSearchCV(forest,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation)
    grid_search.fit(train_X, target_Y)
    model = grid_search
    parameters = grid_search.best_params_
    print('Best score:{}'.format(grid_search.best_score_))
    print('Best parameters:{}'.format(grid_search.best_params_))
else:
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}
    model = RandomForestClassifier(**parameters)
    model.fit(train_X, target_Y)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
compute_score(model, train_X, target_Y, scoring='accuracy')
- 1
0.8271904225390074
- 1
- 2
输出结果
# Predict the test rows, pair each prediction with its PassengerId and write
# the result to pred.csv in the data directory.
output = model.predict(test_X).astype(int)
outputDf = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': output})
outputDf.to_csv(path + 'pred.csv', index=False)