This post is my personal write-up of a first hands-on attempt at the introductory Titanic competition on Kaggle.
The analysis combines what I learned from studying several top-rated Kaggle kernels with my own understanding.
Its highlights are the data-exploration section and the later model tuning; it was also my first exposure to the beautiful exploratory plots that seaborn and plotly's go module can produce.
Without further ado, let's begin.
0 Introduction
For a full description of the competition and its variables, see the official Kaggle page; it is quite thorough. One caveat: the variable descriptions are in English, so (as someone whose English is shaky) I always take care to understand exactly what each variable means before starting a Kaggle case.
The competition page is at https://www.kaggle.com/c/titanic.
1 Data Exploration
Exploration is usually done twice: once on the raw data, and again after preprocessing.
First pass
1.1 Overview
First, read in the data:
# imports used throughout this post
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
Save the summary statistics to a file, since Jupyter Notebook truncates wide output:
explore = train.describe(include = 'all').T
explore['null'] = len(train) - explore['count']
explore.insert(0,'dtype',train.dtypes)
explore.T.to_csv('explore1.csv')
Since the dataset is small, concatenate the training and test sets and explore them together:
alldata = pd.concat([train.loc[:, 'Pclass':'Embarked'], test.loc[:, 'Pclass':'Embarked']]).reset_index(drop=True)
explore = alldata.describe(include = 'all').T
explore['null'] = len(alldata) - explore['count']
explore.insert(0,'dtype',alldata.dtypes)
explore.T.to_csv('explore2.csv')
1.2 Data Quality
1.2.1 Missing Values
Define a helper that summarizes the missing values:
def missing_values(alldata):
    alldata_na = pd.DataFrame(alldata.isnull().sum(), columns=['missingNum'])
    alldata_na['missingRatio'] = alldata_na['missingNum'] / len(alldata) * 100
    alldata_na['existNum'] = len(alldata) - alldata_na['missingNum']
    alldata_na['train_notna'] = len(train) - train.isnull().sum()
    alldata_na['test_notna'] = alldata_na['existNum'] - alldata_na['train_notna']
    alldata_na['dtype'] = alldata.dtypes
    # keep only columns that have missing values, sorted by missing count
    alldata_na = alldata_na[alldata_na['missingNum'] > 0].reset_index().sort_values(
        by=['missingNum', 'index'], ascending=[False, True])
    alldata_na.set_index('index', inplace=True)
    return alldata_na
alldata_na = missing_values(alldata)
alldata_na
1.2.2 Duplicates
train[train.duplicated()]  # returns an empty frame
The result is empty, so there are no duplicate rows.
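For reference, had any duplicates shown up, dropping them is a one-liner (a sketch; not needed for this dataset):
# train = train.drop_duplicates().reset_index(drop=True)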
1.2.3 Outliers
Three common checks:
1. Simple summary statistics;
2. The 3-sigma rule: flag values that deviate from the mean by more than three standard deviations (see the sketch below);
3. Boxplot analysis: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
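A minimal sketch of the 3-sigma check, using the train frame loaded above (the choice of Fare is only for illustration):
fare = train['Fare']
outliers_3sigma = train[(fare - fare.mean()).abs() > 3 * fare.std()]
outliers_3sigma  # rows whose Fare deviates from the mean by more than 3 standard deviations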
train.boxplot()
plt.ylim(0,1000)
train[train.Fare>400]
# everyone in this high-fare slice survived, so tentatively a higher Fare may mean a higher survival rate; no clear outliers yet
1.3 Feature Analysis
1.3.1 Distributions
fig, ax = plt.subplots(2, 2, figsize = (8, 6))
sns.countplot(x='Pclass',data=train, ax = ax[0,1])
sns.countplot(x='Survived',data=train, ax = ax[1,1])
sns.violinplot(x='Survived', y='Age', data=train, ax=ax[1,0]).set(ylim=(-10, 80))
sns.countplot(x='Embarked', data=train, ax=ax[0,0])
plt.tight_layout()
plt.hist(x = [train[train['Survived']==1]['Pclass'], train[train['Survived']==0]['Pclass']], \
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Pclass Histogram by Survival')
plt.xlabel('Passenger Class')
plt.xticks(train['Pclass'].value_counts().index)
plt.ylabel('# of Passengers')
plt.legend()
# survival by class and sex
h = sns.FacetGrid(train, row = 'Sex', col = 'Pclass', hue = 'Survived')
h.map(plt.hist, 'Age', alpha = .75)
h.add_legend()
# survival by age
a = sns.FacetGrid( train, hue = 'Survived', aspect=4 )
a.map(sns.kdeplot, 'Age', shade= True )
a.set(xlim=(0, train['Age'].max()))
a.add_legend()
# age distribution within each class
plt.figure(figsize=(8,6))
sns.kdeplot(train[(train['Pclass']==1) & (train['Age'].notnull()==True)]['Age'], shade=True).set(ylim = (0, 0.045))
sns.kdeplot(train[(train['Pclass']==2) & (train['Age'].notnull()==True)]['Age'], shade=True)
sns.kdeplot(train[(train['Pclass']==3) & (train['Age'].notnull()==True)]['Age'], shade=True)
plt.legend(['1st class', '2nd class', '3rd class'], loc='best')
# does having a Cabin value affect survival?
Survived_cabin = train.Survived[pd.notnull(train.Cabin)].value_counts()
Survived_nocabin = train.Survived[pd.isnull(train.Cabin)].value_counts()
df = pd.DataFrame({'has cabin': Survived_cabin, 'no cabin': Survived_nocabin}).transpose()
df.plot(kind='bar')
plt.xticks(rotation = 0)
# survival by port of embarkation
sns.countplot(x = 'Embarked',hue = 'Survived', orient='h', data = train)
fig = plt.figure(figsize=(10,8))
fig.set(alpha=0.2)  # set the figure's alpha (transparency) parameter
plt.subplot2grid((2,3),(0,0))
sns.countplot(x='Survived',data=train)
# df.plot(kind='bar')
plt.subplot2grid((2,3),(0,1))
# sns.countplot(x='Pclass',data=train)
sns.countplot(x = 'Pclass',hue = 'Survived', orient='h', data = train)
plt.subplot2grid((2,3),(0,2))
sns.violinplot(x='Survived', y='Age', data=train)
plt.subplot2grid((2,3),(1,0), colspan=2)
sns.kdeplot(train[(train['Pclass']==1) & (train['Age'].notnull()==True)]['Age'], shade=True)#.set(ylim = (0, 0.045))
sns.kdeplot(train[(train['Pclass']==2) & (train['Age'].notnull()==True)]['Age'], shade=True)
sns.kdeplot(train[(train['Pclass']==3) & (train['Age'].notnull()==True)]['Age'], shade=True)
plt.legend(('1st class', '2nd class', '3rd class'), loc='best')
plt.ylabel('Density', fontsize=22)
plt.title('Age distribution by passenger class')
plt.ylim(0,0.05)
plt.subplot2grid((2,3),(1,2))
# only one plot fits in the last cell, and the Embarked breakdown was already drawn above,
# so show the Sex breakdown here (the original drew both into the same axes)
sns.countplot(x='Sex', hue='Survived', data=train)
plt.tight_layout()
plt.show()
# survival by sex: mean age per sex/survival group
sns.catplot(x='Sex', y='Age', hue='Survived', kind='point', estimator=np.mean,
            data=train, height=3, aspect=1.4)
# age histograms by sex and survival
h = sns.FacetGrid(train, row = 'Sex', hue = 'Survived')
h.map(plt.hist, 'Age', alpha = .75)
h.add_legend()
plt.figure(figsize=[8,4])
plt.subplot(121)
plt.boxplot(x=train['Fare'], showmeans = True, meanline = True)
plt.title('Fare Boxplot')
plt.ylabel('Fare ($)')
plt.subplot(122)
plt.hist(x = [train[train['Survived']==1]['Fare'], train[train['Survived']==0]['Fare']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Fare Histogram by Survival')
plt.xlabel('Fare ($)')
plt.ylabel('# of Passengers')
plt.legend()
# lower fares go with lower survival rates
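A quick numeric check of that reading (a sketch; the quartile split is only for illustration):
# survival rate within each fare quartile of the training set
print(train.groupby(pd.qcut(train['Fare'], 4))['Survived'].mean())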
1.3.2 Correlations
# corrmat = train.corr()
plt.subplots(figsize=(10,8))
corrmat = train[train.columns[1:]].corr(numeric_only=True)  # drop PassengerId; numeric columns only
sns.set(font_scale=1.25)
hm = sns.heatmap(corrmat, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10})
plt.title('Pearson Correlation of Features', y=1.05, size=15)
plt.show()
# From the heatmap above:
# (1) Pclass is negatively correlated with Survived and Fare: a larger Pclass (a lower class)
#     means a lower fare and a lower chance of survival;
# (2) passengers with a larger Pclass (a lower class) tend to be younger;
# (3) Fare has a positive effect on Survived
2 Data Preprocessing
2.1 Data Cleaning
Handle each column according to the missing-value analysis from the first exploration pass:
# Fare: fill with the mean (the median is a reasonable alternative)
alldata['Fare'] = alldata['Fare'].fillna(alldata['Fare'].mean())
# Embarked: fill with the mode
alldata['Embarked'] = alldata['Embarked'].fillna(alldata['Embarked'].mode()[0])
# Age: impute missing values with a random forest
from sklearn.ensemble import RandomForestRegressor
### fill the missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    # feed the available numeric features into the regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # y is the target (Age)
    y = known_age[:, 0]
    # X is the feature matrix
    X = known_age[:, 1:]
    # fit the regressor
    rfr = RandomForestRegressor(random_state=10, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # predict the unknown ages
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # write the predictions back into the original frame
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr
alldata, rfr = set_missing_ages(alldata)
# Cabin
alldata['Cabin'].value_counts()
# the Cabin values themselves show no obvious pattern, but the leading letter (the deck) can be extracted as a new feature
2.2 Data Transformation
2.2.1 Feature Construction
# 1. Cabin
alldata['CabinHead'] = alldata['Cabin'].str[0]  # equivalent to alldata['Cabin'].str.extract('(\S{1})')
alldata['CabinHead'] = alldata['CabinHead'].fillna('None')
alldata['CabinHead'].value_counts().sort_index()
train['CabinHead'] = train['Cabin'].str.extract('(\S{1})')
train['CabinHead'] = train['CabinHead'].fillna('None')
a = train['CabinHead'].value_counts().sort_index()
a.name = 'Cabinnum'
b= train[:891].groupby(['CabinHead'])['Survived'].sum().sort_index()
b.name = 'Survivednum'
c = pd.concat([a,b],axis=1)
c['survivedRate'] = c['Survivednum']/c['Cabinnum']*100
c
# decks B, D and E show markedly higher survival rates, so encode membership in them as a new feature
alldata['CabinAlpha'] = (alldata['CabinHead'].isin(['B','D','E']))*1
Create another feature from whether a Cabin value is present at all (NullCabin is 1 when a cabin is recorded):
alldata['NullCabin'] = (alldata['Cabin'].notnull()==True)*1
alldata['NullCabin'] = alldata['NullCabin'].fillna(0)
# 2. family-related features
alldata['NoSibSp'] = (alldata['SibSp']<=0)*1
alldata['NoParch'] = (alldata['Parch']<=0)*1
alldata['Family'] = alldata['SibSp'] + alldata['Parch'] + 1
alldata['isAlone'] = (alldata['Family']==1)*1
# 3. Ticket
Ticket = pd.DataFrame(alldata['Ticket'].value_counts())
Ticket.columns = ['PN']
Ticket
# the Ticket counts above show that many tickets are shared, presumably group purchases,
# so the listed Fare is the total for the whole group
# derive each passenger's real fare by dividing by the group size
alldata1 = pd.merge(alldata, Ticket, left_on = 'Ticket',right_index = True)
alldata1['realFare'] = alldata1['Fare']/alldata1['PN']
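A quick sanity check on the merge (a sketch; the column subset is just for readability): for a shared ticket, Fare is the group total and realFare the per-person share.
print(alldata1[alldata1['PN'] > 1][['Ticket', 'Fare', 'PN', 'realFare']].head())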
# 4. Name: build a new feature from the title embedded in each name
alldata1['Title'] = alldata1['Name'].str.split(", |\.", expand=True)[1]
# 'Ms' and 'Mlle' both mean Miss, and 'Mme' is the same as Mrs, so normalize them
alldata1.loc[alldata1['Title'].isin(['Ms', 'Mlle']), 'Title'] = 'Miss'
alldata1.loc[alldata1['Title'].isin(['Mme']), 'Title'] = 'Mrs'
alldata1['mother'] = ((alldata1['Sex']=='female') & (alldata1['Parch'] > 0)\
& (alldata1['Age']>=16)& (alldata1['Title']=='Mrs')) *1
# keep titles that occur at least stat_min times; bucket the rest into a 'Misc' class
stat_min = 10
title_names = (alldata1['Title'].value_counts() < stat_min)
alldata1['Title'] = alldata1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] else x)
2.2.2 Discretizing Continuous Features
alldata1['FareBin'] = pd.qcut(alldata1['realFare'], 4, labels=['low', 'normal', 'middle', 'high'])
alldata1['AgeBin'] = pd.cut(alldata1['Age'].astype(int), 5, labels=['youth', 'youngAdult', 'middle', 'Senior', 'old'])
alldata2 = alldata1.drop(['Name','SibSp', 'Parch','Ticket', 'Fare',\
'Cabin','PN'],axis=1)
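Note the difference between the two calls above: pd.qcut splits at quantiles, giving bins with roughly equal counts, while pd.cut splits the value range into equal-width bins. To see the edges actually chosen, retbins=True can be used; a minimal sketch:
_, fare_edges = pd.qcut(alldata1['realFare'], 4, retbins=True)
_, age_edges = pd.cut(alldata1['Age'].astype(int), 5, retbins=True)
print(fare_edges)  # quantile boundaries of realFare (equal-count bins)
print(age_edges)   # equal-width boundaries over the Age range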
Save the processed data:
alldata2 = alldata2.sort_index()
alldata2.to_excel('1_afterdataprocessing.xlsx')
2.2.3 Encoding Categorical Features
# option 1: dummy variables
typeList = ['Pclass','Sex','Embarked','CabinHead','Title','FareBin','AgeBin']
A = pd.concat([pd.get_dummies(alldata2[i],prefix = i) for i in typeList], axis=1)
B = alldata2[['NoSibSp', 'NoParch', 'NullCabin', 'CabinAlpha', 'Family', 'isAlone','mother','Age','realFare']]
alldata3 = pd.concat([A,B],axis = 1)
alldata3.head()
# option 2: label encoding (kept for reference)
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# label = LabelEncoder()
# alldata5['Sex'] = label.fit_transform(alldata2['Sex'])
# alldata5['Embarked'] = label.fit_transform(alldata2['Embarked'])
# alldata5['Title'] = label.fit_transform(alldata2['Title'])
# alldata5['AgeBin'] = label.fit_transform(alldata2['AgeBin'])
# alldata5['FareBin'] = label.fit_transform(alldata2['FareBin'])
# alldata5['CabinHead'] = label.fit_transform(alldata2['CabinHead'])
alldata3.columns
Output:
Index(['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'CabinHead_A', 'CabinHead_B', 'CabinHead_C', 'CabinHead_D', 'CabinHead_E', 'CabinHead_F', 'CabinHead_G', 'CabinHead_None', 'CabinHead_T', 'Title_Master', 'Title_Misc', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'FareBin_low', 'FareBin_normal', 'FareBin_middle', 'FareBin_high', 'AgeBin_youth', 'AgeBin_youngAdult', 'AgeBin_middle', 'AgeBin_Senior', 'AgeBin_old', 'NoSibSp', 'NoParch', 'NullCabin', 'CabinAlpha', 'Family', 'isAlone', 'mother', 'Age', 'realFare'], dtype='object')
X_train = alldata3[:len(train)]
X_test = alldata3[len(train):]
y_train = train['Survived']
Save the data for modeling:
alldata3.to_excel('2_afterDataPreprocessing.xlsx')
3 Second-Pass Data Exploration
train1 = pd.concat([alldata2.iloc[:len(train), :], train[['Survived']]], axis=1)
for x in train1.columns[:-1]:
    if train1[x].dtype != 'float64':
        print('Survival Correlation by:', x)
        print(train1[[x, 'Survived']].groupby(x, as_index=False).mean())
        print('-' * 10, '\n')
print(pd.crosstab(train1['Title'], train1['Survived']))
plt.figure(figsize=[16,12])
plt.subplot(231)
plt.boxplot(x=alldata2['realFare'], showmeans = True, meanline = True)
plt.title('Fare Boxplot')
plt.ylabel('Fare ($)')
plt.subplot(232)
plt.boxplot(alldata2['Age'], showmeans = True, meanline = True)
plt.title('Age Boxplot')
plt.ylabel('Age (Years)')
plt.subplot(233)
plt.boxplot(alldata2['Family'], showmeans = True, meanline = True)
plt.title('Family Size Boxplot')
plt.ylabel('Family Size (#)')
plt.subplot(234)
plt.hist(x = [train1[train1['Survived']==1]['realFare'], train1[train1['Survived']==0]['realFare']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Fare Histogram by Survival')
plt.xlabel('Fare ($)')
plt.ylabel('# of Passengers')
plt.legend()
plt.subplot(235)
plt.hist(x = [train1[train1['Survived']==1]['Age'], train1[train1['Survived']==0]['Age']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Age Histogram by Survival')
plt.xlabel('Age (Years)')
plt.ylabel('# of Passengers')
plt.legend()
plt.subplot(236)
plt.hist(x = [train1[train1['Survived']==1]['Family'], train1[train1['Survived']==0]['Family']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Family Size Histogram by Survival')
plt.xlabel('Family Size (#)')
plt.ylabel('# of Passengers')
plt.legend()
fig, saxis = plt.subplots(2, 3,figsize=(16,12))
sns.barplot(x = 'Embarked', y = 'Survived', data=train1, ax = saxis[0,0])
sns.barplot(x = 'Pclass', y = 'Survived', order=[1,2,3], data=train1, ax = saxis[0,1])
sns.barplot(x = 'isAlone', y = 'Survived', order=[1,0], data=train1, ax = saxis[0,2])
sns.pointplot(x = 'FareBin', y = 'Survived', data=train1, ax = saxis[1,0])
sns.pointplot(x = 'AgeBin', y = 'Survived', data=train1, ax = saxis[1,1])
sns.pointplot(x = 'Family', y = 'Survived', data=train1, ax = saxis[1,2])
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(16,6))
sns.boxplot(x = 'Pclass', y = 'realFare', hue = 'Survived', data =train1, ax = axis1)
axis1.set_title('Pclass vs Fare Survival Comparison')
sns.violinplot(x = 'Pclass', y = 'Age', hue = 'Survived', data = train1, split = True, ax = axis2)
axis2.set_title('Pclass vs Age Survival Comparison')
sns.boxplot(x = 'Pclass', y ='Family', hue = 'Survived', data = train1, ax = axis3)
axis3.set_title('Pclass vs Family Size Survival Comparison')
# fig = plt.figure()
# ax = fig.add_subplot(1,1,1)
fig, qaxis = plt.subplots(1,3,figsize=(16,6))
sns.barplot(x = 'Sex', y = 'Survived', hue = 'Embarked', data=train1, ax = qaxis[0])
qaxis[0].set_title('Sex vs Embarked Survival Comparison')
sns.barplot(x = 'Sex', y = 'Survived', hue = 'Pclass', data=train1, ax = qaxis[1])
qaxis[1].set_title('Sex vs Pclass Survival Comparison')
sns.barplot(x = 'Sex', y = 'Survived', hue = 'isAlone', data=train1, ax = qaxis[2])
qaxis[2].set_title('Sex vs IsAlone Survival Comparison')
fig, (maxis1, maxis2) = plt.subplots(1, 2,figsize=(12,6))
#how does family size factor with sex & survival
sns.pointplot(x="Family", y="Survived", hue="Sex", data=train1,
palette={"male": "blue", "female": "pink"},
markers=["*", "o"], linestyles=["-", "--"], ax = maxis1)
#how does class factor with sex & survival
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=train1,
palette={"male": "blue", "female": "pink"},
markers=["*", "o"], linestyles=["-", "--"], ax = maxis2)
e = sns.FacetGrid(train1, col = 'Embarked')
e.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', ci=95.0, palette = 'deep')
e.add_legend()
#plot distributions of Age of passengers who survived or did not survive
a = sns.FacetGrid(train1, hue = 'Survived', aspect=4 )
a.map(sns.kdeplot, 'Age', shade= True )
a.set(xlim=(0, train1['Age'].max()))
a.add_legend()
#histogram
h = sns.FacetGrid(train1, row = 'Sex', col = 'Pclass', hue = 'Survived')
h.map(plt.hist, 'Age', alpha = .75)
h.add_legend()
def correlation_heatmap(df):
    _, ax = plt.subplots(figsize=(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        df.corr(numeric_only=True),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12}
    )
    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(train1)
pd.crosstab(train1['Family'],train1['Survived'])
g = sns.pairplot(train1, hue='Survived', palette='seismic', height=1.2, diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])
4 Modeling
alldata = pd.read_excel('2_afterDataPreprocessing.xlsx', index_col=0)
Define the features and the target:
X_train = alldata[:len(train)]
X_test = alldata[len(train):]
y_train = train['Survived']
# define an evaluation helper: mean and std of 5-fold cross-validated accuracy
from sklearn import model_selection

def rmsl(clf):
    s = model_selection.cross_val_score(clf, X_train, y_train, cv=5)
    return (s.mean(), s.std())
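Example usage (a sketch; the printed values depend on the data and library version):
from sklearn.linear_model import LogisticRegression
print(rmsl(LogisticRegression()))  # -> (mean accuracy, std) over the 5 folds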
The modeling plan: build baseline models first, then combine them through model ensembling.
4.1 Baseline Models
# compare a battery of candidate models
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
MLA = [
#Ensemble Methods
ensemble.AdaBoostClassifier(),
ensemble.BaggingClassifier(),
ensemble.ExtraTreesClassifier(),
ensemble.GradientBoostingClassifier(),
ensemble.RandomForestClassifier(n_estimators = 60),
#Gaussian Processes
gaussian_process.GaussianProcessClassifier(),
#GLM
linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6),
linear_model.LogisticRegressionCV(),
linear_model.PassiveAggressiveClassifier(),
linear_model.RidgeClassifierCV(),
linear_model.SGDClassifier(),
linear_model.Perceptron(),
#Naive Bayes
naive_bayes.GaussianNB(),
#Nearest Neighbor
neighbors.KNeighborsClassifier(n_neighbors = 3),
#SVM
svm.SVC(probability=True),
svm.LinearSVC(),
#Trees
tree.DecisionTreeClassifier(),
tree.ExtraTreeClassifier(),
]
#create table to compare MLA
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy Min' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)
#index through MLA and save performance to table
row_index = 0
for alg in MLA:
    # set name and parameters
    MLA_compare.loc[row_index, 'MLA Name'] = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    # score the model with 5-fold cross-validation
    cv_results = model_selection.cross_validate(alg, X_train, y_train, cv=5,
                                                return_train_score=True)
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Min'] = cv_results['test_score'].min()  # the worst that can happen!
    row_index += 1

MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False, inplace=True)
MLA_compare
# model_selection.cross_validate is preferable to the old cross_validation.cross_val_score
4.1.1 Parameter Tuning
param_grid = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [None, 2, 4, 6, 8, 10],
              'min_samples_split': [5, 10, 15, 20, 25],
              'max_features': [None, 'auto', 'sqrt', 'log2']}
tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid,
                                          scoring='accuracy', cv=5)
cv_results = model_selection.cross_validate(tune_model, X_train, y_train, cv=5,
                                            return_train_score=True)
print(tune_model.get_params())  # configuration only; cross_validate fits clones internally
print(cv_results['train_score'].mean())
print(cv_results['test_score'].mean())
print(cv_results['test_score'].min())
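Note that cross_validate fits clones of the grid search, so the snippet above never exposes the chosen hyper-parameters. To inspect them, the search can be fitted directly (a sketch):
tune_model.fit(X_train, y_train)
print(tune_model.best_params_)  # the winning parameter combination
print(tune_model.best_score_)   # its mean cross-validated accuracy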
# plot each algorithm's accuracy for comparison
sns.barplot(x='MLA Test Accuracy Mean', y = 'MLA Name', data = MLA_compare, color = 'm')
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')
4.1.2 Predictions
# from the comparison above, the strongest models are
# BaggingClassifier, GradientBoostingClassifier, RidgeClassifierCV, LogisticRegression,
# RandomForestClassifier, AdaBoostClassifier and LogisticRegressionCV
MLA_best = [
#Ensemble Methods
ensemble.AdaBoostClassifier(), # 0.76076
ensemble.BaggingClassifier(), # 0.72248
ensemble.GradientBoostingClassifier(), # 0.73684
ensemble.RandomForestClassifier(n_estimators = 60), # 0.72727
#GLM
linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6), # 0.77990
linear_model.RidgeClassifierCV(), # 0.77033
linear_model.LogisticRegressionCV() #0.77033
]
for alg in MLA_best:
    algname = alg.__class__.__name__
    alg.fit(X_train, y_train)
    predictions = alg.predict(X_test)
    result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                           'Survived': predictions.astype(np.int32)})
    result.to_csv(algname + ".csv", index=False)
4.2 Model Ensembling
4.2.1 Ensemble 1: Simple Voting
voting_est = [
#Ensemble Methods
('ada', ensemble.AdaBoostClassifier()),
('bc', ensemble.BaggingClassifier()),
('etc',ensemble.ExtraTreesClassifier()),
('gbc', ensemble.GradientBoostingClassifier()),
('rfc', ensemble.RandomForestClassifier(n_estimators = 100)),
#Gaussian Processes
('gpc', gaussian_process.GaussianProcessClassifier()),
#GLM - remove linear models, since this is a classifier algorithm
('lrcv', linear_model.LogisticRegressionCV()),
('lr', linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)),
#('pac', linear_model.PassiveAggressiveClassifier()),
# ('rc', linear_model.RidgeClassifierCV()),  # breaks voting='soft': no 'predict_proba' attribute
#('sgd', linear_model.SGDClassifier()),
#('pct', linear_model.Perceptron()),
#Naive Bayes
# ('gnb', naive_bayes.GaussianNB()),
#Nearest Neighbor
('knn', neighbors.KNeighborsClassifier(n_neighbors = 3)),
#SVM
('svc', svm.SVC(probability=True)),
#('lsvc', svm.LinearSVC()),
#Trees
('dtc', tree.DecisionTreeClassifier()),
('etc2', tree.ExtraTreeClassifier()),
]
#Hard Vote or majority rules
voting_hard = ensemble.VotingClassifier(estimators=voting_est, voting='hard')
voting_hard_cv = model_selection.cross_validate(voting_hard, X_train, y_train, cv=5, return_train_score=True)
voting_hard.fit(X_train, y_train)
print("Hard Voting Training w/bin score mean: {:.2f}".format(voting_hard_cv['train_score'].mean()*100))
print("Hard Voting Test w/bin score mean: {:.2f}".format(voting_hard_cv['test_score'].mean()*100))
print("Hard Voting Test w/bin score min: {:.2f}".format(voting_hard_cv['test_score'].min()*100))
# with voting='hard', the ensemble takes a majority vote over the predicted class labels;
# with voting='soft', it predicts the argmax of the summed class probabilities,
# which is recommended for an ensemble of well-calibrated classifiers
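A tiny numeric illustration of the difference, with made-up probabilities (not part of the pipeline):
# three classifiers' predicted probabilities for class 1 on one sample
p = np.array([0.9, 0.4, 0.45])
hard = int((p > 0.5).sum() > len(p) / 2)  # labels are [1, 0, 0], so the majority vote is 0
soft = int(p.mean() > 0.5)                # mean probability is 0.583, so the soft vote is 1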
#Soft Vote or weighted probabilities
voting_soft = ensemble.VotingClassifier(estimators=voting_est, voting='soft')
voting_soft_cv = model_selection.cross_validate(voting_soft, X_train, y_train, cv=5, return_train_score=True)
voting_soft.fit(X_train, y_train)
print("Soft Voting Training w/bin score mean: {:.2f}".format(voting_soft_cv['train_score'].mean()*100))
print("Soft Voting Test w/bin score mean: {:.2f}".format(voting_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score min: {:.2f}".format(voting_soft_cv['test_score'].min()*100))
predictions = voting_soft.predict(X_test)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv("voting_soft.csv", index=False) # 0.73684
predictions = voting_hard.predict(X_test)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv("voting_hard.csv", index=False) # 0.75119
4.2.2 Ensemble 2: Stacking
Layer 1: each base model produces out-of-fold predictions for the training set and fold-averaged predictions for the test set; those prediction columns become the features for a second-level model.
from sklearn.model_selection import KFold

ntrain = train.shape[0]  # 891
ntest = test.shape[0]  # 418
SEED = 0  # for reproducibility
NFOLDS = 5  # set folds for out-of-fold prediction
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)  # shuffle is required when a random_state is set
# a thin wrapper that gives every sklearn estimator the same small interface
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        importances = self.clf.fit(x, y).feature_importances_
        print(importances)
        return importances
# compute out-of-fold predictions with 5-fold cross-validation
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
        # each training row is predicted exactly once, by the model that never saw it
        oof_train[test_index] = clf.predict(x_te)
        # the full test set is predicted once per fold
        oof_test_skf[i, :] = clf.predict(x_test)
    # average the per-fold test predictions
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)  # as single-column arrays
# parameter settings for the four base learners
# Random Forest parameters
rf_params = {
'n_jobs': -1,'n_estimators': 500,'warm_start': True, 'max_depth': 6,'min_samples_leaf': 2,
'max_features' : 'sqrt','verbose': 0#'max_features': 0.2,
}
# Extra Trees Parameters
et_params = {
'n_jobs': -1,'n_estimators':500,'max_depth': 8,'min_samples_leaf': 2,'verbose': 0
#'max_features': 0.5,
}
# AdaBoost parameters
ada_params = {
'n_estimators': 500,'learning_rate' : 0.75
}
# Gradient Boosting parameters
gb_params = {
'n_estimators': 500,'max_depth': 5,'min_samples_leaf': 2, 'verbose': 0
#'max_features': 0.2,
}
# Support Vector Classifier parameters
# svc_params = {
# 'kernel' : 'linear','C' : 0.025
# }
# instantiate the four base learners
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
X_train = X_train.values  # convert to plain NumPy arrays so positional fold indexing works
X_test = X_test.values
y_train = y_train.values
# compute each base learner's out-of-fold predictions; these feed the second layer of the stack
et_oof_train, et_oof_test = get_oof(et, X_train, y_train, X_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,X_train, y_train, X_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, X_train, y_train, X_test) # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb,X_train, y_train, X_test) # Gradient Boost
# feature importances as computed by each base learner
rf_feature = rf.feature_importances(X_train,y_train)
et_feature = et.feature_importances(X_train, y_train)
ada_feature = ada.feature_importances(X_train, y_train)
gb_feature = gb.feature_importances(X_train,y_train)
cols = alldata.columns.values
# Create a dataframe with features
feature_dataframe = pd.DataFrame( {'features': cols,
'Random Forest feature importances': rf_feature,
'Extra Trees feature importances': et_feature,
'AdaBoost feature importances': ada_feature,
'Gradient Boost feature importances': gb_feature
})
feature_dataframe
Plot each model's feature importances as interactive scatter charts with plotly's go module (the usual notebook setup is included below):
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
# Scatter plot
trace = go.Scatter(
y = feature_dataframe['Random Forest feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',sizeref = 1,size = 25,colorscale='Portland',showscale=True,
color = feature_dataframe['Random Forest feature importances'].values,
# size = feature_dataframe['AdaBoost feature importances'].values,
# color = np.random.randn(500)#set color equal to a variable
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Random Forest Feature Importance',
hovermode= 'closest',
# xaxis= dict(  # omitted so the x-axis shows no title
# title= 'Pop',
# ticklen= 5,
# zeroline= False,
# gridwidth= 2,
# ),
yaxis=dict(
title= 'Feature Importance',
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter20170207')
# Scatter plot
trace = go.Scatter(
y = feature_dataframe['Extra Trees feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',sizeref = 1,size = 25,colorscale='Portland',showscale=True,
color = feature_dataframe['Extra Trees feature importances'].values,
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Extra Trees Feature Importance',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',ticklen= 5,gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter20170207_2')
# Scatter plot
trace = go.Scatter(
y = feature_dataframe['AdaBoost feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',sizeref = 1,size = 25,colorscale='Portland',showscale=True,
color = feature_dataframe['AdaBoost feature importances'].values,
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'AdaBoost feature importances',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',ticklen= 5,gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter20170207_3')
# Scatter plot
trace = go.Scatter(
y = feature_dataframe['Gradient Boost feature importances'].values,
x = feature_dataframe['features'].values,
mode='markers',
marker=dict(
sizemode = 'diameter',sizeref = 1,size = 25,colorscale='Portland',showscale=True,
color = feature_dataframe['Gradient Boost feature importances'].values,
),
text = feature_dataframe['features'].values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Gradient Boost feature importances',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',ticklen= 5,gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter20170207_4')
feature_dataframe['mean'] = feature_dataframe.mean(axis=1, numeric_only=True)
feature_dataframe.head(3)
Plot the mean importance of each feature:
y = feature_dataframe['mean'].values
x = feature_dataframe['features'].values
data = [go.Bar(
x= x,
y= y,
width = 0.5,
marker=dict(
color = feature_dataframe['mean'].values,
colorscale='Portland',
showscale=True,
reversescale = False
),
opacity=0.6
)]
layout= go.Layout(
autosize= True,
title= 'Barplots of Mean Feature Importance',
hovermode= 'closest',
yaxis=dict(
title= 'Feature Importance',
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')
Layer 2: train a second-level learner on the layer-1 out-of-fold predictions.
base_predictions_train = pd.DataFrame({
    'RandomForest': rf_oof_train.ravel(),  # ravel flattens each column vector back to 1-D (row-major)
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel()
})
base_predictions_train.head()
Correlation between the base models' out-of-fold predictions; the less correlated they are, the more a second layer stands to gain from combining them:
data = [
go.Heatmap(
z= base_predictions_train.astype(float).corr().values ,
x=base_predictions_train.columns.values,
y= base_predictions_train.columns.values,
colorscale='Viridis',
showscale=True,
reversescale = True
)
]
py.iplot(data, filename='labelled-heatmap')
# the SVC base learner was commented out above, so only the four trained models are stacked
X_train2 = np.concatenate((et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
X_test2 = np.concatenate((et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)
Use XGBoost as the second-level learner:
import xgboost as xgb
gbm = xgb.XGBClassifier(
#learning_rate = 0.02,
n_estimators= 2000,
max_depth= 4,
min_child_weight= 2,
#gamma=1,
gamma=0.9,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread= -1,
scale_pos_weight=1).fit(X_train2, y_train)
predictions = gbm.predict(X_test2)
# Generate Submission File
StackingSubmission = pd.DataFrame({'PassengerId':test.PassengerId, 'Survived': predictions })
StackingSubmission.to_csv("StackingSubmission.csv", index=False) # 0.77990