# Annotated code for the Kaggle Titanic getting-started exercise. Original article: http://ahmedbesbes.com/how-to-score-08134-in-titanic-kaggle-challenge.html
import warnings
warnings.filterwarnings('ignore')
#该行是指在notebook下将图片内嵌在交互窗口,而不是弹出一个窗口
%matplotlib inline
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
# Load the training set and preview its first rows.
data = pd.read_csv('../input/train.csv')
data.head()
# describe() summarises each numeric feature (count, mean, std, ...); a count
# lower than the number of rows reveals which features have missing values.
data.describe()
# Age has missing values: fill them with the median, which is more robust to
# outliers than the mean.  The result is assigned back to the column instead
# of calling fillna(inplace=True) on the column selection, because with pandas
# Copy-on-Write the chained in-place call only updates a temporary object and
# leaves `data` unchanged.
data['Age'] = data['Age'].fillna(data['Age'].median())
data.describe()
# value_counts() tallies how often each value of a column occurs.  For a
# discrete feature, a stacked bar chart of those tallies shows how the feature
# relates to the prediction target.
survived_sex = data.loc[data['Survived'] == 1, 'Sex'].value_counts()
dead_sex = data.loc[data['Survived'] == 0, 'Sex'].value_counts()
# One row per outcome, one column per sex.
df = pd.DataFrame([survived_sex, dead_sex])
df.index = ['Survived', 'Dead']
df.plot(kind='bar', stacked=True, figsize=(13, 8))
# Continuous features are inspected with histograms instead.  First Age:
# create the figure, then stack survivors (green) and victims (red).
figure = plt.figure(figsize=(13, 8))
plt.hist(
    [data.loc[data['Survived'] == 1, 'Age'],
     data.loc[data['Survived'] == 0, 'Age']],
    bins=30,
    stacked=True,
    color=['g', 'r'],
    label=['Survived', 'Dead'],
)
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.legend()
# Same approach for the continuous Fare feature against the target.
figure = plt.figure(figsize=(13, 8))
plt.hist(
    [data.loc[data['Survived'] == 1, 'Fare'],
     data.loc[data['Survived'] == 0, 'Fare']],
    bins=30,
    stacked=True,
    color=['g', 'r'],
    label=['Survived', 'Dead'],
)
plt.xlabel('Fare')
plt.ylabel('Number of passengers')
plt.legend()
# Set the figure size, then scatter Age against Fare: survivors in green,
# victims in red.
plt.figure(figsize=(13, 8))
ax = plt.subplot()
ax.scatter(data[data['Survived'] == 1]['Age'], data[data['Survived'] == 1]['Fare'], c='green', s=40)
ax.scatter(data[data['Survived'] == 0]['Age'], data[data['Survived'] == 0]['Fare'], c='red', s=40)
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
# The legend labels follow the order of the two scatter calls above.
# (A stray trailing backtick after this call made the line a syntax error.)
ax.legend(('survived', 'dead'), scatterpoints=1, loc='upper left', fontsize=15)
# A scatter plot relates two features to the target at once, which makes it
# possible to look at combinations of variables rather than one at a time.
plt.figure(figsize=(13, 8))
ax = plt.subplot()
# Survivors first (green), then victims (red); the legend relies on this order.
ax.scatter(
    data.loc[data['Survived'] == 1, 'Age'],
    data.loc[data['Survived'] == 1, 'Fare'],
    c='green',
    s=40,
)
ax.scatter(
    data.loc[data['Survived'] == 0, 'Age'],
    data.loc[data['Survived'] == 0, 'Fare'],
    c='red',
    s=40,
)
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
ax.legend(('Survived', 'Dead'), scatterpoints=1, loc='upper right', fontsize=15)
# Relate a continuous feature to a discrete one with groupby: split the rows
# by the discrete Pclass value and plot the mean Fare of every group.
ax = plt.subplot()
ax.set_ylabel('Average fare')
# Select 'Fare' before aggregating: taking the mean of the whole frame raises
# on the string columns (Name, Sex, Ticket, ...) in pandas >= 2.0.
data.groupby('Pclass')['Fare'].mean().plot(kind='bar', figsize=(13, 8), ax=ax)
# Stacked bar chart of the embarkation port, split by outcome.
survived_embark = data.loc[data['Survived'] == 1, 'Embarked'].value_counts()
dead_embark = data.loc[data['Survived'] == 0, 'Embarked'].value_counts()
# One row per outcome, one column per port.
df = pd.DataFrame([survived_embark, dead_embark])
df.index = ['Survived', 'Dead']
df.plot(kind='bar', stacked=True, figsize=(13, 8))
def status(feature):
    """Print a one-line progress note for a processed feature.

    Args:
        feature: Label of the feature that has just been processed.
    """
    fields = ('processing', feature, ':ok')
    print(*fields)
def get_combined_data(train_path='../input/train.csv', test_path='../input/test.csv'):
    """Load the train and test sets and stack them into one feature table.

    The target column ``Survived`` is removed from the training rows so that
    both parts share the same columns before they are concatenated.

    Args:
        train_path: CSV file holding the training set (with ``Survived``).
        test_path: CSV file holding the test set.

    Returns:
        A DataFrame with the training rows followed by the test rows, indexed
        0..n-1.
    """
    # Read the data.
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # Drop the target so the train and test column names match.
    train = train.drop('Survived', axis=1)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # ignore_index=True renumbers the rows, so no reset_index is needed.
    combined = pd.concat([train, test], ignore_index=True)
    return combined
# Build the module-level `combined` table that the process_* functions below
# read and update through `global combined`.
combined=get_combined_data()
# Bare expression: only has a visible effect (showing the frame) when this is
# the last statement of a notebook cell.
combined
def get_titles():
    """Add a ``Title`` column to the global ``combined`` frame.

    The raw title is the text between the first comma and the following
    period of ``Name``; it is then mapped to a coarser category.  Raw titles
    that are not listed below end up as NaN.
    """
    global combined

    def extract_title(full_name):
        # "Surname, Title. Given names" -> "Title"
        after_surname = full_name.split(',')[1]
        return after_surname.split('.')[0].strip()

    combined['Title'] = combined['Name'].map(extract_title)

    # Aggregated category -> raw titles that belong to it.
    categories = {
        "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
        "Royalty": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
        "Mrs": ("Mme", "Ms", "Mrs"),
        "Miss": ("Mlle", "Miss"),
        "Mr": ("Mr",),
        "Master": ("Master",),
    }
    # Invert into the raw title -> category lookup used by map().
    title_to_category = {
        raw: category
        for category, raw_titles in categories.items()
        for raw in raw_titles
    }

    combined['Title'] = combined['Title'].map(title_to_category)
# Derive the Title column, then preview the result.
get_titles()
combined.head()
# Median of every numeric column per (Sex, Pclass, Title) group.
grouped = combined.groupby(['Sex', 'Pclass', 'Title'])
# numeric_only=True skips the remaining string columns (Name, Ticket, Cabin,
# Embarked), on which median() raises in pandas >= 2.0.
grouped.median(numeric_only=True)
def process_age():
    """Fill missing ``Age`` values in the global ``combined`` frame.

    A missing age is replaced by a fixed value chosen from the passenger's
    (Sex, Pclass, Title) combination.  Combinations that are not listed keep
    a missing age.  Ages that are already present are left untouched.
    """
    global combined

    # (sex, passenger class, title) -> replacement age.
    # NOTE(review): presumably the per-group medians computed earlier in the
    # file -- confirm before changing any value.
    replacement_age = {
        ('female', 1, 'Miss'): 30,
        ('female', 1, 'Mrs'): 45,
        ('female', 1, 'Officer'): 49,
        ('female', 1, 'Royalty'): 39,
        ('female', 2, 'Miss'): 20,
        ('female', 2, 'Mrs'): 30,
        ('female', 3, 'Miss'): 18,
        ('female', 3, 'Mrs'): 31,
        ('male', 1, 'Master'): 6,
        ('male', 1, 'Mr'): 41.5,
        ('male', 1, 'Officer'): 52,
        ('male', 1, 'Royalty'): 40,
        ('male', 2, 'Master'): 2,
        ('male', 2, 'Mr'): 30,
        ('male', 2, 'Officer'): 41.5,
        ('male', 3, 'Master'): 6,
        ('male', 3, 'Mr'): 26,
    }

    def impute(row):
        age = row['Age']
        if not np.isnan(age):
            return age
        # .get() yields None for an unlisted combination.
        return replacement_age.get((row['Sex'], row['Pclass'], row['Title']))

    combined['Age'] = combined.apply(impute, axis=1)
    status('age')
# Pipeline step: fill the missing ages in `combined`.
process_age()
def process_names():
    """Replace ``Name`` and ``Title`` in ``combined`` by title dummy columns.

    Drops the ``Name`` column, one-hot encodes ``Title`` into ``Title_*``
    columns and then drops ``Title`` itself.  Rebinds the global ``combined``.
    """
    global combined
    # The raw name is no longer needed once the title has been extracted.
    combined.drop('Name', axis=1, inplace=True)
    # Encode the title as dummy variables.
    titles_dummies = pd.get_dummies(combined['Title'], prefix='Title')
    combined = pd.concat([combined, titles_dummies], axis=1)
    # Remove the original title column.
    combined.drop('Title', axis=1, inplace=True)
    # (A stray trailing backtick after this call made the line a syntax error.)
    status('names')
# Pipeline step: drop Name and one-hot encode Title.
process_names()
def process_fares():
    """Fill missing ``Fare`` values in ``combined`` with the mean fare."""
    global combined
    # There is one missing fare value; replace it with the mean.  Assign the
    # filled column back: a chained `combined.Fare.fillna(..., inplace=True)`
    # only modifies a temporary object under pandas Copy-on-Write.
    combined['Fare'] = combined['Fare'].fillna(combined['Fare'].mean())
    status('fare')
# Pipeline step: fill the missing fare.
process_fares()
def process_embarked():
    """Fill and one-hot encode ``Embarked`` in the global ``combined`` frame.

    Missing ports become ``'S'``; the column is then replaced by
    ``Embarked_*`` dummy columns.  Rebinds the global ``combined``.
    """
    global combined
    # Two missing embarked values: fill them with the most frequent one (S).
    # Assign the result back; a chained in-place fillna on the column does not
    # reach `combined` under pandas Copy-on-Write.
    combined['Embarked'] = combined['Embarked'].fillna('S')
    # Dummy encoding.
    embarked_dummies = pd.get_dummies(combined['Embarked'], prefix='Embarked')
    combined = pd.concat([combined, embarked_dummies], axis=1)
    combined.drop('Embarked', axis=1, inplace=True)
    status('embarked')
# Pipeline step: fill and one-hot encode Embarked.
process_embarked()
def process_cabin():
    """Reduce ``Cabin`` to its first letter and one-hot encode it.

    Missing cabins become ``'U'``; every value is cut down to its first
    character and the column is replaced by ``Cabin_*`` dummy columns.
    Rebinds the global ``combined``.
    """
    global combined
    # Replace missing cabins with U (for Unknown).  The result is assigned
    # back: if a chained in-place fillna silently did nothing, the NaN values
    # would break the `c[0]` lookup below.
    combined['Cabin'] = combined['Cabin'].fillna('U')
    # Map each cabin value to its cabin letter.
    combined['Cabin'] = combined['Cabin'].map(lambda c: c[0])
    # Dummy encoding.
    cabin_dummies = pd.get_dummies(combined['Cabin'], prefix='Cabin')
    combined = pd.concat([combined, cabin_dummies], axis=1)
    combined.drop('Cabin', axis=1, inplace=True)
    status('cabin')
# Pipeline step: encode the cabin letter.
process_cabin()
def process_sex():
    """Encode ``Sex`` in ``combined`` as a number: male -> 1, female -> 0."""
    global combined
    # String value -> numeric code.
    sex_codes = {'male': 1, 'female': 0}
    combined['Sex'] = combined['Sex'].map(sex_codes)
    status('sex')
# Pipeline step: encode Sex numerically.
process_sex()
def process_pclass():
    """Replace ``Pclass`` in ``combined`` by ``Pclass_*`` dummy columns.

    Rebinds the global ``combined``.
    """
    global combined
    # One dummy column per passenger class.
    class_dummies = pd.get_dummies(combined['Pclass'], prefix="Pclass")
    # Append the dummies, then drop the column they were derived from.
    widened = pd.concat([combined, class_dummies], axis=1)
    combined = widened.drop('Pclass', axis=1)
    status('pclass')
# Pipeline step: one-hot encode Pclass.
process_pclass()
def process_ticket():
    """Reduce ``Ticket`` to its prefix and one-hot encode it.

    Every ticket is replaced by its first non-numeric token (after removing
    '.' and '/'), or by ``'XXX'`` when it has none.  The column is then
    replaced by ``Ticket_*`` dummy columns.  Rebinds the global ``combined``.
    """
    global combined

    def cleanTicket(ticket):
        # Remove '.' and '/', then split on whitespace into a list of tokens.
        tokens = ticket.replace('.', '').replace('/', '').split()
        # Keep the tokens that are not purely digits.  A list is built here
        # because on Python 3 filter() returns a lazy iterator, which supports
        # neither len() nor indexing.
        prefixes = [token for token in tokens if not token.isdigit()]
        # First remaining token, or 'XXX' when the ticket was purely numeric.
        return prefixes[0] if prefixes else 'XXX'

    combined['Ticket'] = combined['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(combined['Ticket'], prefix='Ticket')
    combined = pd.concat([combined, tickets_dummies], axis=1)
    combined.drop('Ticket', inplace=True, axis=1)
# Pipeline step: encode the ticket prefix.
process_ticket()
# A hand-made feature: the number of family members on board, on the
# assumption that people travelling as a family were more likely to be rescued.
def process_family():
    """Add family-size features to the global ``combined`` frame.

    ``FamilySize`` is ``Parch + SibSp + 1``.  Three 0/1 indicator columns are
    derived from it: ``Singleton`` (size 1), ``SmallFamily`` (2 to 4) and
    ``LargeFamily`` (5 or more).
    """
    global combined
    family_size = combined['Parch'] + combined['SibSp'] + 1
    combined['FamilySize'] = family_size
    # Boolean masks converted to 0/1 integers.
    combined['Singleton'] = (family_size == 1).astype(int)
    combined['SmallFamily'] = family_size.between(2, 4).astype(int)
    combined['LargeFamily'] = (family_size >= 5).astype(int)
    status('family')
# Pipeline step: add the family-size features.
process_family()
# Normalise all the features.
def scale_all_features():
    """Divide every column of ``combined`` except ``PassengerId`` by its maximum.

    The scaled columns are written back into the global ``combined`` frame as
    floats.
    """
    global combined
    features = list(combined.columns)
    features.remove('PassengerId')
    # Cast to float first: recent pandas versions emit boolean dummy columns
    # from get_dummies and refuse to divide a boolean column by a boolean.
    as_float = combined[features].astype(float)
    combined[features] = as_float.apply(lambda x: x / x.max(), axis=0)
    # print() call form: the Python 2 print statement is a syntax error on
    # Python 3.
    print('Features scaled successfully !')
# Pipeline step: scale every feature column of `combined` by its maximum.
scale_all_features()
# Build the model: a random forest whose hyper-parameters are chosen by a grid search that tries every candidate parameter combination and keeps the best one (see the GridSearchCV import below).
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
try:
    # scikit-learn >= 0.18: the cross-validation and grid-search helpers live
    # in sklearn.model_selection (the old modules were removed in 0.20).
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Legacy module layout.
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import cross_val_score
try:
    from sklearn.ensemble import GradientBoostingClassifier
except ImportError:
    from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
def compute_score(clf, X, y, scoring='accuracy'):
    """Return the mean 5-fold cross-validation score of ``clf`` on ``X``, ``y``.

    Args:
        clf: Estimator passed to ``cross_val_score``.
        X: Feature data.
        y: Target values.
        scoring: Scoring metric forwarded to ``cross_val_score``.

    Returns:
        The average of the per-fold scores.
    """
    fold_scores = cross_val_score(clf, X, y, cv=5, scoring=scoring)
    mean_score = np.mean(fold_scores)
    return mean_score
# Recover train, test and targets from the `combined` frame built earlier.
def recover_train_test_target(train_path='../input/train.csv'):
    """Split the global ``combined`` frame back into train and test parts.

    The targets are re-read from the training CSV; the first ``len(targets)``
    rows of ``combined`` are the training rows and the rest are test rows.

    Args:
        train_path: CSV file holding the original training set, including the
            ``Survived`` column.

    Returns:
        A ``(train, test, targets)`` tuple.
    """
    global combined
    train0 = pd.read_csv(train_path)
    targets = train0.Survived
    # Positional slicing replaces `.ix`, which was removed in pandas 1.0.  The
    # split point comes from the number of targets rather than a hard-coded
    # row number (890/891 for the standard Titanic files).
    n_train = len(targets)
    train = combined.iloc[:n_train]
    test = combined.iloc[n_train:]
    return train, test, targets
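# Feature selection: keep the features that are most related to the prediction target Survived and drop the weakly related, redundant ones. Feature selection also speeds up training and reduces overfitting.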
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# Split `combined` back into its parts; without this call `train`, `test` and
# `targets` are undefined at module level.
train, test, targets = recover_train_test_target()
# Fit a tree ensemble to obtain an importance score for every feature.
clf = ExtraTreesClassifier(n_estimators=200)
clf = clf.fit(train, targets)
features = pd.DataFrame()
features['feature'] = train.columns
features['importance'] = clf.feature_importances_
# Show the features sorted by descending importance.  sort_values replaces
# DataFrame.sort, which was removed from pandas.  The sorted frame is not
# stored; this is for inspection only.
features.sort_values(['importance'], ascending=False)
# Keep only the features that the fitted model selects, for both parts.
model = SelectFromModel(clf, prefit=True)
train_new = model.transform(train)
train_new.shape
test_new = model.transform(test)
test_new.shape
# Hyper-parameter tuning for the random forest
forest = RandomForestClassifier(max_features='sqrt')
# Candidate values tried by the grid search.  (A missing comma after the
# 'n_estimators' entry made this literal a syntax error.)
parameter_grid = {
    'max_depth': [4, 5, 6, 7, 8],
    'n_estimators': list(range(200, 300, 10)),
    'criterion': ['gini', 'entropy'],
}
# sklearn.model_selection API: the splitter takes only the number of folds;
# the labels are supplied later through fit().
cross_validation = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=cross_validation)
grid_search.fit(train_new, targets)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
# Predict the test set with the best estimator found by the grid search and
# write the submission file (PassengerId, Survived).
output = grid_search.predict(test_new).astype(int)
df_output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': output,
})
submission_columns = ['PassengerId', 'Survived']
df_output[submission_columns].to_csv('../input/output.csv', index=False)