# Kaggle Titanic solution (reaches roughly a top-4% leaderboard rank); core code adapted from
# https://www.kaggle.com/jianghuiying/titanic/titanic-random-forest-82-78-0465a3/editnb
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# --- Exploratory analysis: load the raw data and inspect each feature's
# relationship with the Survived target.  The bare expression statements
# below are notebook-style: they only display their result; nothing is stored.
train=pd.read_csv('../input/train.csv')
test=pd.read_csv('../input/test.csv')
# Class balance of the target.
train['Survived'].value_counts(normalize=True)
# The survival rate clearly shifts across Pclass levels, which shows this
# variable is related to the prediction target.
train['Survived'].groupby(train['Pclass']).mean()
# For strings containing ',' and '.' we can split off the piece we want.
train['Name_Title']=train['Name'].map(lambda x:x.split(',')[1]).map(lambda x:x.split()[0])
train['Name_Title'].value_counts()
# Statements like the next one are key: they show how each value of a feature
# relates to the target — some values matter a lot, others do not.
# We can derive many sub-variables and later rank them by importance; finer
# splits may work better.  So data cleaning starts with (1) finding missing
# values and deciding how to fill them, and (2) examining every feature value
# against the target.
train['Survived'].groupby(train['Name_Title']).mean()
#train['Survived'].groupby(pd.qcut(train['Name_Len'],5)).mean()
# Discrete variables can be grouped directly and averaged; continuous ones
# must first be discretized into intervals and then grouped.
# qcut bins data by sample quantiles; it operates on Series, arrays, etc.
train['Name_Len']=train['Name'].map(lambda x:len(x))
train['Survived'].groupby(pd.qcut(train['Name_Len'],5)).mean()
pd.qcut(train['Name_Len'],5).value_counts()
train['Sex'].value_counts()
train['Survived'].groupby([train['Sex']]).mean()
# First compare survival for rows with vs. without a recorded age.
train['Survived'].groupby(train['Age'].isnull()).mean()
train['Survived'].groupby(pd.qcut(train['Age'],5)).mean()
pd.qcut(train['Age'],5).value_counts()
train['Survived'].groupby(train['SibSp']).mean()
train['SibSp'].value_counts()
train['Survived'].groupby(train['Parch']).mean()
train['Parch'].value_counts()
# Ticket string length and first character as crude engineered features.
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
train['Ticket_Len'].value_counts()
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
train['Ticket_Lett'].value_counts()
train.groupby(['Ticket_Lett'])['Survived'].mean()
train['Survived'].groupby(pd.qcut(train['Fare'], 3)).mean()
# crosstab builds a contingency table relating train['Fare'] and
# train['Pclass'] — useful when two variables might encode the same
# information.  Equivalent form:
# pd.crosstab(pd.qcut(train['Fare'], 5), train['Pclass'])
pd.crosstab(pd.qcut(train['Fare'], 5), columns=train['Pclass'])
train['Cabin_Letter'] = train['Cabin'].apply(lambda x: str(x)[0])
train['Cabin_Letter'].value_counts()
train['Survived'].groupby(train['Cabin_Letter']).mean()
# Cabin has ~700 missing values but still carries signal.  For a value like
# 'C123' a larger trailing number might mean a higher chance of survival, so
# extract Cabin_num.  (str(nan) is 'nan'; after dropping the first character
# the residue 'an' marks missing cabins and is mapped back to NaN.)
train['Cabin_num'] = train['Cabin'].apply(lambda x: str(x).split(' ')[-1][1:])
train['Cabin_num'].replace('an', np.NaN, inplace = True)
train['Cabin_num'] = train['Cabin_num'].apply(lambda x: int(x) if not pd.isnull(x) and x != '' else np.NaN)
train['Survived'].groupby(pd.qcut(train['Cabin_num'], 3)).mean()
train['Survived'].corr(train['Cabin_num'])
train['Survived'].groupby(train['Embarked']).mean()
def names(train, test):
    """Derive Name_Len and Name_Title from the raw Name column, then drop Name.

    Name_Title is the first whitespace-separated token after the comma,
    e.g. "Braund, Mr. Owen Harris" -> "Mr.".  Both frames are mutated in
    place and returned.
    """
    for frame in (train, test):
        frame['Name_Len'] = frame['Name'].apply(len)
        frame['Name_Title'] = frame['Name'].apply(
            lambda full_name: full_name.split(',')[1].split()[0])
        del frame['Name']
    return train, test
def age_impute(train, test):
    """Fill missing ages with the train-set mean age of the passenger's
    (Name_Title, Pclass) group, and flag which rows were originally missing.

    Bug fix vs. the original: it assigned the transform of *train*'s groupby
    to both frames; pandas aligns that Series on the row index, so test rows
    had their ages silently overwritten with train ages.  Group means are now
    learned on train only and used to fill each frame's own missing values.
    Groups unseen in train (or with no known age) are left as NaN, matching
    the original behavior on the train frame.
    """
    # Compute the statistics once, before any filling, so both frames use the
    # same train-derived means.
    group_means = train.groupby(['Name_Title', 'Pclass'])['Age'].mean()
    for frame in (train, test):
        frame['Age_Null_Flag'] = frame['Age'].isnull().astype(int)
        keys = pd.MultiIndex.from_arrays([frame['Name_Title'], frame['Pclass']])
        fill = pd.Series(group_means.reindex(keys).values, index=frame.index)
        frame['Age'] = frame['Age'].fillna(fill)
    return train, test
def fam_size(train, test):
    """Collapse SibSp + Parch into a three-level family-size category.

    0 relatives -> 'Solo', 1-3 -> 'Nuclear', 4 or more -> 'Big'.
    SibSp and Parch are dropped afterwards; frames are mutated in place.
    """
    for frame in (train, test):
        relatives = frame['SibSp'] + frame['Parch']
        frame['Fam_Size'] = np.where(
            relatives == 0, 'Solo',
            np.where(relatives <= 3, 'Nuclear', 'Big'))
        del frame['SibSp']
        del frame['Parch']
    return train, test
def ticket_grouped(train, test):
    """Bucket the ticket's first character and record the ticket length.

    First characters in the 'common' set are kept as-is; those in the 'rare'
    set collapse to 'Low_ticket'; anything else becomes 'Other_ticket'.
    The raw Ticket column is dropped; frames are mutated in place.
    """
    common = {'1', '2', '3', 'S', 'P', 'C', 'A'}
    rare = {'W', '4', '7', '6', 'L', '5', '8'}
    for frame in (train, test):
        first_char = frame['Ticket'].map(lambda t: str(t)[0])
        frame['Ticket_Lett'] = np.where(
            first_char.isin(common), first_char,
            np.where(first_char.isin(rare), 'Low_ticket', 'Other_ticket'))
        frame['Ticket_Len'] = frame['Ticket'].map(len)
        del frame['Ticket']
    return train, test
def cabin(train, test):
    """Reduce Cabin to its first character and drop the raw column.

    Missing cabins become 'n' because str(nan) == 'nan'.  Frames are
    mutated in place and returned.
    """
    for frame in (train, test):
        frame['Cabin_Letter'] = frame['Cabin'].map(lambda c: str(c)[0])
        del frame['Cabin']
    return train, test
def cabin_num(train, test):
    """Extract the numeric part of Cabin (e.g. 'C123' -> 123), bucket it into
    terciles learned on the train set, and one-hot encode the bucket.

    Bug fix vs. the original: the test bucket column was assigned from
    pd.qcut(train['Cabin_num1'], 3); pandas aligns that Series on the row
    index, so each test row silently received the bucket of the train row
    sharing its index.  The tercile edges are now learned on train once and
    each frame's own values are cut with those edges.  Also uses np.nan
    (np.NaN was removed in NumPy 2.0) and avoids chained inplace replace.
    """
    for i in [train, test]:
        # Last whitespace-separated token minus its deck letter: 'C123' -> '123'.
        i['Cabin_num1'] = i['Cabin'].apply(lambda x: str(x).split(' ')[-1][1:])
        # str(np.nan) == 'nan', so missing cabins surface as the residue 'an'.
        i['Cabin_num1'] = i['Cabin_num1'].replace('an', np.nan)
        i['Cabin_num1'] = i['Cabin_num1'].apply(
            lambda x: int(x) if not pd.isnull(x) and x != '' else np.nan)
    # Tercile edges from train only; both frames are cut with the SAME edges so
    # their dummy columns share identical interval labels.  Test values outside
    # the train range become NaN and therefore all-zero dummy rows.
    edges = pd.qcut(train['Cabin_num1'], 3, retbins=True)[1]
    for i in [train, test]:
        i['Cabin_num'] = pd.cut(i['Cabin_num1'], bins=edges, include_lowest=True)
    train = pd.concat((train, pd.get_dummies(train['Cabin_num'], prefix = 'Cabin_num')), axis = 1)
    test = pd.concat((test, pd.get_dummies(test['Cabin_num'], prefix = 'Cabin_num')), axis = 1)
    del train['Cabin_num']
    del test['Cabin_num']
    del train['Cabin_num1']
    del test['Cabin_num1']
    return train, test
def embarked_impute(train, test):
    """Fill missing Embarked values with 'S' in both frames.

    'S' is presumably the modal embarkation port — confirm against the data.
    Frames are mutated in place and returned.
    """
    for frame in (train, test):
        frame['Embarked'] = frame['Embarked'].fillna('S')
    return train, test
# NOTE(review): this fills Fare on the exploratory copy of `test`; the
# pipeline below re-reads the CSVs and repeats this fill, so this line has no
# effect on the final model.
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
def dummies(train, test,
            columns=('Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                     'Cabin_Letter', 'Name_Title', 'Fam_Size')):
    """One-hot encode `columns`, keeping only dummy columns whose category
    value appears in BOTH frames, so train and test end up with identical
    feature columns.

    Values are stringified first; dummy columns follow train's order of first
    appearance; the original columns are dropped.  Returns the new frames.
    Improvements vs. the original: immutable default for `columns`, and a
    set for membership tests instead of re-scanning the test uniques ndarray
    once per train category (O(n^2) -> O(n)).
    """
    for column in columns:
        train[column] = train[column].astype(str)
        test[column] = test[column].astype(str)
        test_values = set(test[column].unique())
        good_cols = [column + '_' + value
                     for value in train[column].unique()
                     if value in test_values]
        train = pd.concat((train, pd.get_dummies(train[column], prefix = column)[good_cols]), axis = 1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix = column)[good_cols]), axis = 1)
        del train[column]
        del test[column]
    return train, test
def drop(train, test):
    """Remove the PassengerId identifier column from both frames in place."""
    for frame in (train, test):
        frame.drop(columns=['PassengerId'], inplace=True)
    return train, test
# --- Feature-engineering pipeline: re-read the raw data, apply all steps,
# then fit a random forest and write the submission file. ---
train = pd.read_csv(os.path.join('../input', 'train.csv'))
test = pd.read_csv(os.path.join('../input', 'test.csv'))
train, test = names(train, test)
train, test = age_impute(train, test)
# cabin_num must run before cabin(): cabin() deletes the Cabin column.
train, test = cabin_num(train, test)
train, test = cabin(train, test)
train, test = embarked_impute(train, test)
train, test = fam_size(train, test)
# Fill the missing test Fare value(s) with the train mean.
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = ticket_grouped(train, test)
train, test = dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                              'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = drop(train, test)
# NOTE(review): the next two expressions are notebook-style displays of the
# design matrix (all columns after Survived) and the target column; outside a
# notebook they are no-ops.
train.iloc[:, 1:]
train.iloc[:, 0]
from sklearn.ensemble import RandomForestClassifier
# oob_score=True gives a built-in out-of-bag validation estimate, printed below.
# NOTE(review): max_features='auto' is deprecated and removed in recent
# scikit-learn; for classifiers it meant 'sqrt' — confirm the installed version.
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=700,
                            min_samples_split=10,
                            min_samples_leaf=1,
                            max_features='auto',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)
# Column 0 is Survived (target); everything after it is the feature matrix.
rf.fit(train.iloc[:, 1:], train.iloc[:, 0])
print("%.4f" % rf.oob_score_)
# Top-20 features by random-forest importance (display only).
pd.concat((pd.DataFrame(train.iloc[:, 1:].columns, columns = ['variable']),
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])),
          axis = 1).sort_values(by='importance', ascending = False)[:20]
predictions = rf.predict(test)
predictions = pd.DataFrame(predictions, columns=['Survived'])
# Re-read the raw test file to recover PassengerId (dropped by drop()) for
# the submission's first column.
test = pd.read_csv(os.path.join('../input', 'test.csv'))
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv('y_test15.csv', sep=",", index = False)