第一篇:https://blog.csdn.net/Nicht_sehen/article/details/89741145
这一篇着重记录模型选择问题,不会过多做特征工程
数据处理
- 异(与第一篇的不同之处):
  - 增加 FamilySize 表示家庭人数,增加 IsAlone 表示是否独自一人
  - drop 掉了 'PassengerId'、'Cabin'、'Ticket'
- 同(与第一篇的相同之处):
  - Age、Embarked、Fare、Agecut、Farecut、Identify 的处理与第一篇相同
# Load the raw training and test sets. Both frames go through the same
# cleaning and feature-building steps below.
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")
data_cleaner = [df_train, df_test]

for dataset in data_cleaner:
    # Fill missing values with statistics of the frame being processed.
    # The result is assigned back to the column instead of calling
    # fillna(inplace=True) on a column selection: the in-place form acts on
    # an intermediate object and does not update the frame under pandas
    # Copy-on-Write.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    # Title is the text between ", " and the first "." in Name.
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    # Fare is cut into 4 quantile bins, Age (truncated to int) into 5
    # equal-width bins.
    # NOTE(review): bin edges are computed per frame, so the intervals of
    # df_train and df_test are not guaranteed to match.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

    # FamilySize counts the passenger plus siblings/spouses and
    # parents/children.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # IsAlone defaults to 1 and is cleared for anyone with company. A single
    # .loc call on the frame replaces the chained
    # dataset['IsAlone'].loc[...] = 0, which writes to a temporary object.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

# data1 is a working copy of the training frame without the columns that are
# not used as features.
drop_column = ['PassengerId', 'Cabin', 'Ticket']
data1 = df_train.copy()
data1.drop(drop_column, axis=1, inplace=True)

# DataFrame.info() prints its own summary and returns None, so wrapping it in
# print() only added a stray "None" line.
# NOTE(review): the source's indentation was lost; this assumes both
# summaries ran after the loop, where `dataset` is still bound to the last
# item of data_cleaner (df_test).
dataset.info()
data1.info()
使用LabelEncoder对特征值进行编码
label = LabelEncoder()


def _add_label_codes(frame, encoder):
    """Add a ``<column>_Code`` column for each categorical column of *frame*.

    The encoder is refit on every column, so the codes are specific to the
    frame and column they were computed from.
    """
    for source in ('Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin'):
        frame[source + '_Code'] = encoder.fit_transform(frame[source])


# Encode the training and test frames first ...
for dataset in data_cleaner:
    _add_label_codes(dataset, label)

# ... then data1, which is a separate copy and needs its own code columns.
_add_label_codes(data1, label)
挑选特征并进行one-hot编码
# Name of the target column, kept as a one-element list.
Target = [
    'Survived',
]

# Feature columns in their raw (not label-encoded) form.
data1_x = [
    'Sex',
    'Pclass',
    'Embarked',
    'Title',
    'SibSp',
    'Parch',
    'Age',
    'Fare',
    'FamilySize',
    'IsAlone',
]
data1_x_calc = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp'