kaggle入门比赛-泰坦尼克号生存预测（python）

最新推荐文章于 2022-09-22 13:57:24 发布

大浪淘沙_

最新推荐文章于 2022-09-22 13:57:24 发布

阅读量340

点赞数

分类专栏： python 文章标签： kaggle 数据分析

本文链接：https://blog.csdn.net/asd966521/article/details/116705877

版权

python 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

数据集获取路径：https://www.kaggle.com/c/titanic/data

不同的人在选取特征时会有不同的取舍。就此数据集而言，我认为所有特征都会影响最终结果，故保留了所有特征，即便有的特征空值极多。

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore", category=Warning)
from sklearn.feature_selection import VarianceThreshold
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import naive_bayes


# 1. Data preparation and cleaning
test_data = pd.read_csv('/Users/huangzhen/Desktop/titanic/test.csv')
train_data = pd.read_csv('/Users/huangzhen/Desktop/titanic/train.csv')
# Stack train and test so both go through exactly the same preprocessing.
full = pd.concat([train_data, test_data], ignore_index=True)

# Pull the honorific out of "Surname, Title. Given names". The text between
# the comma and the first period is kept as-is (including its leading space).
full['split_name'] = full['Name'].apply(lambda x: x.split(',')[1].split('.')[0])

# Titles of the passengers whose age is missing (captured before imputation).
a = full.loc[full['Age'].isnull(), 'split_name']

# Impute each missing age with the mean age of passengers sharing the same
# title. The previous loop called fillna() on the whole column, so its first
# iteration filled every gap with a single title's mean and the remaining
# iterations had nothing left to fill.
overall_mean_age = full['Age'].mean()
title_mean_age = full.groupby('split_name')['Age'].transform('mean')
full['Age'] = full['Age'].fillna(title_mean_age)
# Fallback for a title that has no known age at all.
full['Age'] = full['Age'].fillna(overall_mean_age)

# Results are assigned back rather than using fillna(inplace=True) on a
# column selection, which newer pandas versions warn about and which may not
# update the parent frame.
# Missing fares get the mean fare of passengers who embarked at 'S'.
full['Fare'] = full['Fare'].fillna(full.loc[full['Embarked'] == 'S', 'Fare'].mean())
# Missing embarkation ports default to 'S'.
full['Embarked'] = full['Embarked'].fillna('S')
# Author's note on deck letters observed per class:
#   3rd class: G, F, E    2nd class: F, D, E    1st class: C, B, D, E
#   (A and T also occur.)
def _random_deck(pclass):
    """Return a randomly chosen deck letter for the given passenger class."""
    if pclass == 3:
        return np.random.choice(['g', 'f'])
    if pclass == 2:
        return np.random.choice(['d', 'e'])
    return np.random.choice(['b', 'c'])


# Fill each missing cabin with a deck drawn for that passenger's own class.
# The previous loop called fillna() on the whole column, so its first
# iteration gave every missing cabin the same single letter, chosen from the
# first row's class only.
random_decks = full['Pclass'].apply(_random_deck)
full['Cabin'] = full['Cabin'].fillna(random_decks)

# Reduce every cabin value to its upper-case first character (the deck letter).
full['Cabin'] = [x[0].upper() for x in full['Cabin']]

# 2. Feature engineering: encode every feature, then scale and reduce.
# Rows [0, n_train) of `full` are the training rows, the rest the test rows
# (train_data was concatenated first). Derived from the data instead of the
# previously hard-coded 891.
n_train = len(train_data)


def _one_hot_column(column):
    """Label-encode then one-hot encode one column of `full`.

    Returns the fitted LabelEncoder, the integer codes, the fitted
    OneHotEncoder and the encoded matrix (one row per row of `full`).
    """
    label_encoder = LabelEncoder()
    codes = label_encoder.fit_transform(full[column].values)
    onehot_encoder = OneHotEncoder()
    matrix = onehot_encoder.fit_transform(codes.reshape(-1, 1))
    return label_encoder, codes, onehot_encoder, matrix


# Categorical features: one-hot encoded over train + test together so both
# splits share the same columns.
sex_label, sex_label_coder, sex_onehot, sex_label_onehot = _one_hot_column('Sex')
train_sex_feat = sex_label_onehot[:n_train]
test_sex_feat = sex_label_onehot[n_train:]

cabin_label, cabin_label_coder, cabin_onehot, cabin_label_onehot = _one_hot_column('Cabin')
train_cabin_feat = cabin_label_onehot[:n_train]
test_cabin_feat = cabin_label_onehot[n_train:]

embarked_label, embarked_label_coder, embarked_onehot, embarked_label_onehot = _one_hot_column('Embarked')
train_embarked_feat = embarked_label_onehot[:n_train]
test_embarked_feat = embarked_label_onehot[n_train:]

ticket_label, ticket_label_coder, ticket_onehot, ticket_label_onehot = _one_hot_column('Ticket')
train_ticket_feat = ticket_label_onehot[:n_train]
test_ticket_feat = ticket_label_onehot[n_train:]

# Numeric features: used as raw single-column arrays. Despite the
# "_label_onehot" suffix in the names, no encoding is applied to these.
pclass_label_onehot = full['Pclass'].values.reshape(-1, 1)
train_pclass_feat = pclass_label_onehot[:n_train]
test_pclass_feat = pclass_label_onehot[n_train:]

age_label_onehot = full['Age'].values.reshape(-1, 1)
train_age_feat = age_label_onehot[:n_train]
test_age_feat = age_label_onehot[n_train:]

slisp_label_onehot = full['SibSp'].values.reshape(-1, 1)
train_slisp_feat = slisp_label_onehot[:n_train]
test_slisp_feat = slisp_label_onehot[n_train:]

parch_label_onehot = full['Parch'].values.reshape(-1, 1)
train_parch_feat = parch_label_onehot[:n_train]
test_parch_feat = parch_label_onehot[n_train:]

fare_label_onehot = full['Fare'].values.reshape(-1, 1)
train_fare_feat = fare_label_onehot[:n_train]
test_fare_feat = fare_label_onehot[n_train:]

# Assemble the train and test feature matrices: the one-hot blocks are
# densified first, then stacked column-wise with the numeric columns.
train_blocks = [
    sparse_block.toarray()
    for sparse_block in (train_sex_feat, train_cabin_feat,
                         train_embarked_feat, train_ticket_feat)
]
train_blocks += [train_pclass_feat, train_age_feat, train_slisp_feat,
                 train_parch_feat, train_fare_feat]
train_feat = np.hstack(train_blocks)

test_blocks = [
    sparse_block.toarray()
    for sparse_block in (test_sex_feat, test_cabin_feat,
                         test_embarked_feat, test_ticket_feat)
]
test_blocks += [test_pclass_feat, test_age_feat, test_slisp_feat,
                test_parch_feat, test_fare_feat]
test_feat = np.hstack(test_blocks)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
train_feat_scaler = scaler.fit(train_feat).transform(train_feat)
test_feat_scaler = scaler.transform(test_feat)

# Drop low-variance columns.
# NOTE(review): this runs after standardisation, where every non-constant
# column already has unit variance, so in practice only columns that are
# constant on the training rows are removed.
sel = VarianceThreshold(threshold=.1)
train_feat_scaler_sel = sel.fit(train_feat_scaler).transform(train_feat_scaler)
test_feat_scaler_sel = sel.transform(test_feat_scaler)

# Keep as many principal components as needed to explain 95% of the variance.
pca = PCA(n_components=.95)
train_feat_scaler_sel_pca = pca.fit_transform(train_feat_scaler_sel)
test_feat_scaler_sel_pca = pca.transform(test_feat_scaler_sel)

# Target labels. Only the training rows carry a 'Survived' value, so the
# encoder is fitted on those rows alone and its integer output is actually
# used. Previously it was fitted on the full column (test rows included) and
# the result was discarded, leaving y_train as floats so the classifier
# predicted 0.0/1.0 instead of 0/1.
n_labelled = len(train_data)
survived_label = LabelEncoder()
y_train = survived_label.fit_transform(
    full['Survived'][:n_labelled].values.astype(int))
# Kept for backward compatibility; presumably all-NaN because the Kaggle test
# file ships without labels — do not use it for evaluation.
y_test = full['Survived'][n_labelled:].values

# 3. Fit an SVM classifier and write out its predictions.
clf = svm.SVC()
clf.fit(train_feat_scaler_sel_pca, y_train)

# Accuracy on the same rows the model was fitted on (not a held-out score).
train_accuracy = clf.score(train_feat_scaler_sel_pca, y_train)
print(train_accuracy)

# Attach the predictions to the original test frame as column 'last' and
# save the whole frame to CSV.
test_predictions = clf.predict(test_feat_scaler_sel_pca)
test_data['last'] = test_predictions
test_data.to_csv('/Users/huangzhen/Desktop/titanic/last.csv')

最后跑分：

大浪淘沙_

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
kaggle入门比赛-泰坦尼克号生存预测（python）

数据集获取路径：https://www.kaggle.com/c/titanic/data 。不同的人在选取特征时会有不同的取舍，就此数据集，我认为所有特征都会影响最终结果，故保留了所有特征，即便有的特征的空值极多。import pandas as pd; import numpy as np; import os; import matplotlib.pyplot as plt; import re; from sklearn.preprocessing import LabelEncoder, One…
复制链接

扫一扫