kaggle入门比赛-泰坦尼克号生存预测(python)

数据集获取路径:https://www.kaggle.com/c/titanic/data

不同的人在选取特征时会有不同的取舍。就此数据集而言,我认为所有特征都会影响最终结果,故保留了所有特征,即便有的特征空值极多。

 

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore", category=Warning)
from sklearn.feature_selection import VarianceThreshold
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import naive_bayes


# 1. Data preparation and cleaning
# Read the test split and the training split of the Titanic data set
# (same read order as before: test first, then train).
test_data, train_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/huangzhen/Desktop/titanic/test.csv',
        '/Users/huangzhen/Desktop/titanic/train.csv',
    )
)
# Stack training rows on top of test rows so both splits go through the
# same cleaning and encoding steps; the index is renumbered from 0.
combined_splits = [train_data, test_data]
full = pd.concat(combined_splits, ignore_index=True)
# Fill the missing values in Age, Fare, Embarked and Cabin, then reduce Cabin
# to a single upper-case deck letter.
#
# Columns are reassigned (`full[col] = full[col].fillna(...)`) instead of
# calling `full[col].fillna(..., inplace=True)`: the in-place call mutates a
# temporary object under pandas Copy-on-Write and leaves `full` unchanged.

# Extract each passenger's title (e.g. "Mr", "Miss") from the
# "Surname, Title. Given names" layout of Name; strip() removes the space
# that follows the comma.
full['split_name'] = full['Name'].apply(
    lambda x: x.split(',')[1].split('.')[0].strip()
)

# Missing Age: use the mean age of the passengers who share the same title.
# The previous loop called fillna() on the whole column for every title, so the
# first title's mean was written into every gap and the remaining iterations
# had nothing left to fill.
overall_mean_age = full['Age'].mean()
title_mean_age = full.groupby('split_name')['Age'].transform('mean')
# Fall back to the overall mean for a title whose ages are all missing.
full['Age'] = full['Age'].fillna(title_mean_age).fillna(overall_mean_age)

# Missing Fare: use the mean fare of the passengers who embarked at 'S'.
# This runs before Embarked is filled, so rows with an unknown port do not
# contribute to the mean.
full['Fare'] = full['Fare'].fillna(
    full.loc[full['Embarked'] == 'S', 'Fare'].mean()
)

# Missing Embarked: fill with 'S'.
full['Embarked'] = full['Embarked'].fillna('S')

# Missing Cabin: draw a deck letter at random for each passenger according to
# that passenger's own Pclass. The previous loop filled the whole column on
# its first iteration, so every missing cabin received one and the same letter
# chosen from the first row's class.
# Original author's note on decks seen per class: 3 -> g, f, e; 2 -> f, d, e;
# 1 -> c, b, d, e (a and t also occur).
# NOTE(review): the draw is unseeded, so the filled letters differ between runs.
deck_choices_by_pclass = {3: ['g', 'f'], 2: ['d', 'e']}
default_deck_choices = ['b', 'c']  # used for every other class value
cabin_missing = full['Cabin'].isnull()
if cabin_missing.any():
    full.loc[cabin_missing, 'Cabin'] = [
        str(np.random.choice(
            deck_choices_by_pclass.get(pclass, default_deck_choices)
        ))
        for pclass in full.loc[cabin_missing, 'Pclass']
    ]

# Keep only the first character of each cabin value (the deck letter),
# upper-cased so that filled and original values share one alphabet.
full['Cabin'] = full['Cabin'].str[0].str.upper()

# 2. Feature engineering: encode, standardise, filter and reduce all features

# `full` holds the training rows first and the test rows after them, so the
# number of training rows is the split point (replaces the hard-coded 891).
n_train = len(train_data)


def _one_hot_dense(values):
    """Return a dense one-hot matrix for a 1-D array of category values.

    The values are first mapped to integer codes with LabelEncoder and the
    codes are then expanded with OneHotEncoder, one column per distinct value.
    """
    codes = LabelEncoder().fit_transform(values)
    return OneHotEncoder().fit_transform(codes.reshape(-1, 1)).toarray()


# Columns expanded to one-hot indicators, in the order they are stacked.
categorical_columns = ['Sex', 'Cabin', 'Embarked', 'Ticket']
# Columns used as-is, each as a single numeric column.
numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

feature_blocks = [_one_hot_dense(full[name].values) for name in categorical_columns]
feature_blocks += [full[name].values.reshape(-1, 1) for name in numeric_columns]

# Assemble the full feature matrix, then split it back into train and test rows.
all_feat = np.hstack(feature_blocks)
train_feat = all_feat[:n_train]
test_feat = all_feat[n_train:]

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
train_feat_scaler = scaler.fit_transform(train_feat)
test_feat_scaler = scaler.transform(test_feat)

# Drop columns whose training variance is at most 0.1.
# NOTE(review): the input is already standardised, so this presumably only
# removes columns that are constant over the training rows - confirm.
sel = VarianceThreshold(threshold=.1)
train_feat_scaler_sel = sel.fit_transform(train_feat_scaler)
test_feat_scaler_sel = sel.transform(test_feat_scaler)

# Keep enough principal components to explain 95% of the variance.
pca = PCA(n_components=.95)
train_feat_scaler_sel_pca = pca.fit_transform(train_feat_scaler_sel)
test_feat_scaler_sel_pca = pca.transform(test_feat_scaler_sel)

# Targets. The rows after n_train come from test_data; if that file has no
# Survived column (as in the Kaggle download), y_test is all NaN.
y_train = full['Survived'][:n_train].values
y_test = full['Survived'][n_train:].values

# 3. Train an SVM classifier and export its predictions
# fit() returns the estimator itself, so construction and training are chained.
clf = svm.SVC().fit(train_feat_scaler_sel_pca, y_train)

# Accuracy on the training rows themselves (not a held-out score).
train_accuracy = clf.score(train_feat_scaler_sel_pca, y_train)
print(train_accuracy)

# Attach the predicted labels to the test rows as column 'last' and save them.
test_predictions = clf.predict(test_feat_scaler_sel_pca)
test_data['last'] = test_predictions
test_data.to_csv('/Users/huangzhen/Desktop/titanic/last.csv')

最后跑分:

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值