# 数据集获取路径:https://www.kaggle.com/c/titanic/data
# 不同的人在选取特征时会有不同的取舍。就此数据集,我认为所有特征都会影响最终结果,故保留了所有特征,即便有的特征空值极多。
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore", category=Warning)
from sklearn.feature_selection import VarianceThreshold
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn import naive_bayes
# 1. Data preparation and data cleaning
# Read both CSV files; the paths are absolute and machine-specific.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/huangzhen/Desktop/titanic/train.csv',
        '/Users/huangzhen/Desktop/titanic/test.csv',
    )
)
# Stack train on top of test so both go through the same cleaning steps;
# the training rows therefore occupy the first len(train_data) positions.
full = pd.concat([train_data, test_data], ignore_index=True)
# Fill the missing values in Age, Fare, Embarked and Cabin.
# To inspect the remaining gaps while debugging:
# print(full.isnull().sum())

# Extract the title from each name: the text between the comma and the first
# period (names are shaped like "Surname, Title. Given names"). strip() drops
# the space that follows the comma so the title is a clean grouping key.
full['split_name'] = full['Name'].apply(
    lambda x: x.split(',')[1].split('.')[0].strip()
)

# Fill each missing age with the mean age of the passengers sharing the same
# title. The previous loop called fillna() on the whole column, so its first
# iteration filled every gap with the mean of a single title and the remaining
# iterations had nothing left to fill.
title_mean_age = full.groupby('split_name')['Age'].transform('mean')
full['Age'] = full['Age'].fillna(title_mean_age)
# A title whose ages are all missing has no mean; fall back to the overall mean.
full['Age'] = full['Age'].fillna(full['Age'].mean())

# Missing fares get the mean fare of the passengers who embarked at 'S'.
# Results are assigned back rather than using chained `inplace=True`, which
# acts on a temporary object under pandas Copy-on-Write.
full['Fare'] = full['Fare'].fillna(
    full.loc[full['Embarked'] == 'S', 'Fare'].mean()
)

# Missing embarkation ports default to 'S'
# (presumably the most frequent value - check full['Embarked'].value_counts()).
full['Embarked'] = full['Embarked'].fillna('S')

# Fill each missing cabin with a randomly drawn deck letter that depends on the
# passenger's class. Author's note on the decks seen among known cabins:
# 3rd class -> G, F, E; 2nd class -> F, D, E; 1st class -> C, B, D, E
# (A and T also occur).
# The previous loop filled the whole column on its first iteration, so every
# gap received the same single letter regardless of class.
# NOTE: the draw is unseeded, so the filled values differ from run to run.
cabin_pool_by_class = {3: ['g', 'f'], 2: ['d', 'e']}
other_class_pool = ['b', 'c']  # any Pclass other than 3 or 2
cabin_missing = full['Cabin'].isnull()
random_cabins = pd.Series(
    [
        str(np.random.choice(cabin_pool_by_class.get(pclass, other_class_pool)))
        for pclass in full.loc[cabin_missing, 'Pclass']
    ],
    index=full.index[cabin_missing],
    dtype=object,
)
full['Cabin'] = full['Cabin'].fillna(random_cabins)

# Keep only the deck letter (first character), upper-cased.
full['Cabin'] = full['Cabin'].str[0].str.upper()
# 2. Feature engineering: encode every feature (scaling and dimensionality
#    reduction follow further down).
def _one_hot(values):
    """Return a dense one-hot matrix with one column per distinct value.

    The values are first mapped to integer codes with LabelEncoder and the
    codes are then expanded by OneHotEncoder, mirroring the two-step encoding
    this script has always used.
    """
    codes = LabelEncoder().fit_transform(values)
    return OneHotEncoder().fit_transform(codes.reshape(-1, 1)).toarray()


# `full` was built as train rows followed by test rows, so the split point is
# the size of the training set (previously hard-coded as 891).
n_train = len(train_data)

# Columns that are one-hot encoded, and columns used as single raw columns.
# The order matches the column order of the original feature matrix.
categorical_columns = ('Sex', 'Cabin', 'Embarked', 'Ticket')
numeric_columns = ('Pclass', 'Age', 'SibSp', 'Parch', 'Fare')

feature_blocks = [_one_hot(full[column].values) for column in categorical_columns]
feature_blocks += [
    full[column].values.reshape(-1, 1) for column in numeric_columns
]

# Assemble the full matrix once, then split it back into train and test rows.
all_feat = np.hstack(feature_blocks)
train_feat = all_feat[:n_train]
test_feat = all_feat[n_train:]
# Scale, filter and reduce the features, then prepare the label vectors.
# Every transformer is fit on the training rows only and re-applied to test.

# `full` holds the training rows first; split at the training-set size
# instead of the hard-coded 891.
n_train = len(train_data)

# Standardise each column using statistics learned from the training rows.
scaler = StandardScaler()
train_feat_scaler = scaler.fit_transform(train_feat)
test_feat_scaler = scaler.transform(test_feat)

# Drop columns whose training variance is below 0.1.
# NOTE(review): this runs after standardisation, which gives every
# non-constant training column unit variance, so in practice only constant
# columns should be removed here - confirm that this is the intent.
sel = VarianceThreshold(threshold=.1)
train_feat_scaler_sel = sel.fit_transform(train_feat_scaler)
test_feat_scaler_sel = sel.transform(test_feat_scaler)

# Keep as many principal components as needed to explain 95% of the variance.
pca = PCA(n_components=.95)
train_feat_scaler_sel_pca = pca.fit_transform(train_feat_scaler_sel)
test_feat_scaler_sel_pca = pca.transform(test_feat_scaler_sel)

# Encode the training labels as integer class ids. The encoder used to be fit
# on the whole Survived column (training and test rows together) and its
# output was discarded, which left y_train as raw floats.
survived_label = LabelEncoder()
y_train = survived_label.fit_transform(full['Survived'][:n_train].values)
# Labels of the test rows, taken as-is; presumably all NaN because the Kaggle
# test file carries no Survived column - verify before using them.
y_test = full['Survived'][n_train:].values
# 3. Train an SVM model and write out the predictions
clf = svm.SVC()
clf.fit(train_feat_scaler_sel_pca, y_train)
# Accuracy on the training data itself, so it is an optimistic estimate.
print(clf.score(train_feat_scaler_sel_pca, y_train))
# Survived becomes a float column once train and test are concatenated, so the
# predicted labels can come back as 0.0 / 1.0; store them as integers.
test_data['last'] = clf.predict(test_feat_scaler_sel_pca).astype(int)
test_data.to_csv('/Users/huangzhen/Desktop/titanic/last.csv')
# 最后跑分: