这是Kaggle练习赛中关于Titanic号乘客生还预测的一道题,具体信息见下方参考。
我的方法是采用SVM算法,不过需要先对获取的数据进行处理。Kaggle官方给了train.csv、test.csv以及一个参考提交文件。我参考了Kaggle上一位大佬的做法,最后做出来的预测准确率约为81%,虽然比较一般,但作为第一次尝试我感觉还行。
下面附代码,具体文件可以去官方Kaggle或者下面下载。(代码中关键处都有相关的注释,predict.csv是我自己做出来的预测)
# --- Environment setup: imports and display/plotting configuration ---
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import time

# Pandas display options.
# FIX: use fully-qualified option names ('display.precision',
# 'display.max_colwidth') -- the short aliases ('precision', 'max_colwidth')
# were deprecated and raise OptionError in pandas >= 2.0.
pd.set_option('display.precision', 3)
pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.max_rows', None)      # show all rows
pd.set_option('display.max_colwidth', 100)   # max cell display width (default 50)

# Plot styling.
sns.set_style('dark')
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['axes.titlesize'] = 15
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['legend.fontsize'] = 12
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels in plots
# --- Load data ---
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
test_id = test_df['PassengerId'].values  # kept aside for the submission file

# Combine train and test so feature engineering is applied to both uniformly
# (test rows get Survived = NaN).
all_data = pd.concat([train_df, test_df])

# Impute missing Age/Fare with the *training-set* medians to avoid
# leaking test-set statistics into the features.
# (The original script also re-filled Fare with the mean afterwards; that
# statement was dead code -- Fare had no NaN left -- and was removed.)
all_data['Age'] = all_data['Age'].fillna(train_df['Age'].median())
all_data['Fare'] = all_data['Fare'].fillna(train_df['Fare'].median())

# Surname, used below to identify members of the same family.
all_data['Last_Name'] = all_data['Name'].apply(lambda name: name.split(',')[0])

# Default "family survival" signal: 0.5 means unknown / no family information.
default_sr_value = 0.5
all_data['Family_Survival'] = default_sr_value
# --- Family survival from surname + fare groups ---
# Passengers sharing the same surname AND the same fare are assumed to be one
# family.  For each member, look at the OTHER members' known outcomes:
# any survivor among them => 1, otherwise any known death => 0.
# (Only the columns the loop actually reads are selected; the original
# selected several unused ones.)
for _, family in all_data[['Survived', 'Last_Name', 'Fare', 'PassengerId']].groupby(['Last_Name', 'Fare']):
    if len(family) == 1:
        continue  # singleton: no family information to propagate
    for idx, member in family.iterrows():
        others = family.drop(idx)['Survived']  # outcomes of the rest of the family
        pass_id = member['PassengerId']
        if others.max() == 1.0:
            all_data.loc[all_data['PassengerId'] == pass_id, 'Family_Survival'] = 1
        elif others.min() == 0.0:
            all_data.loc[all_data['PassengerId'] == pass_id, 'Family_Survival'] = 0
# --- Family survival from shared tickets ---
# People travelling on the same ticket are treated as one group even when
# their surnames differ.  Only passengers whose Family_Survival is still
# 0 or 0.5 are (re)considered here.
for _, ticket_group in all_data.groupby('Ticket'):
    if len(ticket_group) == 1:
        continue  # single traveller on this ticket
    for idx, passenger in ticket_group.iterrows():
        if not (passenger['Family_Survival'] == 0 or passenger['Family_Survival'] == 0.5):
            continue
        companions = ticket_group.drop(idx)['Survived']
        pid = passenger['PassengerId']
        if companions.max() == 1.0:
            all_data.loc[all_data['PassengerId'] == pid, 'Family_Survival'] = 1
        elif companions.min() == 0.0:
            all_data.loc[all_data['PassengerId'] == pid, 'Family_Survival'] = 0
# --- Feature encoding ---
# Coarse numeric bins.
all_data['Age_Bin'] = (all_data['Age'] // 15) * 15             # 15-year age bands
all_data['Fare_Bin'] = pd.qcut(all_data['Fare'], 5)            # fare quintiles
all_data['Relatives'] = all_data['SibSp'] + all_data['Parch']  # relatives aboard

# Title parsed from the "Last, Title. First" format of the Name column,
# then collapsed: French/variant titles mapped to their English equivalent,
# everything uncommon bucketed into 'Rare Title'.
all_data['Title'] = all_data['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
all_data['Title'].replace({'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss'}, inplace=True)
all_data['Title'].replace(['Don', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt', 'the Countess', 'Jonkheer', 'Dona'],
                          'Rare Title', inplace=True)

# Integer-encode the categorical columns.
for target_col, source_col in [('Fare_Bin', 'Fare_Bin'),
                               ('Age_Bin', 'Age_Bin'),
                               ('Title_Bin', 'Title'),
                               ('Sex', 'Sex')]:
    all_data[target_col] = LabelEncoder().fit_transform(all_data[source_col])

# Drop every column the model will not use.
all_data.drop(['PassengerId', 'Age', 'Fare', 'Name', 'SibSp', 'Parch', 'Ticket',
               'Cabin', 'Title', 'Last_Name', 'Embarked'], axis=1, inplace=True)
# --- Split the engineered features back into train / test sets ---
# FIX: DataFrame.drop no longer accepts axis as the second positional
# argument (removed in pandas 2.0) -- use the `columns` keyword instead.
# The split point is taken from the raw training frame instead of the
# hard-coded 891 so the script survives a different-sized training file.
n_train = len(train_df)  # 891 rows for the standard Kaggle training file
train_df = all_data[:n_train]
X_train = train_df.drop(columns='Survived')
y_train = train_df['Survived']

test_df = all_data[n_train:]
X_test = test_df.drop(columns='Survived')  # test rows carry no label

# --- Standardization: fit on the training data only, then transform test ---
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.transform(X_test)
# Cross-validated best scores collected per tuned model.  GaussianNB has no
# hyper-parameters to grid-search, so its slot is pre-filled with NaN.
cv_means_tuned = [np.nan]

def clf_performance(classifier, model_name):
    """Print a fitted grid search's best result and record its score.

    Parameters
    ----------
    classifier : a fitted GridSearchCV (anything exposing ``best_score_``
        and ``best_params_``).
    model_name : str
        Label printed as the report header.

    Side effect: appends ``classifier.best_score_`` to the module-level
    ``cv_means_tuned`` list.
    """
    # (Removed a commented-out reference to ``grid_scores_`` -- that
    # attribute was dropped from scikit-learn long ago.)
    print(model_name)
    print('-------------------------------')
    print(' Best Score: ' + str(classifier.best_score_))
    print(' Best Parameters: ' + str(classifier.best_params_))
    cv_means_tuned.append(classifier.best_score_)
# --- SVM model selection via grid search ---
# Earlier manual experiments found kernel='rbf', C=1, gamma=0.1 best
# (~0.85 CV accuracy); the grid below searches around those values.
svc = SVC(probability=True)
param_grid = [{'kernel': ['rbf'],
               'gamma': [0.01, 0.1, 0.5, 1, 2, 5],
               'C': [.1, 1, 2, 5]},
              {'kernel': ['linear'],
               'C': [.1, 1, 2, 10]},
              {'kernel': ['poly'],
               'degree': [2, 3, 4, 5],
               'C': [.1, 1, 10]}]

# 5-fold grid search, parallelized across all cores.
clf_svc = GridSearchCV(svc, param_grid=param_grid, cv=5, verbose=False, n_jobs=-1)
best_clf_svc = clf_svc.fit(X_train_scaled, y_train)

# --- Build the Kaggle submission file ---
# FIX: the label became float after concatenating train+test (NaN test
# labels), so predictions are 0.0/1.0 -- cast to int for the submission.
# FIX: write with index=False; the default writes an extra index column,
# which makes the file an invalid Kaggle submission.
predictions = best_clf_svc.predict(X_test_scaled).astype(int)
submission = pd.DataFrame({'PassengerId': test_id, 'Survived': predictions})
submission.to_csv('predict.csv', index=False)