泰坦尼克号乘客获救预测
第一步 导入第三方库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
import seaborn as sns
from sklearn import datasets
from sklearn import impute
from sklearn.preprocessing import StandardScaler ,PolynomialFeatures #导入预处理.标准化缩放,多项式&交互特征
from sklearn.decomposition import PCA # 导入分解.PCA
from sklearn.pipeline import make_pipeline,Pipeline # 导入流水线.生成流水线
from sklearn.model_selection import KFold , cross_val_score,GridSearchCV # 导入模型选择.K折交叉验证、交叉验证得分、网格搜索
from sklearn.linear_model import LogisticRegression ,LinearRegression # 逻辑回归,线性回归
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
from sklearn.neighbors import KNeighborsClassifier # KNN分类器
from sklearn.svm import SVC #支持向量机
from sklearn.naive_bayes import GaussianNB #高斯贝叶斯
from sklearn.metrics import roc_curve , roc_auc_score ,accuracy_score ,classification_report,make_scorer,confusion_matrix
#导入roc曲线,auc得分,准确率函数,分类器性能报告 ,自定义得分函数,混淆矩阵
第二步加载数据
请根据数据文件在你电脑上的实际位置修改下面的路径
# Load the three CSV inputs; the relative paths expect the files one directory up.
titanic = pd.read_csv(r'../train.csv')
test = pd.read_csv(r'../test.csv')
# The first CSV column is used as the index of y_true.
y_true = pd.read_csv(r'../gendermodel.csv', index_col=0)
定义缺失值处理函数,方便同时操作训练集和测试集
# Console banner printed before the missing-value helper functions are defined.
print('----------------------------------------缺失值处理函数------------------------------------------------------')
# Simple imputation helper: takes a table, a column name and a strategy,
# fills the column's missing values and returns the values that were filled in.
def simple_impute(df, col, stat):
    """Fill the missing values of ``df[col]`` in place with ``SimpleImputer``.

    Args:
        df: DataFrame to modify. Only rows where ``col`` is null are written.
        col: Name of the column to impute.
        stat: Forwarded to ``SimpleImputer`` as its ``strategy`` argument.

    Returns:
        The entries of ``df[col]`` that were null before the call, now holding
        their imputed values.
    """
    from sklearn import impute

    null_row = df[col].isnull()  # remember which rows were missing
    imputer = impute.SimpleImputer(missing_values=np.nan, strategy=stat)
    filled = np.asarray(
        imputer.fit_transform(df[col].to_numpy().reshape(-1, 1))
    )
    # Write the imputed values back explicitly rather than relying on
    # copy=False mutating the frame through a NumPy view: that only works
    # when no copy is made along the way and the underlying buffer is
    # writable.
    # The shape check covers the case where the imputer returns no column at
    # all (nothing observed to compute the statistic from); the frame is then
    # left untouched.
    if filled.ndim == 2 and filled.shape[1] == 1:
        df.loc[null_row, col] = filled[null_row.to_numpy(), 0]
    return df.loc[null_row, col]  # show the effect of the imputation
# KNN imputation helper: fills one column using a group of feature columns.
def knn_imputer(df, cols, col):
    """Impute ``df[col]`` in place with ``KNNImputer`` over the ``cols`` features.

    Args:
        df: DataFrame to modify. Gains an indicator column named
            ``'is_imputer' + col`` (1 where ``col`` was null, else 0), and
            ``col`` is overwritten with the imputed values.
        cols: Columns fed to the scaler and the imputer; must include ``col``.
        col: Name of the column to impute.

    Returns:
        The rows of ``df`` whose ``col`` value was null before the call.
    """
    from sklearn import impute

    flag_col = 'is_imputer' + col
    df[flag_col] = np.where(df[col].isnull(), 1, 0)  # mark rows to be imputed

    # Standardize before imputing and undo the scaling afterwards, so the
    # imputer works on standardized features but the stored values keep
    # their original units.
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[cols])
    knn = impute.KNNImputer(n_neighbors=10, copy=True)
    restored = scaler.inverse_transform(knn.fit_transform(scaled))

    # Reuse df's own index: column assignment aligns on index labels, so a
    # frame built with a default RangeIndex would misplace (or drop) values
    # whenever df is not indexed 0..n-1.
    df[col] = pd.DataFrame(restored, columns=cols, index=df.index)[col]
    return df[df[flag_col] == 1]  # show the effect of the imputation
定义关于特征提取、变换的函数(其中有一些没用上)
# Console banner printed before the feature-engineering helper functions are defined.
print('----------------------------------------特征工程函数------------------------------------------------------')
#提取姓名中的头衔和婚姻状况
def extract_name_lable(df):
title_map = {
'Mr':0,'Miss':0,'Mrs':0,}
unmarried_map = {
'Miss':1,'Ms':1,'Mlle':1,}
title = df['Name'].str.split(',',expand = True)[1].str.split('.',expand = True)[0].str