# 泰坦尼克号幸存预测

## 一、导入数据

import pandas as pd
import numpy as np
import random



# Show summary statistics (count/mean/std/quartiles) of the numeric columns

train.describe()


# Inspect column dtypes and missing-value counts

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
Column       Non-Null Count  Dtype

0   PassengerId  891 non-null    int64
1   Survived     891 non-null    int64
2   Pclass       891 non-null    int64
3   Name         891 non-null    object
4   Sex          891 non-null    object
5   Age          714 non-null    float64
6   SibSp        891 non-null    int64
7   Parch        891 non-null    int64
8   Ticket       891 non-null    object
9   Fare         891 non-null    float64
10  Cabin        204 non-null    object
11  Embarked     889 non-null    object
dtypes: float64(2), int64(5), object(5)
memory usage: 66.2+ KB


## 二、数据处理，包括补充空值等

1. train.isnull().any() #True为缺失数据
PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

1. 对缺失年龄进行填充，并且将进行分组以便达到数字离散化的结果
def simplify_ages(df):
    """Fill missing ages and discretize the Age column into labeled buckets.

    Fixes from the original: ``group_names`` was never defined (NameError);
    the seven labels below match the Baby..Senior mapping used later in the
    script. Source indentation was also lost and is restored here.
    """
    # Fill missing values with the column mean so every row can be bucketed.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    # Eight edges -> seven intervals: (0,5], (5,12], ..., (60,120].
    bins = (0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Baby', 'Child', 'Teenager', 'Student',
                   'Young Adult', 'Adult', 'Senior']
    # pd.cut turns the continuous ages into an ordered categorical column.
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

1. 填充并简化Cabin
因为Cabin的字母后面的数字意义不大，故取字母即可
def simplify_cabin(df):
    """Replace missing cabins with 'N' and keep only the deck letter.

    The digits after the deck letter carry little signal, so 'C85' -> 'C'.
    (Source indentation was lost in the export and is restored here.)
    """
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda cabin: cabin[0])
    return df

1. 填充Embarked
def full_Embarked(df):
    """Fill missing embarkation ports with the most common port.

    The original filled with ``random.sample(["C",'S','Q'],1)``, making runs
    non-reproducible; using the column mode (falling back to 'S', the most
    frequent port in the Titanic data) is deterministic.
    """
    mode = df['Embarked'].mode()
    fill_value = mode.iloc[0] if not mode.empty else 'S'
    df['Embarked'] = df['Embarked'].fillna(fill_value)
    return df

1. 对票价进行离散化
def spread_fare(df):
    """Fill missing fares and bucket Fare into four labeled ranges.

    Fixes from the original: the assignment had lost its '=' sign
    (``df.Farepd.cut(...)``), and the labels were misspelled ('firse',
    'thrid'), so the later {'first','third'} numeric mapping produced NaN.
    Also imputes the Kaggle test set's single missing fare with the median.
    """
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    # right=False -> intervals are [0,8), [8,15), [15,31), [31,1000).
    bins = (0, 8, 15, 31, 1000)
    group_names = ['first', 'second', 'third', 'fourth']
    df.Fare = pd.cut(df.Fare, bins, right=False, labels=group_names)
    return df

1. 删除无关因素
def simplify_drop(df):
    """Drop columns with no predictive value (ID, name, ticket number).

    Fix: the original listed ``'PassengerId '`` with a trailing space,
    which raises KeyError because no such column exists.
    """
    return df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

1. 整合一遍，凑成新表
def transform_features(df):
    """Run the full cleaning pipeline on a raw Titanic frame.

    Fix: the original skipped ``spread_fare`` even though the later numeric
    encoding expects Fare to already hold the 'first'..'fourth' labels.
    """
    df = simplify_ages(df)
    df = simplify_cabin(df)
    df = full_Embarked(df)
    df = spread_fare(df)
    df = simplify_drop(df)
    return df


# Re-read the raw tables and apply the same cleaning to both frames so the
# train and test sets end up with identical feature names.
# Fix: the original never read the test CSV, so `test` was undefined here.

train = pd.read_csv('titanic_csv/train.csv')
test = pd.read_csv('titanic_csv/test.csv')
train = transform_features(train)
test = transform_features(test)


1. 非数值型数据要转化成数值型
用数组特征化编码年龄、性别、舱位，目的地和票价区间等等，因为随机森林的输入需要数值，字符不行
训练集数据转化
> train['Sex']= train['Sex'].map({'female':0,'male': 1}).astype(int)
> train['Age']= train['Age'].map({'Baby':0, 'Child':1,'Teenager':2,'Student':3,'Young Adult':4,'Adult':5,'Senior':6}).astype(int)
> train['Embarked']= train['Embarked'].map({'S':0, 'C': 1,'Q':2}).astype(int)
> train['Cabin']= train['Cabin'].map({'A':0, 'B':
> 1,'C':2,'D':3,'E':4,'F':5,'G':6,'T':7,'N':8}).astype(int)
> train['Fare']= train['Fare'].map({'first':0, 'second':
> 1,'third':2,'fourth':3}).astype(int)


train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Column    Non-Null Count  Dtype

0   Survived  891 non-null    int64
1   Pclass    891 non-null    int64
2   Sex       891 non-null    int32
3   Age       891 non-null    int32
4   SibSp     891 non-null    int64
5   Parch     891 non-null    int64
6   Fare      891 non-null    int32
7   Cabin     891 non-null    int32
8   Embarked  891 non-null    int32
dtypes: int32(5), int64(4)
memory usage: 45.3 KB


# Build the numeric feature matrix for the test set, mirroring the
# train-set encodings above.
# Fixes: the Cabin map was missing 'T' (encoded as 7 for train, so an
# unmapped 'T' would become NaN and crash astype(int)); the Fare key
# 'third' matches the corrected bucket label; and Age — which the original
# never encoded for the test frame — is mapped like the training frame.
cols = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
data_te = test[cols].copy()
data_te['Sex'] = data_te['Sex'].map({'female': 0, 'male': 1}).astype(int)
data_te['Age'] = data_te['Age'].map({'Baby': 0, 'Child': 1, 'Teenager': 2, 'Student': 3,
                                     'Young Adult': 4, 'Adult': 5, 'Senior': 6}).astype(int)
data_te['Embarked'] = data_te['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data_te['Cabin'] = data_te['Cabin'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4,
                                         'F': 5, 'G': 6, 'T': 7, 'N': 8}).astype(int)
data_te['Fare'] = data_te['Fare'].map({'first': 0, 'second': 1, 'third': 2, 'fourth': 3}).astype(int)


## 三、构建随机森林

1. 用百分之20作为测试集
from sklearn.model_selection import train_test_split

# Separate predictors from the target and hold out 20% for evaluation.
X = train.drop(['Survived'], axis=1)  # feature matrix
y = train['Survived']                 # target label
test_fraction = 0.2                   # 20% of rows reserved for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_fraction, random_state=23)

1. 定义决策树的参数
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()

# Candidate hyper-parameters; constraining tree size guards against
# overfitting.
# Fix: in the original, the comment line had swallowed the
# "parameters = {" opening, leaving the dict entries orphaned (SyntaxError).
parameters = {
    'n_estimators': [10, 100, 200],
    # 'n_estimators': [4, 6, 9],
    'max_features': ['log2', 'sqrt', 'auto'],  # NOTE(review): 'auto' was removed in sklearn >= 1.3 — confirm version
    'criterion': ['entropy', 'gini'],          # split quality: entropy or Gini impurity
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# Wrap accuracy_score so GridSearchCV can use it as a scoring function.
acc_scorer = make_scorer(accuracy_score)

1. 使用GridSearchC调参，找出最优参数并用于训练

GridSearchCV，网格搜索+交叉验证，它存在的意义就是自动调参，只要把参数输进去，就能给出最优化的结果和参数
#GridSearchCV用于系统地遍历多种参数组合，通过交叉验证确定最佳效果参数。

# Exhaustively search the parameter grid with cross-validation and keep
# the best estimator.
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)
# Replace clf with the best parameter combination found by the search.
clf = grid_obj.best_estimator_
print(grid_obj.best_params_)
# Refit the chosen model on the training split.
clf.fit(X_train, y_train)
# Fix: estimators have no .accuracy_score method — that line raised
# AttributeError. clf.score returns mean accuracy for classifiers.
print("随机森林训练数据正确率：%.2f" % clf.score(X_train, y_train))


1. 使用那20%的数据测试预测准确度
# Score the tuned model on the held-out 20% split (~0.82 on this split).
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

1. 得出测试集存活情况，并且把预测结果保存到csv文件。
# Predict survival for the Kaggle test set and write the submission file.
# Fix: index=False — a submission needs exactly the PassengerId/Survived
# columns, and the original wrote the DataFrame index as an extra column.
predictions = clf.predict(data_te.drop('PassengerId', axis=1))
output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predictions})
output.to_csv('test.csv', index=False)