# 一、查看数据 (1. Inspect the data)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the training CSV into a DataFrame and preview its first five rows
# (notebook-style display; the preview itself is not stored).
data = pd.read_csv(
    filepath_or_buffer=r"C:\英雄时刻\Python\Titanic_Rescue_Prediction-master\train.csv"
)
data.head(n=5)
# 二、处理缺失值 (2. Handle missing values)
# Count the missing values in every column and summarise the numeric columns
# (notebook-style inspection; the results are only displayed).
data.isnull().sum()
data.describe()

# Fill missing 'Embarked' values with the most frequent value.
# mode() returns a Series, so its first element is taken: passing the Series
# itself to fillna() would align on index labels rather than broadcast.
# fillna() returns a new Series, so the result has to be assigned back.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().iloc[0])

# Fill missing 'Cabin' values with the placeholder 'U0'.
data['Cabin'] = data['Cabin'].fillna('U0')

# Impute missing 'Age' values with a random-forest regressor trained on the
# rows whose age is known.
from sklearn.ensemble import RandomForestRegressor

age_df = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_notnull = age_df.loc[data['Age'].notnull()]  # rows with a known age
age_df_isnull = age_df.loc[data['Age'].isnull()]    # rows whose age is missing

X = age_df_notnull.values[:, 1:]  # features: every selected column except 'Age'
Y = age_df_notnull.values[:, 0]   # target: the 'Age' column

# Fit the regressor on the rows with a known age.
RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
RFR.fit(X, Y)

# Predict only when something is actually missing; predict() rejects an
# empty feature matrix.
if not age_df_isnull.empty:
    predictAges = RFR.predict(age_df_isnull.values[:, 1:])
    data.loc[data['Age'].isnull(), 'Age'] = predictAges
# 三、数据转换 (3. Feature transformation)
# Build the 'Initial' (title) feature.
# NOTE(review): the original `for i in data:` loop had no body, so the
# extraction step was missing. Taking the letters immediately before a '.'
# in 'Name' is a reconstruction that matches the title values replaced
# below -- confirm against the intended source.
data['Initial'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
data['Initial'].unique()  # display the distinct titles found

# Collapse rare or variant titles into the five groups encoded further down.
# The result is assigned back instead of using inplace=True on a column
# selection, which is a chained assignment and may not update `data`.
title_aliases = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
}
data['Initial'] = data['Initial'].replace(title_aliases)

# Split the continuous 'Fare' into quartiles and display the mean of
# 'Survived' within each fare interval.
data['Fare_Range'] = pd.qcut(data['Fare'], 4)
data.Fare_Range.head()
data.groupby(['Fare_Range'])["Survived"].mean()

# Encode 'Fare' as an ordinal bin using the edges 7.91, 14.454 and 31.0.
# Bin 0 (Fare <= 7.91) is the default value. The top bin is left open-ended:
# the original upper bound of 512.329 would leave any fare above it in bin 0.
data['fare_cut'] = 0
data.loc[(data['Fare'] > 7.91) & (data['Fare'] <= 14.454), 'fare_cut'] = 1
data.loc[(data['Fare'] > 14.454) & (data['Fare'] <= 31.0), 'fare_cut'] = 2
data.loc[data['Fare'] > 31.0, 'fare_cut'] = 3

# Replace the categorical strings with integer codes (assigned back for the
# same chained-assignment reason as above).
data['Sex'] = data['Sex'].replace({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
data['Initial'] = data['Initial'].replace(
    {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Other': 4}
)

# Drop the raw columns that are no longer needed.
# NOTE(review): 'Age' is dropped here without a derived feature replacing
# it -- confirm this is intended.
data.drop(['Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Fare_Range', 'PassengerId'],
          axis=1, inplace=True)
# 'Fare_range' and 'alone' are never created in the visible code, so they are
# dropped only if present instead of raising KeyError.
data.drop(['Fare_range', 'alone'], axis=1, inplace=True, errors='ignore')
# 四、建模 (4. Modelling)
# Train a random-forest classifier and report its accuracy on a hold-out split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# The first column is the target; every remaining column is a feature.
# The original slice `1:-1` also excluded the last column, which discards
# the most recently added feature column.
x = data.iloc[:, 1:]
y = data.iloc[:, 0]

# Hold out 30% of the rows for evaluation. The `x_text` / `y_text` names
# (presumably meant as "test") are kept unchanged for any later cells.
x_train, x_text, y_train, y_text = train_test_split(x, y, test_size=0.3)

model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
model.score(x_text, y_text)