泰坦尼克号幸存者预测
import numpy as np
import pandas as pd
数据读取
# The last column of each exported CSV is a newline artifact, so drop it on read.
train=pd.read_csv(r'./mytrain.csv').iloc[:,:-1]
test=pd.read_csv(r'./mytest.csv').iloc[:,:-1]
# Inspect the first rows of the training data.
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Inspect the first rows of the test data (it has no Survived column).
test.head()
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
查看数据特征
# Summary statistics of the numeric training columns.
train.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
可以看出训练集中Age、Cabin、Embarked列有缺失。这里直接删除训练集中Age为NaN的行;test数据中Age列也有缺失,可以用线性回归填充(此时需要将Age作为标签而不作为特征)。Cabin这列缺失太多,直接删除这一列;Embarked缺失比较少,用众数填充;PassengerId、Name、Ticket也不用考虑。
# Count missing values per column (Age: 177, Cabin: 687, Embarked: 2).
train.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
选取数据特征
选取特征即删除PassengerId、Name、Ticket、Cabin
# PassengerId/Name/Ticket are identifiers and Cabin is mostly missing,
# so none of them are used as model features.
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
for frame in (train, test):
    frame.drop(unused_cols, inplace=True, axis=1)
查看Embarked列的频数找出众数填充
# Frequency of each embarkation port; the mode ('S') is used to fill NaNs.
train['Embarked'].value_counts()
S 644
C 168
Q 77
Name: Embarked, dtype: int64
# Fill the two missing Embarked values with the column mode instead of the
# hard-coded 'S' (same result on this data, but robust if the data changes).
train['Embarked'].fillna(value=train['Embarked'].mode()[0],inplace=True)
只有年龄还有Nan值直接删除
# Only Age still has NaNs in train; drop those rows.
train.dropna(inplace=True)
# The test set has missing values in Age (332/418 present) and Fare (417/418).
test.describe()
Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|
count | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
mean | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
std | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
min | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
50% | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
max | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
对test数据操作填充Fare(均值),Age等下用线性回归填充
# Fill the single missing Fare with the column mean; Age is imputed later
# with a linear-regression model.
test['Fare'].fillna(test['Fare'].mean(),inplace=True)
数据编码
对非数值型数据转换为数值
Sex列转换
# Encode Sex numerically: male -> 1, female -> 0.
# One replace() per frame instead of four separate .loc assignments;
# values outside the mapping are left untouched, matching the original.
sex_codes = {'male': 1, 'female': 0}
train['Sex'] = train['Sex'].replace(sex_codes)
test['Sex'] = test['Sex'].replace(sex_codes)
Embarked转换
# Encode Embarked numerically: S -> 0, C -> 1, Q -> 2.
# One replace() per frame instead of six separate .loc assignments.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].replace(embarked_codes)
test['Embarked'] = test['Embarked'].replace(embarked_codes)
异常值处理
先取出标签Survived并从特征中删除。然后遍历train中每个单元格的数据,若其小于或大于该单元格所在列的一定百分位数,就标记该行存在异常值;再根据标记删除train中对应的行,剩下的数据即可作为模型的训练数据。
先取出标签Survived,删除标签。
# pop() removes the Survived column from train and returns it in one step,
# so the label is separated from the features.
lables_Survived = train.pop('Survived')
train.shape
(714, 7)
构造标记数组,元素个数与train的行数相同;如果train的某一行中存在异常值,就把对应位置标记为1。
# One flag per training row; a row whose flag becomes 1 holds an outlier.
mark_erro=np.zeros(train.shape[0])
train_erro=np.array(train)
# Exploratory check: median of column 1 (the Sex column after the drops above).
Min=np.percentile(train_erro[:,1],50)
Min
1.0
遍历train中每个cell的数据,然后标记mark_erro
# Flag outlier rows: for every feature column, any value outside the
# [8th, 92nd] percentile band marks its row as anomalous.
# (Indentation restored — it was flattened in the notebook export.)
train_erro = np.array(train, dtype=float)  # float ndarray so comparisons are element-wise
for i in range(train.shape[1]):
    col = train_erro[:, i]
    lo = np.percentile(col, 8)   # lower cutoff for this column
    hi = np.percentile(col, 92)  # upper cutoff for this column
    # Vectorized replacement of the original per-cell inner loop.
    mark_erro[(col < lo) | (col > hi)] = 1
过滤异常值,选取mark_erro为0的行
# Keep only the rows that were never flagged as outliers.
keep = mark_erro == 0
x_true = train.loc[keep]
y_true = lables_Survived.loc[keep]
x_true.head()
Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|
1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 1 |
2 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | 0 |
3 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 | 0 |
4 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 | 0 |
8 | 3 | 0 | 27.0 | 0 | 2 | 11.1333 | 0 |
线性回归粗糙预测年龄
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import GridSearchCV# grid search for hyper-parameter tuning
from sklearn.metrics import accuracy_score# classification accuracy for the survival model
取出年龄标签并删除
# x_true is a boolean-indexed slice of train; take an explicit copy first so
# the in-place drop no longer raises SettingWithCopyWarning (see the warning
# in the original cell output).
x_true = x_true.copy()
x_true_age = x_true['Age']  # Age becomes the regression target
x_true.drop('Age',inplace=True,axis=1)
E:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:3697: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
errors=errors)
线性回归模型
# Fit a linear model that predicts Age from the remaining features.
model_age_pre=LinearRegression()
model_age_pre.fit(x_true,x_true_age)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
预测分数R方预测
# R^2 of the age regression on its own training data (~0.13 — a rough imputation only).
model_age_pre.score(x_true,x_true_age)
0.13195844479498142
预测test数据中缺失的Age,先取出Age缺失的行来操作,删除Age缺失的行得到正确数据,然后将处理过后Age缺失的行拼接到正确数据后面
# Rows of test whose Age is missing; these will receive model-imputed ages.
test_Ageisnull=test.loc[test.Age.isnull()]
test_Ageisnull.head()
Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|
10 | 3 | 1 | NaN | 0 | 0 | 7.8958 | 0 |
22 | 1 | 0 | NaN | 0 | 0 | 31.6833 | 0 |
29 | 3 | 1 | NaN | 2 | 0 | 21.6792 | 1 |
33 | 3 | 0 | NaN | 1 | 2 | 23.4500 | 0 |
36 | 3 | 0 | NaN | 0 | 0 | 8.0500 | 0 |
test.isnull().sum()# only Age still has NaNs; those rows are dropped next
Pclass 0
Sex 0
Age 86
SibSp 0
Parch 0
Fare 0
Embarked 0
dtype: int64
# Drop the Age-missing rows; imputed copies are appended back afterwards.
test.dropna(inplace=True)
删除Age列作为数据给训练好的线性模型训练预测年龄
# Predict the missing ages with the fitted linear model, then put the
# imputed rows back into `test`.
x_age_pre = test_Ageisnull.drop('Age', axis=1)
age_pre = model_age_pre.predict(x_age_pre)
test_Ageisnull = test_Ageisnull.copy()  # avoid SettingWithCopyWarning on the assignment below
test_Ageisnull['Age'] = age_pre  # insert the predicted ages
# BUG FIX: the original called test.append(...) and discarded the returned
# frame, so the imputed rows were never actually added back to test.
test = pd.concat([test, test_Ageisnull])
test.head()
E:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|
0 | 3 | 1 | 34.5 | 0 | 0 | 7.8292 | 2 |
1 | 3 | 0 | 47.0 | 1 | 0 | 7.0000 | 0 |
2 | 2 | 1 | 62.0 | 0 | 0 | 9.6875 | 2 |
3 | 3 | 1 | 27.0 | 0 | 0 | 8.6625 | 0 |
4 | 3 | 0 | 22.0 | 1 | 1 | 12.2875 | 0 |
model_Survived_pre=LogisticRegression()
param_grid = np.linspace(0.001,1000,10) # candidate values to search over
param_grid = dict(C=param_grid)# GridSearchCV expects a dict; C is a LogisticRegression parameter
param_grid
{'C': array([1.00000e-03, 1.11112e+02, 2.22223e+02, 3.33334e+02, 4.44445e+02,
5.55556e+02, 6.66667e+02, 7.77778e+02, 8.88889e+02, 1.00000e+03])}
设计调过参的逻辑回归模型,并喂入训练集
# Grid-search over C with default cross-validation, fit on the cleaned training data.
gri=GridSearchCV(model_Survived_pre,param_grid)
gri.fit(x_true,y_true)
GridSearchCV(cv=None, error_score='raise',
estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False),
fit_params=None, iid=True, n_jobs=1,
param_grid={'C': array([1.00000e-03, 1.11112e+02, 2.22223e+02, 3.33334e+02, 4.44445e+02,
5.55556e+02, 6.66667e+02, 7.77778e+02, 8.88889e+02, 1.00000e+03])},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring=None, verbose=0)
查看模型评价得分
gri.best_score_# somewhat low — a better parameter range or model may be needed
0.7828947368421053
直接使用逻辑回归预测不调参
# Plain (untuned) logistic regression for comparison.
# NOTE(review): accuracy is measured on the training set itself.
model = LogisticRegression()
model.fit(x_true, y_true)
accuracy_score(y_true, model.predict(x_true))
0.8004385964912281