想法1:能否用某个算法求个权值矩阵,后用其.x数据,再用k-means聚类
想法2 :直接先用逻辑回归,随机森林
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
import warnings
warnings. filterwarnings( 'ignore' )
# Load the training and test CSVs from the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Peek at the first rows of the training frame.
train_data.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Print dtypes and non-null counts for both frames, separated by a dashed rule.
for position, frame in enumerate((train_data, test_data)):
    if position:
        print("-" * 40)
    frame.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Summary statistics (count/mean/std/quartiles) of the numeric training columns.
train_data. describe( )
PassengerId Survived Pclass Age SibSp Parch Fare count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000 mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208 std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429 min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000 25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400 50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200 75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000 max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
整体存活比例,以及性别对存活率的影响
# Pie chart of how many passengers fall into each Survived value, labelled in percent.
survival_counts = train_data['Survived'].value_counts()
survival_counts.plot.pie(autopct='%1.2f%%')
<matplotlib.axes._subplots.AxesSubplot at 0x1acc7eab518>
男性人数约为女性的两倍,但女性存活率高很多
# Bar chart of passenger counts for every (Sex, Survived) combination.
sex_survival_sizes = train_data.groupby(['Sex', 'Survived']).size()
sex_survival_sizes.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1acc9f482b0>
# Number of survivors per sex (Survived is 0/1, so the sum counts survivors).
survivors_by_sex = train_data[['Sex', 'Survived']].groupby(['Sex']).sum()
survivors_by_sex.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1acc9fa5b70>
# Survival rate per sex (mean of the 0/1 Survived column).
survival_rate_by_sex = train_data[['Sex', 'Survived']].groupby(['Sex']).mean()
survival_rate_by_sex.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1acca00aa58>
船舱等级似乎影响更大:3等舱人数最多,但存活率最低
# Bar chart of passenger counts for every (Pclass, Sex) combination.
class_sex_sizes = train_data.groupby(['Pclass', 'Sex']).size()
class_sex_sizes.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1acca081f28>
# Number of survivors per passenger class.
survivors_by_class = train_data[['Pclass', 'Survived']].groupby(['Pclass']).sum()
survivors_by_class.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1acca0e9400>
# Survival rate per passenger class.
survival_rate_by_class = train_data[['Pclass', 'Survived']].groupby(['Pclass']).mean()
survival_rate_by_class.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1acca162470>
不同船舱的女性存活率有所区别
# Survival rate for every (Pclass, Sex) combination.
class_sex_columns = train_data[['Sex', 'Pclass', 'Survived']]
survival_rate_by_class_and_sex = class_sex_columns.groupby(['Pclass', 'Sex']).mean()
survival_rate_by_class_and_sex.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1acca1bd1d0>
年龄与存活的关系
# Age distribution split by survival outcome: per passenger class on the left
# axis, per sex on the right axis.
fig, ax = plt.subplots(1, 2, figsize=(18, 8))
# x and y are passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally, so the old positional form fails there. Keywords work on
# older versions as well.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 100, 10))
sns.violinplot(x='Sex', y='Age', hue='Survived', data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 100, 10))
plt.show()
年龄分布特征分析,拼错单词难受呀
在15岁之前存活率较高,40以后没区别
# Overlay the age density of non-survivors (gray) and survivors (green).
fig, ax = plt.subplots(figsize=(10, 5))
for outcome, colour, caption in ((0, 'gray', 'Not Survived'), (1, 'g', 'Survived')):
    ages = train_data.loc[train_data['Survived'] == outcome, 'Age']
    sns.kdeplot(ages, shade=True, color=colour, label=caption)
plt.title('Age--Survived or NOt')
plt.xlabel('Age')
Text(0.5, 0, 'Age')
登船港口与存活与否的关系:S港口登船的人最多,获救率最低;C、Q港口登船的乘客是否大多为女性?
# One panel per passenger class, coloured by sex, counting passengers per
# embarkation port.
facet_options = {'col': 'Pclass', 'hue': 'Sex'}
grid = sns.FacetGrid(data=train_data, **facet_options)
grid.map(sns.countplot, 'Embarked')
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x1acca219668>
# Passenger counts per embarkation port, split by survival outcome.
# x is passed by keyword: seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x='Embarked', hue='Survived', data=train_data)
plt.title('Embarked and Survived')
Text(0.5, 1.0, 'Embarked and Survived')
# Mean of Survived per embarkation port, drawn as a bar chart.
# catplot is the current name of factorplot (renamed in seaborn 0.9, with the
# old alias removed in later releases). x and y are passed by keyword because
# newer seaborn no longer accepts them positionally.
sns.catplot(x='Embarked', y='Survived', data=train_data, kind='bar')
plt.title('Embarked and Survived rate')
Text(0.5, 1.0, 'Embarked and Survived rate')
第一次简易处理数据,只填充年龄,选取几个简单特征
# Impute missing training ages with the median of the observed ages.
age_median = train_data["Age"].median()
train_data["Age"] = train_data["Age"].fillna(age_median)
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Baseline: linear regression on the raw numeric columns, evaluated with
# 3-fold cross-validation over the training frame.
predictors = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
alg = LinearRegression()
# random_state is omitted: it has no effect when shuffle=False, and recent
# scikit-learn releases raise ValueError if it is set without shuffling.
# The folds are the same contiguous thirds as before.
kf = KFold(n_splits=3, shuffle=False)
predictions = []
for train, test in kf.split(train_data):
    # Fit on the two training folds, then predict the held-out fold.
    train_predictors = train_data[predictors].iloc[train, :]
    train_target = train_data["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(train_data[predictors].iloc[test, :])
    predictions.append(test_predictions)
# Show a few raw predictions per fold, the number of folds and the fold size.
print(predictions[0][1:10], predictions[1][1:10], predictions[2][1:10], '\n' * 2, len(predictions), len(predictions[0]))
[0.64716068 0.22381187 0.65781892 0.15821019 0.20954606 0.54764008
0.35828968 0.29636233 0.54633321] [0.62891234 0.77642663 0.22950758 0.13992775 0.29488562 0.42050708
0.22989295 1.06210609 0.73216932] [0.22171068 0.46757728 0.11475416 0.26813918 0.47166107 0.45771509
0.26832766 0.66241433 0.15305294]
3 297
# Stitch the per-fold predictions back together (the folds are contiguous and
# unshuffled, so row order matches train_data), threshold at 0.5 and score.
raw_scores = np.concatenate(predictions, axis=0)
predictions = np.where(raw_scores > 0.5, 1.0, 0.0)
accuracy = (predictions == train_data["Survived"]).mean()
print("准确率:", accuracy)
准确率: 0.7037037037037037
增加Sex、Embarked两个特征(Cabin只填充了缺失值,未作为特征使用),准确率提高约0.08
# Encode the categorical training columns as integers for the regression.
# replace() with an explicit mapping plus astype(int) gives an integer column
# regardless of pandas version (in-place .loc recoding can leave the column as
# object dtype on newer pandas) and is safe to re-run on encoded data.
train_data['Sex'] = train_data['Sex'].replace({'male': 0, 'female': 1}).astype(int)
# Missing embarkation ports are filled with 'C' before encoding.
train_data['Embarked'] = train_data['Embarked'].fillna('C')
# 'U0' marks an unknown cabin; Cabin is filled but not encoded here.
train_data['Cabin'] = train_data.Cabin.fillna('U0')
# The former "Embarked == 'U0' -> 0" step was dropped: 'U0' is the Cabin
# placeholder and never occurs in Embarked, so it matched no rows.
train_data['Embarked'] = train_data['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3}).astype(int)
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null int64
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 891 non-null object
Embarked 891 non-null int64
dtypes: float64(2), int64(7), object(3)
memory usage: 83.6+ KB
# Second model: same 3-fold linear regression, now including the encoded Sex
# and Embarked columns.
predictors_2 = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg_2 = LinearRegression()
# random_state is omitted: it has no effect when shuffle=False, and recent
# scikit-learn releases raise ValueError if it is set without shuffling.
kf_2 = KFold(n_splits=3, shuffle=False)
predictions_2 = []
for train, test in kf_2.split(train_data):
    # Fit on the two training folds, then predict the held-out fold.
    train_predictors_2 = train_data[predictors_2].iloc[train, :]
    train_target_2 = train_data["Survived"].iloc[train]
    alg_2.fit(train_predictors_2, train_target_2)
    test_predictions_2 = alg_2.predict(train_data[predictors_2].iloc[test, :])
    predictions_2.append(test_predictions_2)
# Folds are contiguous and unshuffled, so the concatenated predictions line up
# with train_data row order.
predictions_2 = np.concatenate(predictions_2, axis=0)
predictions_2[predictions_2 > 0.5] = 1
predictions_2[predictions_2 <= 0.5] = 0
accuracy_2 = sum(predictions_2 == train_data["Survived"]) / len(predictions_2)
print("准确率:", accuracy_2)
准确率: 0.7833894500561167
测试集处理
# Inspect dtypes and missing values of the test frame before preprocessing.
test_data. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Apply the training-set preprocessing to the test frame.
# Missing ages are filled with the training median so both frames share the
# same imputation value.
test_data["Age"] = test_data["Age"].fillna(train_data["Age"].median())
# An explicit mapping plus astype(int) gives integer columns regardless of
# pandas version and is safe to re-run on already-encoded data.
test_data['Sex'] = test_data['Sex'].replace({'male': 0, 'female': 1}).astype(int)
# Fixed copy-paste slip: this step previously re-filled train_data['Embarked']
# instead of the test frame's column.
test_data['Embarked'] = test_data['Embarked'].fillna('C')
# 'U0' marks an unknown cabin; Cabin is filled but not encoded here.
test_data['Cabin'] = test_data.Cabin.fillna('U0')
# The former "Embarked == 'U0' -> 0" step was dropped: 'U0' is the Cabin
# placeholder and never occurs in Embarked, so it matched no rows.
test_data['Embarked'] = test_data['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3}).astype(int)
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null int64
Age 418 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 418 non-null object
Embarked 418 non-null int64
dtypes: float64(2), int64(6), object(3)
memory usage: 36.0+ KB
# The test frame has one missing Fare; fill it with the training median.
train_fare_median = train_data["Fare"].median()
test_data["Fare"] = test_data["Fare"].fillna(train_fare_median)
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null int64
Age 418 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 418 non-null float64
Cabin 418 non-null object
Embarked 418 non-null int64
dtypes: float64(2), int64(6), object(3)
memory usage: 36.0+ KB
# Score the test frame with the second model.
test_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# After the cross-validation loop alg_2 still holds the fit from the last
# fold, which saw only two thirds of the rows. Refit on the whole training
# frame so the submitted predictions use all labelled data.
alg_2.fit(train_data[test_features], train_data['Survived'])
test_predictors = test_data[test_features]
# Raw regression outputs; they are thresholded in a later step.
test_data['Survived'] = alg_2.predict(test_predictors)
test_data.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Survived 0 892 3 Kelly, Mr. James 0 34.5 0 0 330911 7.8292 U0 3 0.158051 1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 47.0 1 0 363272 7.0000 U0 1 0.480204 2 894 2 Myles, Mr. Thomas Francis 0 62.0 0 0 240276 9.6875 U0 3 0.177382 3 895 3 Wirz, Mr. Albert 0 27.0 0 0 315154 8.6625 U0 1 0.106463 4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 22.0 1 1 3101298 12.2875 U0 1 0.617975
# Turn the regression scores into class labels: above 0.5 means survived.
# The result is cast to int so the submission carries 0/1 rather than 0.0/1.0.
# NOTE(review): the competition's expected format appears to be integer
# labels; confirm against the sample submission.
test_data['Survived'] = (test_data['Survived'] > 0.5).astype(int)
test_data.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Survived 0 892 3 Kelly, Mr. James 0 34.5 0 0 330911 7.8292 U0 3 0.0 1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 47.0 1 0 363272 7.0000 U0 1 0.0 2 894 2 Myles, Mr. Thomas Francis 0 62.0 0 0 240276 9.6875 U0 3 0.0 3 895 3 Wirz, Mr. Albert 0 27.0 0 0 315154 8.6625 U0 1 0.0 4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 22.0 1 1 3101298 12.2875 U0 1 1.0
# Build the submission frame from the passenger id and the predicted label.
submission = test_data[['PassengerId', 'Survived']].copy()
submission.head()
PassengerId Survived 0 892 0.0 1 893 0.0 2 894 0.0 3 895 0.0 4 896 1.0
# Sanity check: the mean of Survived is the predicted share of survivors.
submission. describe( )
PassengerId Survived count 418.000000 418.000000 mean 1100.500000 0.358852 std 120.810458 0.480238 min 892.000000 0.000000 25% 996.250000 0.000000 50% 1100.500000 0.000000 75% 1204.750000 1.000000 max 1309.000000 1.000000
# Write the submission file without the DataFrame index column.
submission. to_csv( 'titanic_submission.csv' , index = False )