import numpy as np
import pandas as pd
# Load the raw training set with its original header row and preview
# the first three records.
df = pd.read_csv(filepath_or_buffer='train.csv')
first_rows = df.head(n=3)
print(first_rows)
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
# Reload the training set with Chinese column names. header=0 tells pandas
# the file's first row is a header that `names` replaces, and the
# passenger-ID column becomes the index.
column_names = [
    '乘客ID', '是否幸存', '仓位等级', '姓名', '性别', '年龄',
    '兄弟姐妹个数', '父母子女个数', '船票信息', '票价', '客舱', '登船港口',
]
df = pd.read_csv(
    'train.csv',
    header=0,
    names=column_names,
    index_col='乘客ID',
)
print(df.head(n=3))
是否幸存 仓位等级 姓名 性别 \
乘客ID
1 0 3 Braund, Mr. Owen Harris male
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female
3 1 3 Heikkinen, Miss. Laina female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
1 22.0 1 0 A/5 21171 7.2500 NaN S
2 38.0 1 0 PC 17599 71.2833 C85 C
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
# DataFrame.info() prints its summary itself and returns None, so the
# explicit print below additionally emits the literal "None".
info_result = df.info()
print(info_result)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 是否幸存 891 non-null int64
1 仓位等级 891 non-null int64
2 姓名 891 non-null object
3 性别 891 non-null object
4 年龄 714 non-null float64
5 兄弟姐妹个数 891 non-null int64
6 父母子女个数 891 non-null int64
7 船票信息 891 non-null object
8 票价 891 non-null float64
9 客舱 204 non-null object
10 登船港口 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
# Show the first and the last ten rows of the table.
leading_rows = df.head(n=10)
trailing_rows = df.tail(n=10)
print(leading_rows)
print(trailing_rows)
是否幸存 仓位等级 姓名 性别 \
乘客ID
1 0 3 Braund, Mr. Owen Harris male
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female
3 1 3 Heikkinen, Miss. Laina female
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
5 0 3 Allen, Mr. William Henry male
6 0 3 Moran, Mr. James male
7 0 1 McCarthy, Mr. Timothy J male
8 0 3 Palsson, Master. Gosta Leonard male
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female
10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
1 22.0 1 0 A/5 21171 7.2500 NaN S
2 38.0 1 0 PC 17599 71.2833 C85 C
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 35.0 1 0 113803 53.1000 C123 S
5 35.0 0 0 373450 8.0500 NaN S
6 NaN 0 0 330877 8.4583 NaN Q
7 54.0 0 0 17463 51.8625 E46 S
8 2.0 3 1 349909 21.0750 NaN S
9 27.0 0 2 347742 11.1333 NaN S
10 14.0 1 0 237736 30.0708 NaN C
是否幸存 仓位等级 姓名 性别 年龄 \
乘客ID
882 0 3 Markun, Mr. Johann male 33.0
883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0
884 0 2 Banfield, Mr. Frederick James male 28.0
885 0 3 Sutehall, Mr. Henry Jr male 25.0
886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0
887 0 2 Montvila, Rev. Juozas male 27.0
888 1 1 Graham, Miss. Margaret Edith female 19.0
889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN
890 1 1 Behr, Mr. Karl Howell male 26.0
891 0 3 Dooley, Mr. Patrick male 32.0
兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
882 0 0 349257 7.8958 NaN S
883 0 0 7552 10.5167 NaN S
884 0 0 C.A./SOTON 34068 10.5000 NaN S
885 0 0 SOTON/OQ 392076 7.0500 NaN S
886 0 5 382652 29.1250 NaN Q
887 0 0 211536 13.0000 NaN S
888 0 0 112053 30.0000 B42 S
889 1 2 W./C. 6607 23.4500 NaN S
890 0 0 111369 30.0000 C148 C
891 0 0 370376 7.7500 NaN Q
# Boolean mask marking every missing cell (isna is the canonical name of
# the isnull alias).
df.isna()
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 乘客ID 1 False False False False False False False False False True False 2 False False False False False False False False False False False 3 False False False False False False False False False True False 4 False False False False False False False False False False False 5 False False False False False False False False False True False ... ... ... ... ... ... ... ... ... ... ... ... 887 False False False False False False False False False True False 888 False False False False False False False False False False False 889 False False False False True False False False False True False 890 False False False False False False False False False False False 891 False False False False False False False False False True False
891 rows × 11 columns
# Persist the table with Chinese column names. The original file name
# 'tain_cn.csv' looked like a typo of 'train_cn.csv' and is corrected here.
# NOTE(review): confirm nothing downstream reads the misspelled name.
df.to_csv('train_cn.csv')
# Rank passengers by fare, then age, both descending, and show the top ten.
by_fare_then_age = df.sort_values(by=['票价', '年龄'], ascending=False)
print(by_fare_then_age.head(n=10))
'''
根据常识可知,票价越高,客舱等级通常越好。
从排序结果可以看出,票价最高的10名乘客中有8人幸存,
这是相当高的比例。后续可以进一步分析票价与年龄的关系,以及票价与存活率的关系
'''
是否幸存 仓位等级 姓名 性别 \
乘客ID
680 1 1 Cardeza, Mr. Thomas Drake Martinez male
259 1 1 Ward, Miss. Anna female
738 1 1 Lesurer, Mr. Gustave J male
439 0 1 Fortune, Mr. Mark male
342 1 1 Fortune, Miss. Alice Elizabeth female
89 1 1 Fortune, Miss. Mabel Helen female
28 0 1 Fortune, Mr. Charles Alexander male
743 1 1 Ryerson, Miss. Susan Parker "Suzette" female
312 1 1 Ryerson, Miss. Emily Borie female
300 1 1 Baxter, Mrs. James (Helene DeLaudeniere Chaput) female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
680 36.0 0 1 PC 17755 512.3292 B51 B53 B55 C
259 35.0 0 0 PC 17755 512.3292 NaN C
738 35.0 0 0 PC 17755 512.3292 B101 C
439 64.0 1 4 19950 263.0000 C23 C25 C27 S
342 24.0 3 2 19950 263.0000 C23 C25 C27 S
89 23.0 3 2 19950 263.0000 C23 C25 C27 S
28 19.0 3 2 19950 263.0000 C23 C25 C27 S
743 21.0 2 2 PC 17608 262.3750 B57 B59 B63 B66 C
312 18.0 2 2 PC 17608 262.3750 B57 B59 B63 B66 C
300 50.0 0 1 PC 17558 247.5208 B58 B60 C
'\n根据常识我知道发现票价越高的应该客舱越好,\n所以我们会明显看出,票价前10的乘客中存活的有8人,\n这是相当高的一个比例,后期可以尝试分析票价和年龄的关系,票价和存活率的关系\n'
# Summary statistics (count, mean, std, quartiles, min/max) of the fare column.
fare = df['票价']
fare_summary = fare.describe()
print(fare_summary)
'''
一共有891个票价数据,
平均值约为:32.20,
标准差约为49.69,说明票价波动特别大,
25%的人的票价是低于7.91的,50%的人的票价低于14.45,75%的人的票价低于31.00,
票价最大值约为512.33,最小值为0。
'''
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: 票价, dtype: float64
'\n一共有891个票价数据,\n平均值约为:32.20,\n标准差约为49.69,说明票价波动特别大,\n25%的人的票价是低于7.91的,50%的人的票价低于14.45,75%的人的票价低于31.00,\n票价最大值约为512.33,最小值为0。\n'
# Sort once by cabin class and survival flag (both descending) and reuse
# the result for the head and tail previews.
by_class_then_survival = df.sort_values(
    by=['仓位等级', '是否幸存'],
    ascending=False,
)
print(by_class_then_survival.head(n=10))
print(by_class_then_survival.tail(n=10))
'''
通过观察可以发现,前10行都是3等舱(仓位等级数值最大,即最低等的舱位)的幸存者,
后10行都是1等舱(最高等的舱位)的遇难者;但这只是按[仓位等级, 是否幸存]降序排序的必然结果,并不能说明各等级的存活情况,
后期应按仓位等级分组统计存活率,再探索仓位等级与存活的关系
'''
是否幸存 仓位等级 姓名 性别 \
乘客ID
3 1 3 Heikkinen, Miss. Laina female
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female
11 1 3 Sandstrom, Miss. Marguerite Rut female
20 1 3 Masselmani, Mrs. Fatima female
23 1 3 McGowan, Miss. Anna "Annie" female
26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female
29 1 3 O'Dwyer, Miss. Ellen "Nellie" female
33 1 3 Glynn, Miss. Mary Agatha female
37 1 3 Mamee, Mr. Hanna male
40 1 3 Nicola-Yarred, Miss. Jamila female
年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
9 27.0 0 2 347742 11.1333 NaN S
11 4.0 1 1 PP 9549 16.7000 G6 S
20 NaN 0 0 2649 7.2250 NaN C
23 15.0 0 0 330923 8.0292 NaN Q
26 38.0 1 5 347077 31.3875 NaN S
29 NaN 0 0 330959 7.8792 NaN Q
33 NaN 0 0 335677 7.7500 NaN Q
37 NaN 0 0 2677 7.2292 NaN C
40 14.0 1 0 2651 11.2417 NaN C
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 \
乘客ID
749 0 1 Marvin, Mr. Daniel Warner male 19.0 1
767 0 1 Brewe, Dr. Arthur Jackson male NaN 0
783 0 1 Long, Mr. Milton Clyde male 29.0 0
790 0 1 Guggenheim, Mr. Benjamin male 46.0 0
794 0 1 Hoyt, Mr. William Fisher male NaN 0
807 0 1 Andrews, Mr. Thomas Jr male 39.0 0
816 0 1 Fry, Mr. Richard male NaN 0
823 0 1 Reuchlin, Jonkheer. John George male 38.0 0
868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0
873 0 1 Carlsson, Mr. Frans Olof male 33.0 0
父母子女个数 船票信息 票价 客舱 登船港口
乘客ID
749 0 113773 53.1000 D30 S
767 0 112379 39.6000 NaN C
783 0 113501 30.0000 D6 S
790 0 PC 17593 79.2000 B82 B84 C
794 0 PC 17600 30.6958 NaN C
807 0 112050 0.0000 A36 S
816 0 112058 0.0000 B102 S
823 0 19972 0.0000 NaN S
868 0 PC 17590 50.4958 A24 S
873 0 695 5.0000 B51 B53 B55 S
'\n通过观察可以发现,前10仓位等级最高者(3级),全部幸存,\n后10仓位等级最低者(1级),全部未能幸免\n后期可以尝试探索仓位等级与存活的关系\n'
# info() prints the non-null counts per column (and returns None, which the
# outer print also emits); the second statement counts missing cells
# per column.
info_result = df.info()
print(info_result)
missing_per_column = df.isna().sum()
print(missing_per_column)
'''
通过观察可以发现,年龄,客舱,登船港口有缺失值
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 是否幸存 891 non-null int64
1 仓位等级 891 non-null int64
2 姓名 891 non-null object
3 性别 891 non-null object
4 年龄 714 non-null float64
5 兄弟姐妹个数 891 non-null int64
6 父母子女个数 891 non-null int64
7 船票信息 891 non-null object
8 票价 891 non-null float64
9 客舱 204 non-null object
10 登船港口 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
是否幸存 0
仓位等级 0
姓名 0
性别 0
年龄 177
兄弟姐妹个数 0
父母子女个数 0
船票信息 0
票价 0
客舱 687
登船港口 2
dtype: int64
'\n通过观察可以发现,年龄,客舱,登船港口有缺失值\n'
# Preview the first five rows (5 is head()'s default).
df.head(n=5)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 年龄类别 性别类别 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 2 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 5 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 2 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 2 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 4 1
# Preview of the table with every missing cell replaced by 0. The result is
# not assigned, so df itself is left unchanged.
df.fillna(value=0)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 0 S 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 0 S 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 0 S ... ... ... ... ... ... ... ... ... ... ... ... 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 0 S 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 889 0 0 0 0 0.0 0 0 0 0.0000 0 0 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 0 Q
891 rows × 11 columns
# Select the rows that repeat an earlier row exactly (the first occurrence
# is not flagged).
is_repeat = df.duplicated()
df.loc[is_repeat]
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 乘客ID 18 0 0 0 0 0.0 0 0 0 0.0 0 0 20 0 0 0 0 0.0 0 0 0 0.0 0 0 27 0 0 0 0 0.0 0 0 0 0.0 0 0 29 0 0 0 0 0.0 0 0 0 0.0 0 0 30 0 0 0 0 0.0 0 0 0 0.0 0 0 ... ... ... ... ... ... ... ... ... ... ... ... 860 0 0 0 0 0.0 0 0 0 0.0 0 0 864 0 0 0 0 0.0 0 0 0 0.0 0 0 869 0 0 0 0 0.0 0 0 0 0.0 0 0 879 0 0 0 0 0.0 0 0 0 0.0 0 0 889 0 0 0 0 0.0 0 0 0 0.0 0 0
176 rows × 11 columns
# Keep only the first occurrence of each fully identical row, then print
# the resulting schema and non-null counts.
df = df.drop_duplicates(keep='first')
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 715 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 是否幸存 715 non-null int64
1 仓位等级 715 non-null int64
2 姓名 715 non-null object
3 性别 715 non-null object
4 年龄 715 non-null float64
5 兄弟姐妹个数 715 non-null int64
6 父母子女个数 715 non-null int64
7 船票信息 715 non-null object
8 票价 715 non-null float64
9 客舱 186 non-null object
10 登船港口 713 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 67.0+ KB
'''
我们对特征进行一下观察,可以把特征大概分为两大类:
数值型特征:Survived ,Pclass, Age ,SibSp, Parch, Fare,
其中Survived, Pclass为离散型数值特征,
Age,SibSp, Parch, Fare为连续型数值特征
文本型特征:Name, Sex, Cabin,Embarked, Ticket,
其中Sex, Cabin, Embarked, Ticket为类别型文本特征。
数值型特征一般可以直接用于模型的训练,
但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。
文本型特征往往需要转换成数值型特征才能用于建模分析。
'''
'\n我们对特征进行一下观察,可以把特征大概分为两大类: \n数值型特征:Survived ,Pclass, Age ,SibSp, Parch, Fare,\n其中Survived, Pclass为离散型数值特征,\nAge,SibSp, Parch, Fare为连续型数值特征 \n\n文本型特征:Name, Sex, Cabin,Embarked, Ticket,\n其中Sex, Cabin, Embarked, Ticket为类别型文本特征。\n\n数值型特征一般可以直接用于模型的训练,\n但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。\n文本型特征往往需要转换成数值型特征才能用于建模分析。\n'
# Discretise age into five equal-width intervals labelled 1..5.
df['年龄类别'] = pd.cut(df['年龄'], bins=5, labels=list(range(1, 6)))
df.head(n=5)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 年龄类别 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 2 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 3 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 2 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 3 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 3
# Discretise age with hand-picked edges: (0, 5], (5, 15], (15, 30],
# (30, 50], (50, 80], labelled 1..5. Intervals are right-closed, which is
# pd.cut's default.
age_edges = [0, 5, 15, 30, 50, 80]
df['年龄类别'] = pd.cut(df['年龄'], bins=age_edges, labels=list(range(1, 6)))
df.head(n=5)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 年龄类别 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 3 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 4 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 4
# Discretise age by quantiles into five groups labelled 1..5, with cut
# points at the 10th, 30th, 50th and 70th percentiles.
# The original edge list stopped at 0.9, so every age above the 90th
# percentile fell outside all bins and silently became NaN. Ending the last
# bin at 1 keeps labels 1..4 unchanged and assigns those former NaN rows to
# label 5.
age_quantiles = [0, 0.1, 0.3, 0.5, 0.7, 1]
df['年龄类别'] = pd.qcut(df['年龄'], q=age_quantiles, labels=[1, 2, 3, 4, 5])
df.head()
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 年龄类别 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 2 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 5 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 4
# Frequency table of each categorical text column: sex, port of
# embarkation, cabin.
for categorical_column in ('性别', '登船港口', '客舱'):
    print(df[categorical_column].value_counts())
male 453
female 261
0 1
Name: 性别, dtype: int64
S 554
C 130
Q 28
0 1
Name: 登船港口, dtype: int64
G6 4
C23 C25 C27 4
B96 B98 4
F2 3
F33 3
..
A6 1
C104 1
B39 1
B69 1
0 1
Name: 客舱, Length: 135, dtype: int64
# Numeric encoding of sex: male -> 1, female -> 2. Other values pass
# through unchanged.
sex = df['性别']
df['性别类别'] = sex.replace(to_replace=['male', 'female'], value=[1, 2])
df.head(n=5)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 登船港口 年龄类别 性别类别 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 2 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 5 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 2 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 2 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 4 1
from sklearn.preprocessing import LabelEncoder

# Integer-encode the cabin column.
# The column mixes strings with float NaN (missing cabins), and LabelEncoder
# sorts the values internally, which raised
# "TypeError: '<' not supported between instances of 'str' and 'float'".
# Casting to str first makes every value comparable; missing cabins become
# the category 'nan' and get their own code.
df['客舱'] = LabelEncoder().fit_transform(df['客舱'].astype(str))
df.head()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-28-a090f7066f88> in <module>
1 from sklearn.preprocessing import LabelEncoder
----> 2 df['客舱'] = LabelEncoder().fit_transform(df['客舱'])
3 df.head()
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
110 """
111 y = column_or_1d(y, warn=True)
--> 112 self.classes_, y = np.unique(y, return_inverse=True)
113 return y
114
<__array_function__ internals> in unique(*args, **kwargs)
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
259 ar = np.asanyarray(ar)
260 if axis is None:
--> 261 ret = _unique1d(ar, return_index, return_inverse, return_counts)
262 return _unpack_tuple(ret)
263
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
317
318 if optional_indices:
--> 319 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
320 aux = ar[perm]
321 else:
TypeError: '<' not supported between instances of 'str' and 'float'
# Integer-encode the port-of-embarkation column.
# The column holds a mix of strings and non-string values (an int 0 and/or
# NaN), which made LabelEncoder's internal sort raise
# "TypeError: '<' not supported between instances of 'int' and 'str'".
# Casting to str first makes every value comparable; each distinct
# stringified value (including '0' / 'nan') gets its own code.
df['登船港口'] = LabelEncoder().fit_transform(df['登船港口'].astype(str))
df.head()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-34-f52d686f7adb> in <module>
----> 1 df['登船港口'] = LabelEncoder().fit_transform(df['登船港口'])
2 df.head()
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
110 """
111 y = column_or_1d(y, warn=True)
--> 112 self.classes_, y = np.unique(y, return_inverse=True)
113 return y
114
<__array_function__ internals> in unique(*args, **kwargs)
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
259 ar = np.asanyarray(ar)
260 if axis is None:
--> 261 ret = _unique1d(ar, return_index, return_inverse, return_counts)
262 return _unpack_tuple(ret)
263
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
317
318 if optional_indices:
--> 319 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
320 aux = ar[perm]
321 else:
TypeError: '<' not supported between instances of 'int' and 'str'
# One-hot encode the age column (one indicator column per distinct age
# value), append the indicators to the table, and save the result.
x = pd.get_dummies(df['年龄'], prefix='年龄')
frames_to_join = [df, x]
df = pd.concat(frames_to_join, axis=1)
df
df.to_csv(path_or_buf='train_onehot.csv')
# One-hot encode cabin and port of embarkation, appending each set of
# indicator columns to the right of the table in that order.
for column in ('客舱', '登船港口'):
    x = pd.get_dummies(df[column], prefix=column)
    frames_to_join = [df, x]
    df = pd.concat(frames_to_join, axis=1)
df.head(n=5)
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 ... 客舱_F G73 客舱_F2 客舱_F33 客舱_F4 客舱_G6 客舱_T 登船港口_0 登船港口_C 登船港口_Q 登船港口_S 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN ... 0 0 0 0 0 0 0 0 0 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 ... 0 0 0 0 0 0 0 1 0 0 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN ... 0 0 0 0 0 0 0 0 0 1 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 ... 0 0 0 0 0 0 0 0 0 1 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN ... 0 0 0 0 0 0 0 0 0 1
5 rows × 643 columns
# Extract the honorific (Mr, Mrs, Miss, ...) from the name column: the run
# of letters immediately before the first literal dot.
# The pattern is now a raw string; in a normal string literal "\." is an
# invalid escape sequence that newer Python versions warn about.
# expand=False returns a Series, which is what a single-column assignment
# needs. The extracted values are unchanged.
df['Title'] = df['姓名'].str.extract(r'([A-Za-z]+)\.', expand=False)
df
是否幸存 仓位等级 姓名 性别 年龄 兄弟姐妹个数 父母子女个数 船票信息 票价 客舱 ... 客舱_F2 客舱_F33 客舱_F4 客舱_G6 客舱_T 登船港口_0 登船港口_C 登船港口_Q 登船港口_S Title 乘客ID 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN ... 0 0 0 0 0 0 0 0 1 Mr 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 ... 0 0 0 0 0 0 1 0 0 Mrs 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN ... 0 0 0 0 0 0 0 0 1 Miss 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 ... 0 0 0 0 0 0 0 0 1 Mrs 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN ... 0 0 0 0 0 0 0 0 1 Mr ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 NaN ... 0 0 0 0 0 0 0 1 0 Mrs 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN ... 0 0 0 0 0 0 0 0 1 Rev 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 ... 0 0 0 0 0 0 0 0 1 Miss 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 ... 0 0 0 0 0 0 1 0 0 Mr 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN ... 0 0 0 0 0 0 0 1 0 Mr
715 rows × 644 columns
# Write the fully processed table (index included) to disk.
df.to_csv(path_or_buf='test_fin.csv')