泰坦尼克号 Notebook

# 导入库
# 将 numpy 和 pandas 导入并命名为np、pd
import numpy as np
import pandas as pd
# 使用相对路径导入csv数据,并
df = pd.read_csv('train.csv')

# 展示数据的前三行 —— 观察数据
print(df.head(3))
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
# 导入数据,(names)重命名列名和(index_col)行索引名,并(header=0)忽略原始列名
df = pd.read_csv('train.csv', names=['乘客ID','是否幸存','仓位等级','姓名','性别','年龄','兄弟姐妹个数','父母子女个数','船票信息','票价','客舱','登船港口'],index_col='乘客ID',header=0)

# 展示数据的前三行 —— 观察数据
print(df.head(3))
      是否幸存  仓位等级                                                 姓名      性别  \
乘客ID                                                                          
1        0     3                            Braund, Mr. Owen Harris    male   
2        1     1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
3        1     3                             Heikkinen, Miss. Laina  female   

        年龄  兄弟姐妹个数  父母子女个数              船票信息       票价   客舱 登船港口  
乘客ID                                                             
1     22.0       1       0         A/5 21171   7.2500  NaN    S  
2     38.0       1       0          PC 17599  71.2833  C85    C  
3     26.0       0       0  STON/O2. 3101282   7.9250  NaN    S  
# 查看数据的基本信息(info) 每一列的非空值的个数、数据类型、文件大小 —— 观察数据
print(df.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   是否幸存    891 non-null    int64  
 1   仓位等级    891 non-null    int64  
 2   姓名      891 non-null    object 
 3   性别      891 non-null    object 
 4   年龄      714 non-null    float64
 5   兄弟姐妹个数  891 non-null    int64  
 6   父母子女个数  891 non-null    int64  
 7   船票信息    891 non-null    object 
 8   票价      891 non-null    float64
 9   客舱      204 non-null    object 
 10  登船港口    889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
# 观察前(head)10行和后(tail)10行 —— 观察数据
print(df.head(10))
print(df.tail(10))
      是否幸存  仓位等级                                                 姓名      性别  \
乘客ID                                                                          
1        0     3                            Braund, Mr. Owen Harris    male   
2        1     1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
3        1     3                             Heikkinen, Miss. Laina  female   
4        1     1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
5        0     3                           Allen, Mr. William Henry    male   
6        0     3                                   Moran, Mr. James    male   
7        0     1                            McCarthy, Mr. Timothy J    male   
8        0     3                     Palsson, Master. Gosta Leonard    male   
9        1     3  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
10       1     2                Nasser, Mrs. Nicholas (Adele Achem)  female   

        年龄  兄弟姐妹个数  父母子女个数              船票信息       票价    客舱 登船港口  
乘客ID                                                              
1     22.0       1       0         A/5 21171   7.2500   NaN    S  
2     38.0       1       0          PC 17599  71.2833   C85    C  
3     26.0       0       0  STON/O2. 3101282   7.9250   NaN    S  
4     35.0       1       0            113803  53.1000  C123    S  
5     35.0       0       0            373450   8.0500   NaN    S  
6      NaN       0       0            330877   8.4583   NaN    Q  
7     54.0       0       0             17463  51.8625   E46    S  
8      2.0       3       1            349909  21.0750   NaN    S  
9     27.0       0       2            347742  11.1333   NaN    S  
10    14.0       1       0            237736  30.0708   NaN    C  
      是否幸存  仓位等级                                        姓名      性别    年龄  \
乘客ID                                                                       
882      0     3                        Markun, Mr. Johann    male  33.0   
883      0     3              Dahlberg, Miss. Gerda Ulrika  female  22.0   
884      0     2             Banfield, Mr. Frederick James    male  28.0   
885      0     3                    Sutehall, Mr. Henry Jr    male  25.0   
886      0     3      Rice, Mrs. William (Margaret Norton)  female  39.0   
887      0     2                     Montvila, Rev. Juozas    male  27.0   
888      1     1              Graham, Miss. Margaret Edith  female  19.0   
889      0     3  Johnston, Miss. Catherine Helen "Carrie"  female   NaN   
890      1     1                     Behr, Mr. Karl Howell    male  26.0   
891      0     3                       Dooley, Mr. Patrick    male  32.0   

      兄弟姐妹个数  父母子女个数              船票信息       票价    客舱 登船港口  
乘客ID                                                        
882        0       0            349257   7.8958   NaN    S  
883        0       0              7552  10.5167   NaN    S  
884        0       0  C.A./SOTON 34068  10.5000   NaN    S  
885        0       0   SOTON/OQ 392076   7.0500   NaN    S  
886        0       5            382652  29.1250   NaN    Q  
887        0       0            211536  13.0000   NaN    S  
888        0       0            112053  30.0000   B42    S  
889        1       2        W./C. 6607  23.4500   NaN    S  
890        0       0            111369  30.0000  C148    C  
891        0       0            370376   7.7500   NaN    Q  
# 判断数据是否为空 (返回一个true/false) —— 观察数据
df.isnull()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口
乘客ID
1FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
5FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
....................................
887FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
888FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
889FalseFalseFalseFalseTrueFalseFalseFalseFalseTrueFalse
890FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
891FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse

891 rows × 11 columns

# 另存为当前的以改变数据(to csv)为csv —— 保存数据
df.to_csv('tain_cn.csv')
# 查询 对票价和年龄降序排序后的前10行
print(df.sort_values(by=['票价','年龄'],ascending=False).head(10))
'''
根据常识我知道发现票价越高的应该客舱越好,
所以我们会明显看出,票价前10的乘客中存活的有8人,
这是相当高的一个比例,后期可以尝试分析票价和年龄的关系,票价和存活率的关系
'''

      是否幸存  仓位等级                                               姓名      性别  \
乘客ID                                                                        
680      1     1               Cardeza, Mr. Thomas Drake Martinez    male   
259      1     1                                 Ward, Miss. Anna  female   
738      1     1                           Lesurer, Mr. Gustave J    male   
439      0     1                                Fortune, Mr. Mark    male   
342      1     1                   Fortune, Miss. Alice Elizabeth  female   
89       1     1                       Fortune, Miss. Mabel Helen  female   
28       0     1                   Fortune, Mr. Charles Alexander    male   
743      1     1            Ryerson, Miss. Susan Parker "Suzette"  female   
312      1     1                       Ryerson, Miss. Emily Borie  female   
300      1     1  Baxter, Mrs. James (Helene DeLaudeniere Chaput)  female   

        年龄  兄弟姐妹个数  父母子女个数      船票信息        票价               客舱 登船港口  
乘客ID                                                                  
680   36.0       0       1  PC 17755  512.3292      B51 B53 B55    C  
259   35.0       0       0  PC 17755  512.3292              NaN    C  
738   35.0       0       0  PC 17755  512.3292             B101    C  
439   64.0       1       4     19950  263.0000      C23 C25 C27    S  
342   24.0       3       2     19950  263.0000      C23 C25 C27    S  
89    23.0       3       2     19950  263.0000      C23 C25 C27    S  
28    19.0       3       2     19950  263.0000      C23 C25 C27    S  
743   21.0       2       2  PC 17608  262.3750  B57 B59 B63 B66    C  
312   18.0       2       2  PC 17608  262.3750  B57 B59 B63 B66    C  
300   50.0       0       1  PC 17558  247.5208          B58 B60    C  





'\n根据常识我知道发现票价越高的应该客舱越好,\n所以我们会明显看出,票价前10的乘客中存活的有8人,\n这是相当高的一个比例,后期可以尝试分析票价和年龄的关系,票价和存活率的关系\n'
# 查询票价的描述性统计信息
print(df['票价'].describe())
'''
一共有891个票价数据,
平均值约为:32.20,
标准差约为49.69,说明票价波动特别大,
25%的人的票价是低于7.91的,50%的人的票价低于14.45,75%的人的票价低于31.00,
票价最大值约为512.33,最小值为0。
'''
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: 票价, dtype: float64





'\n一共有891个票价数据,\n平均值约为:32.20,\n标准差约为49.69,说明票价波动特别大,\n25%的人的票价是低于7.91的,50%的人的票价低于14.45,75%的人的票价低于31.00,\n票价最大值约为512.33,最小值为0。\n'
# 对仓位等级和存活进行降序 查看前10行和后10行数据
print(df.sort_values(by=['仓位等级','是否幸存'],ascending=False).head(10))
print(df.sort_values(by=['仓位等级','是否幸存'],ascending=False).tail(10))
'''
通过观察可以发现,前10仓位等级最高者(3级),全部幸存,
后10仓位等级最低者(1级),全部未能幸免
后期可以尝试探索仓位等级与存活的关系
'''

# 可以进行更多的排序观察,如年龄,性别等,并观察其描述性统计
      是否幸存  仓位等级                                                 姓名      性别  \
乘客ID                                                                          
3        1     3                             Heikkinen, Miss. Laina  female   
9        1     3  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
11       1     3                    Sandstrom, Miss. Marguerite Rut  female   
20       1     3                            Masselmani, Mrs. Fatima  female   
23       1     3                        McGowan, Miss. Anna "Annie"  female   
26       1     3  Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...  female   
29       1     3                      O'Dwyer, Miss. Ellen "Nellie"  female   
33       1     3                           Glynn, Miss. Mary Agatha  female   
37       1     3                                   Mamee, Mr. Hanna    male   
40       1     3                        Nicola-Yarred, Miss. Jamila  female   

        年龄  兄弟姐妹个数  父母子女个数              船票信息       票价   客舱 登船港口  
乘客ID                                                             
3     26.0       0       0  STON/O2. 3101282   7.9250  NaN    S  
9     27.0       0       2            347742  11.1333  NaN    S  
11     4.0       1       1           PP 9549  16.7000   G6    S  
20     NaN       0       0              2649   7.2250  NaN    C  
23    15.0       0       0            330923   8.0292  NaN    Q  
26    38.0       1       5            347077  31.3875  NaN    S  
29     NaN       0       0            330959   7.8792  NaN    Q  
33     NaN       0       0            335677   7.7500  NaN    Q  
37     NaN       0       0              2677   7.2292  NaN    C  
40    14.0       1       0              2651  11.2417  NaN    C  
      是否幸存  仓位等级                                    姓名    性别    年龄  兄弟姐妹个数  \
乘客ID                                                                         
749      0     1             Marvin, Mr. Daniel Warner  male  19.0       1   
767      0     1             Brewe, Dr. Arthur Jackson  male   NaN       0   
783      0     1                Long, Mr. Milton Clyde  male  29.0       0   
790      0     1              Guggenheim, Mr. Benjamin  male  46.0       0   
794      0     1              Hoyt, Mr. William Fisher  male   NaN       0   
807      0     1                Andrews, Mr. Thomas Jr  male  39.0       0   
816      0     1                      Fry, Mr. Richard  male   NaN       0   
823      0     1       Reuchlin, Jonkheer. John George  male  38.0       0   
868      0     1  Roebling, Mr. Washington Augustus II  male  31.0       0   
873      0     1              Carlsson, Mr. Frans Olof  male  33.0       0   

      父母子女个数      船票信息       票价           客舱 登船港口  
乘客ID                                               
749        0    113773  53.1000          D30    S  
767        0    112379  39.6000          NaN    C  
783        0    113501  30.0000           D6    S  
790        0  PC 17593  79.2000      B82 B84    C  
794        0  PC 17600  30.6958          NaN    C  
807        0    112050   0.0000          A36    S  
816        0    112058   0.0000         B102    S  
823        0     19972   0.0000          NaN    S  
868        0  PC 17590  50.4958          A24    S  
873        0       695   5.0000  B51 B53 B55    S  





'\n通过观察可以发现,前10仓位等级最高者(3级),全部幸存,\n后10仓位等级最低者(1级),全部未能幸免\n后期可以尝试探索仓位等级与存活的关系\n'
# 缺失值观察和处理

# 法1:info 返回非空值的个数
print(df.info())

# 法2:计算空缺值个数的和
print(df.isnull().sum())
'''
通过观察可以发现,年龄,客舱,登船港口有缺失值
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   是否幸存    891 non-null    int64  
 1   仓位等级    891 non-null    int64  
 2   姓名      891 non-null    object 
 3   性别      891 non-null    object 
 4   年龄      714 non-null    float64
 5   兄弟姐妹个数  891 non-null    int64  
 6   父母子女个数  891 non-null    int64  
 7   船票信息    891 non-null    object 
 8   票价      891 non-null    float64
 9   客舱      204 non-null    object 
 10  登船港口    889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
是否幸存        0
仓位等级        0
姓名          0
性别          0
年龄        177
兄弟姐妹个数      0
父母子女个数      0
船票信息        0
票价          0
客舱        687
登船港口        2
dtype: int64





'\n通过观察可以发现,年龄,客舱,登船港口有缺失值\n'
# 删除缺失值

# 删除含有缺失值的列和行
# df.dropna()

# 找到缺失值并赋值为 0 
# df[df['年龄'].isna()] = 0
df.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口年龄类别性别类别
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS21
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C52
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS32
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S42
503Allen, Mr. William Henrymale35.0003734508.0500NaNS41
# 填充空值 fillna() 用0来填空所有数值型的空值
df.fillna(0)
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.25000S
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.92500S
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.05000S
....................................
88702Montvila, Rev. Juozasmale27.00021153613.00000S
88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S
88900000.00000.000000
89011Behr, Mr. Karl Howellmale26.00011136930.0000C148C
89103Dooley, Mr. Patrickmale32.0003703767.75000Q

891 rows × 11 columns

# 查看数据中的重复值
df[df.duplicated()]

是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口
乘客ID
1800000.00000.000
2000000.00000.000
2700000.00000.000
2900000.00000.000
3000000.00000.000
....................................
86000000.00000.000
86400000.00000.000
86900000.00000.000
87900000.00000.000
88900000.00000.000

176 rows × 11 columns

# 清理重复值
df = df.drop_duplicates()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 715 entries, 1 to 891
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   是否幸存    715 non-null    int64  
 1   仓位等级    715 non-null    int64  
 2   姓名      715 non-null    object 
 3   性别      715 non-null    object 
 4   年龄      715 non-null    float64
 5   兄弟姐妹个数  715 non-null    int64  
 6   父母子女个数  715 non-null    int64  
 7   船票信息    715 non-null    object 
 8   票价      715 non-null    float64
 9   客舱      186 non-null    object 
 10  登船港口    713 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 67.0+ KB
# 特征观察与处理
'''
我们对特征进行一下观察,可以把特征大概分为两大类:  
数值型特征:Survived ,Pclass, Age ,SibSp, Parch, Fare,
其中Survived, Pclass为离散型数值特征,
Age,SibSp, Parch, Fare为连续型数值特征  

文本型特征:Name, Sex, Cabin,Embarked, Ticket,
其中Sex, Cabin, Embarked, Ticket为类别型文本特征。

数值型特征一般可以直接用于模型的训练,
但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。
文本型特征往往需要转换成数值型特征才能用于建模分析。
'''
'\n我们对特征进行一下观察,可以把特征大概分为两大类:  \n数值型特征:Survived ,Pclass, Age ,SibSp, Parch, Fare,\n其中Survived, Pclass为离散型数值特征,\nAge,SibSp, Parch, Fare为连续型数值特征  \n\n文本型特征:Name, Sex, Cabin,Embarked, Ticket,\n其中Sex, Cabin, Embarked, Ticket为类别型文本特征。\n\n数值型特征一般可以直接用于模型的训练,\n但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。\n文本型特征往往需要转换成数值型特征才能用于建模分析。\n'
# 将连续变量Age平均分箱成5个年龄段,并分别用类别变量12345表示
df['年龄类别'] = pd.cut(df['年龄'], 5,labels = [1,2,3,4,5])
df.head()

是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口年龄类别
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS2
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C3
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS2
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S3
503Allen, Mr. William Henrymale35.0003734508.0500NaNS3
# #将连续变量Age划分为(0,5] (5,15] (15,30] (30,50] (50,80]五个年龄段,并分别用类别变量12345表示
df['年龄类别'] = pd.cut(df['年龄'],[0,5,15,30,50,80],labels = [1,2,3,4,5])
df.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口年龄类别
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS3
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C4
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS3
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S4
503Allen, Mr. William Henrymale35.0003734508.0500NaNS4
#将连续变量Age按10% 30% 50 70% 90%五个年龄段,并用分类变量12345表示
df['年龄类别'] = pd.qcut(df['年龄'],[0,0.1,0.3,0.5,0.7,0.9],labels = [1,2,3,4,5])
df.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口年龄类别
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS2
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C5
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS3
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S4
503Allen, Mr. William Henrymale35.0003734508.0500NaNS4
print(df['性别'].value_counts()) # 查看'Sex'列中的变量及种类
print(df['登船港口'].value_counts()) # 查看'Cabin'列中的变量及种类
print(df['客舱'].value_counts()) # 查看'Embarked'列中的变量及种类
male      453
female    261
0           1
Name: 性别, dtype: int64
S    554
C    130
Q     28
0      1
Name: 登船港口, dtype: int64
G6             4
C23 C25 C27    4
B96 B98        4
F2             3
F33            3
              ..
A6             1
C104           1
B39            1
B69            1
0              1
Name: 客舱, Length: 135, dtype: int64
#将类别文本转换为12345

#方法一: replace
# 男1女2
# replace() 方法把字符串中的 old(旧字符串) 替换成 new(新字符串)
df['性别类别'] = df['性别'].replace(['male','female'],[1,2])
df.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口年龄类别性别类别
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS21
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C52
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS32
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S42
503Allen, Mr. William Henrymale35.0003734508.0500NaNS41
from sklearn.preprocessing import LabelEncoder
df['客舱'] = LabelEncoder().fit_transform(df['客舱'])
df.head()
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-28-a090f7066f88> in <module>
      1 from sklearn.preprocessing import LabelEncoder
----> 2 df['客舱'] = LabelEncoder().fit_transform(df['客舱'])
      3 df.head()


/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
    110         """
    111         y = column_or_1d(y, warn=True)
--> 112         self.classes_, y = np.unique(y, return_inverse=True)
    113         return y
    114 


<__array_function__ internals> in unique(*args, **kwargs)


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
    259     ar = np.asanyarray(ar)
    260     if axis is None:
--> 261         ret = _unique1d(ar, return_index, return_inverse, return_counts)
    262         return _unpack_tuple(ret)
    263 


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
    317 
    318     if optional_indices:
--> 319         perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
    320         aux = ar[perm]
    321     else:


TypeError: '<' not supported between instances of 'str' and 'float'
df['登船港口'] = LabelEncoder().fit_transform(df['登船港口'])
df.head()

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-34-f52d686f7adb> in <module>
----> 1 df['登船港口'] = LabelEncoder().fit_transform(df['登船港口'])
      2 df.head()


/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
    110         """
    111         y = column_or_1d(y, warn=True)
--> 112         self.classes_, y = np.unique(y, return_inverse=True)
    113         return y
    114 


<__array_function__ internals> in unique(*args, **kwargs)


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
    259     ar = np.asanyarray(ar)
    260     if axis is None:
--> 261         ret = _unique1d(ar, return_index, return_inverse, return_counts)
    262         return _unpack_tuple(ret)
    263 


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
    317 
    318     if optional_indices:
--> 319         perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
    320         aux = ar[perm]
    321     else:


TypeError: '<' not supported between instances of 'int' and 'str'
# one hot编码
# 变成01序列 节省计算性能
x = pd.get_dummies(df['年龄'],prefix='年龄')

# 将数据拼接到原始数据上 按列拼接
df = pd.concat([df,x],axis=1)
df
df.to_csv('train_onehot.csv')
for column in ['客舱','登船港口']:
    x = pd.get_dummies(df[column],prefix= column)
    df = pd.concat([df,x],axis=1)
df.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱...客舱_F G73客舱_F2客舱_F33客舱_F4客舱_G6客舱_T登船港口_0登船港口_C登船港口_Q登船港口_S
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaN...0000000001
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85...0000000100
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaN...0000000001
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123...0000000001
503Allen, Mr. William Henrymale35.0003734508.0500NaN...0000000001

5 rows × 643 columns

# series.str.extract 字符串提取表达式
# pat 正则表达式
# flags 
df['Title'] = df.姓名.str.extract('([A-Za-z]+)\.')
df
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱...客舱_F2客舱_F33客舱_F4客舱_G6客舱_T登船港口_0登船港口_C登船港口_Q登船港口_STitle
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaN...000000001Mr
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85...000000100Mrs
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaN...000000001Miss
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123...000000001Mrs
503Allen, Mr. William Henrymale35.0003734508.0500NaN...000000001Mr
..................................................................
88603Rice, Mrs. William (Margaret Norton)female39.00538265229.1250NaN...000000010Mrs
88702Montvila, Rev. Juozasmale27.00021153613.0000NaN...000000001Rev
88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42...000000001Miss
89011Behr, Mr. Karl Howellmale26.00011136930.0000C148...000000100Mr
89103Dooley, Mr. Patrickmale32.0003703767.7500NaN...000000010Mr

715 rows × 644 columns

df.to_csv('test_fin.csv')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

霖承科技 LinChance

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值