import numpy as np
import pandas as pd
df1= pd. read_csv( r"C:\Jupyter.Data\titanic\hands-on-data-analysis-master\data.charpter2\train-left-down.csv" )
df2= pd. read_csv( r"C:\Jupyter.Data\titanic\hands-on-data-analysis-master\data.charpter2\train-left-up.csv" )
df3= pd. read_csv( r"C:\Jupyter.Data\titanic\hands-on-data-analysis-master\data.charpter2\train-right-down.csv" )
df4= pd. read_csv( r"C:\Jupyter.Data\titanic\hands-on-data-analysis-master\data.charpter2\train-right-up.csv" )
cancat 方法实现
df1. head( )
PassengerId Survived Pclass Name 0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson 1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) 2 442 0 3 Hampe, Mr. Leon 3 443 0 3 Petterson, Mr. Johan Emil 4 444 1 2 Reynaldo, Ms. Encarnacion
df2. head( )
PassengerId Survived Pclass Name 0 1 0 3 Braund, Mr. Owen Harris 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 3 1 3 Heikkinen, Miss. Laina 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 5 0 3 Allen, Mr. William Henry
df3. head( )
Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 male 31.0 0 0 C.A. 18723 10.500 NaN S 1 female 45.0 1 1 F.C.C. 13529 26.250 NaN S 2 male 20.0 0 0 345769 9.500 NaN S 3 male 25.0 1 0 347076 7.775 NaN S 4 female 28.0 0 0 230434 13.000 NaN S
df4. head( )
Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 male 22.0 1 0 A/5 21171 7.2500 NaN S 1 female 38.0 1 0 PC 17599 71.2833 C85 C 2 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 female 35.0 1 0 113803 53.1000 C123 S 4 male 35.0 0 0 373450 8.0500 NaN S
df_result_up = pd. concat( [ df2, df4] , axis= 1 )
df_result_up. to_csv( "result_up.csv" )
df_result_down = pd. concat( [ df1, df3] , axis= 1 )
df_result_down. to_csv( "result_down.csv" )
df = pd. concat( [ df_result_down, df_result_up] , axis= 0 )
df. to_csv( "result.csv" )
df. head( )
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson male 31.0 0 0 C.A. 18723 10.500 NaN S 1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) female 45.0 1 1 F.C.C. 13529 26.250 NaN S 2 442 0 3 Hampe, Mr. Leon male 20.0 0 0 345769 9.500 NaN S 3 443 0 3 Petterson, Mr. Johan Emil male 25.0 1 0 347076 7.775 NaN S 4 444 1 2 Reynaldo, Ms. Encarnacion female 28.0 0 0 230434 13.000 NaN S
df. shape
(891, 12)
append 和 join方法实现
df_down = df1. join( df3)
df_up = df2. join( df4)
df = df_down. append( df_up)
df. head( )
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 440 0 2 Kvillner, Mr. Johan Henrik Johannesson male 31.0 0 0 C.A. 18723 10.500 NaN S 1 441 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) female 45.0 1 1 F.C.C. 13529 26.250 NaN S 2 442 0 3 Hampe, Mr. Leon male 20.0 0 0 345769 9.500 NaN S 3 443 0 3 Petterson, Mr. Johan Emil male 25.0 1 0 347076 7.775 NaN S 4 444 1 2 Reynaldo, Ms. Encarnacion female 28.0 0 0 230434 13.000 NaN S
merge和append方法实现
df_down = pd. merge( df1, df3, left_index= True , right_index= True )
df_up = pd. merge( df2, df4, left_index= True , right_index= True )
df= df_up. append( df_down)
df. head( )
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
【思考】单独使用不能完成,接不上。
df. to_csv( "result.csv" )
将数据转为Series类型
df. stack( ) . head( 20 )
0 PassengerId 1
Survived 0
Pclass 3
Name Braund, Mr. Owen Harris
Sex male
Age 22
SibSp 1
Parch 0
Ticket A/5 21171
Fare 7.25
Embarked S
1 PassengerId 2
Survived 1
Pclass 1
Name Cumings, Mrs. John Bradley (Florence Briggs Th...
Sex female
Age 38
SibSp 1
Parch 0
Ticket PC 17599
dtype: object
df. stack( ) . to_csv( "stack.csv" )
stack = pd. read_csv( "stack.csv" )
stack. head( )
0 PassengerId 1 0 0 Survived 0 1 0 Pclass 3 2 0 Name Braund, Mr. Owen Harris 3 0 Sex male 4 0 Age 22.0
数据聚合与运算
Groupby类似于excel数据透视表和分类汇总,只是功能更加强大了些。
df. head( )
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
df. groupby( "Sex" ) [ "Fare" ] . mean( )
Sex
female 44.479818
male 25.523893
Name: Fare, dtype: float64
df. groupby( "Sex" ) [ "Survived" ] . sum ( )
Sex
female 233
male 109
Name: Survived, dtype: int64
df. groupby( "Embarked" ) [ "Survived" ] . sum ( )
Embarked
C 93
Q 30
S 217
Name: Survived, dtype: int64
结论:(1)女性平均票价高于男性;(2)女性存活人数高于男性;(3)S登记仓位存活人数远高于其他舱位。
Table = df. groupby( "Sex" ) . agg( { "Fare" : [ "mean" ] , "Survived" : [ "sum" ] } )
Table = Table. rename( columns= { "Fare" : "Fare_mean" , "Survived" : "Survived_mean" } )
Table
Fare_mean Survived_mean mean sum Sex female 44.479818 233 male 25.523893 109
df. groupby( [ 'Pclass' , 'Age' ] ) [ 'Fare' ] . mean( ) . head( )
Pclass Age
1 0.92 151.5500
2.00 151.5500
4.00 81.8583
11.00 120.0000
14.00 120.0000
Name: Fare, dtype: float64
df_sex_fare = df. groupby( "Sex" ) [ "Fare" ] . mean( )
df_sex_survived = df. groupby( "Sex" ) [ "Survived" ] . sum ( )
df_fare_survived = pd. merge( df_sex_fare. reset_index( ) , df_sex_survived. reset_index( ) , on= "Sex" )
df_fare_survived. head( )
Sex Fare Survived 0 female 44.479818 233 1 male 25.523893 109
df_fare_survived. to_csv( "Fare_survived.csv" )
存活率最大的年龄段
df. describe( )
PassengerId Survived Pclass Age SibSp Parch Fare count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000 mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208 std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429 min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000 25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400 50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200 75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000 max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
df[ 'AgeBand' ] = pd. cut( df[ 'Age' ] , [ 0 , 5 , 15 , 30 , 55 , 80 ] , labels = [ "Baby" , "Younth" , "Adult" , "Middle-age" , "Old-man" ] )
df. head( )
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked AgeBand 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S Adult 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C Middle-age 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S Adult 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S Middle-age 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S Middle-age
df_rate = df. groupby( "AgeBand" ) [ "Survived" ] . agg( { "sum" , "count" , lambda x: x. sum ( ) / x. count( ) } )
df_rate. columns = df_rate. columns. map ( '' . join) . str . replace( '<lambda>' , 'survived_rate' )
df_rate
sum count survived_rate AgeBand Baby 31 44 0.704545 Younth 18 39 0.461538 Adult 117 326 0.358896 Middle-age 112 265 0.422642 Old-man 12 40 0.300000
df_rate[ "survived_rate" ] . sort_values( ascending= False )
max_group = df_rate. index[ 0 ]
max_rate = df_rate. survived_rate[ 0 ]
print ( "存活率最高的组为:%s;存活率为:%.2f%%" % ( max_group, 100 * max_rate) )
存活率最高的组为:Baby;存活率为:70.45%