import numpy as np
import pandas as pd
from pandas import Series , DataFrame
s1= pd. Series( [ 1 , 2 , 3 , 4 ] , index= [ 'a' , 'b' , 'c' , 'd' ] )
s1
a 1
b 2
c 3
d 4
dtype: int64
s1. values
array([1, 2, 3, 4], dtype=int64)
s1. index
Index(['a', 'b', 'c', 'd'], dtype='object')
s2= pd. Series( np. arange( 10 ) )
s2
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
s3= pd. Series( { 'a' : 1 , 'b' : 3 , 'c' : 2 , 'd' : 4 } )
s3
a 1
b 3
c 2
d 4
dtype: int64
s3[ 'a' ]
1
s3[ s3> 2 ]
b 3
d 4
dtype: int64
s1. to_dict( )
{'a': 1, 'b': 2, 'c': 3, 'd': 4}
pd. isnull( s3)
a False
b False
c False
d False
dtype: bool
pd. notnull( s3)
a True
b True
c True
d True
dtype: bool
s1= s1. reindex( index= [ 'a' , "b" , "c" , "d" , "e" , "f" ] , fill_value= 0 )
s1[ 'g' ] = 1
s1
a 1
b 2
c 3
d 4
e 0
f 0
g 1
dtype: int64
s1= s1. reindex( index= [ 'a' , "b" , "c" , "d" , "e" , "f" , "k" ] , method= "ffill" )
s1
a 1
b 2
c 3
d 4
e 0
f 0
k 1
dtype: int64
s1. drop( 'f' )
a 1
b 2
c 3
d 4
e 0
k 1
dtype: int64
s4= Series( [ 1 , 2 , np. nan, 3 , 4 ] , index= [ 'A' , 'B' , 'C' , 'D' , 'E' ] )
s4
A 1.0
B 2.0
C NaN
D 3.0
E 4.0
dtype: float64
s4. isnull( )
A False
B False
C True
D False
E False
dtype: bool
s4. notnull( )
A True
B True
C False
D True
E True
dtype: bool
s4. dropna( )
A 1.0
B 2.0
D 3.0
E 4.0
dtype: float64
s5= Series( np. random. randn( 6 ) , index= [ [ '1' , '1' , '1' , '2' , '2' , '2' ] , [ 'a' , 'b' , 'c' , 'a' , 'b' , 'c' ] ] )
s5
1 a 1.422298
b 1.634116
c -0.038048
2 a 0.857658
b 0.416641
c -0.025056
dtype: float64
print ( s5[ '1' ] )
print ( s5[ '1' ] [ 'a' ] )
print ( s5[ : , 'a' ] )
a 1.422298
b 1.634116
c -0.038048
dtype: float64
1.4222976799728857
1 1.422298
2 0.857658
dtype: float64
df= s5. unstack( )
df
          a         b         c
1  1.422298  1.634116 -0.038048
2  0.857658  0.416641 -0.025056
df= DataFrame( [ s5[ '1' ] , s5[ '2' ] ] )
df
          a         b         c
0  1.422298  1.634116 -0.038048
1  0.857658  0.416641 -0.025056
s6= df. unstack( )
s6
a 0 1.422298
1 0.857658
b 0 1.634116
1 0.416641
c 0 -0.038048
1 -0.025056
dtype: float64
s6= df. T. unstack( )
s6
0 a 1.422298
b 1.634116
c -0.038048
1 a 0.857658
b 0.416641
c -0.025056
dtype: float64
s1
a 1
b 2
c 3
d 4
e 0
f 0
k 1
dtype: int64
s1= s1. replace( 0 , 100 )
s1
a 1
b 2
c 3
d 4
e 100
f 100
k 1
dtype: int64
s2= s1
s1+ s2
a 2
b 4
c 6
d 8
e 200
f 200
k 2
dtype: int64
s2= s1. sort_values( ascending= False )
s2
f 100
e 100
d 4
c 3
b 2
k 1
a 1
dtype: int64
s2= s1. sort_index( ascending= False )
s2
k 1
f 100
e 100
d 4
c 3
b 2
a 1
dtype: int64
s11= Series( [ 1 , 2 , np. nan] , index= [ 'a' , 'b' , 'c' ] )
s12= Series( [ 3 , 4 , 5 ] , index= [ 'a' , 'b' , 'c' ] )
s11
a 1.0
b 2.0
c NaN
dtype: float64
s12
a 3
b 4
c 5
dtype: int64
pd. concat( [ s11, s12] )
a 1.0
b 2.0
c NaN
a 3.0
b 4.0
c 5.0
dtype: float64
pd. concat( [ s11, s12] , axis= 1 )
s11. combine_first( s12)
a 1.0
b 2.0
c 5.0
dtype: float64
# Raw string keeps the Windows backslashes literal instead of being parsed
# as (invalid) escape sequences, which raise DeprecationWarning today and
# will become a SyntaxError in future Python versions. The resulting string
# is byte-identical to the original.
df= pd.read_csv(r"E:\微云\Python3数据分析与挖掘建模实战\书籍+随堂源码+说明\sample_code\data\HR.csv")
type ( df)
pandas.core.frame.DataFrame
df. head( )
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years department salary 0 0.38 0.53 2 157 3 0 1 0 sales low 1 0.80 0.86 5 262 6 0 1 0 sales medium 2 0.11 0.88 7 272 4 0 1 0 sales medium 3 0.72 0.87 5 223 5 0 1 0 sales low 4 0.37 0.52 2 159 3 0 1 0 sales low
columns= df. columns
columns[ 0 ]
'satisfaction_level'
df[ 'satisfaction_level' ] . head( )
0 0.38
1 0.80
2 0.11
3 0.72
4 0.37
Name: satisfaction_level, dtype: float64
df_new= DataFrame( df, columns= [ 'satisfaction_level' , 'last_evaluation' ] , index= [ 0 , 1 , 2 , 3 , 4 , 5 ] )
df_new
satisfaction_level last_evaluation 0 0.38 0.53 1 0.80 0.86 2 0.11 0.88 3 0.72 0.87 4 0.37 0.52 5 0.41 0.50
df_new[ "n1" ] = range ( 0 , 6 )
df_new
satisfaction_level last_evaluation n1 0 0.38 0.53 0 1 0.80 0.86 1 2 0.11 0.88 2 3 0.72 0.87 3 4 0.37 0.52 4 5 0.41 0.50 5
df_new[ "n2" ] = pd. Series( np. arange( 10 , 16 ) )
df_new
satisfaction_level last_evaluation n1 n2 0 0.38 0.53 0 10 1 0.80 0.86 1 11 2 0.11 0.88 2 12 3 0.72 0.87 3 13 4 0.37 0.52 4 14 5 0.41 0.50 5 15
df_new[ "n1" ] = pd. Series( [ "A" , "B" ] , index= [ 0 , 1 ] )
df_new
satisfaction_level last_evaluation n1 n2 0 0.38 0.53 A 10 1 0.80 0.86 B 11 2 0.11 0.88 NaN 12 3 0.72 0.87 NaN 13 4 0.37 0.52 NaN 14 5 0.41 0.50 NaN 15
data= { "A" : [ 1 , 2 , 3 ] , "B" : [ 4 , 5 , 6 ] }
df_data= DataFrame( data)
df_data
df_data. T
for row in df_data. iterrows( ) :
for i in row:
print ( i)
0
A 1
B 4
Name: 0, dtype: int64
1
A 2
B 5
Name: 1, dtype: int64
2
A 3
B 6
Name: 2, dtype: int64
df1= DataFrame( np. random. rand( 8 ) . reshape( [ 4 , 2 ] ) , index= [ 'A' , 'B' , 'C' , 'D' ] , columns= [ "c1" , "c2" ] )
df1
c1 c2 A 0.875310 0.741149 B 0.872637 0.423594 C 0.878650 0.546773 D 0.215950 0.200086
df1= df1. reindex( index= [ 'A' , 'B' , 'C' , 'D' , 'E' ] , columns= [ "c1" , "c2" ] )
df1
c1 c2 A 0.567641 0.813240 B 0.630769 0.560537 C 0.477235 0.675111 D 0.223098 0.593948 E NaN NaN
df1. reindex( index= [ 'A' , 'B' ] )
c1 c2 A 0.567641 0.813240 B 0.630769 0.560537
df1. drop( 'A' , axis= 0 )
c1 c2 B 0.630769 0.560537 C 0.477235 0.675111 D 0.223098 0.593948 E NaN NaN
df1. drop( 'c1' , axis= 1 )
c2 A 0.813240 B 0.560537 C 0.675111 D 0.593948 E NaN
m= 1
n= np. nan
m+ n
nan
df1. dropna( axis= 0 )
c1 c2 A 0.567641 0.813240 B 0.630769 0.560537 C 0.477235 0.675111 D 0.223098 0.593948
df1. dropna( axis= 0 , how= 'any' )
c1 c2 A 0.567641 0.813240 B 0.630769 0.560537 C 0.477235 0.675111 D 0.223098 0.593948
df1. dropna( thresh= 2 , axis= 1 )
c1 c2 A 0.567641 0.813240 B 0.630769 0.560537 C 0.477235 0.675111 D 0.223098 0.593948 E NaN NaN
df1. fillna( value= 2 )
c1 c2 A 0.567641 0.813240 B 0.630769 0.560537 C 0.477235 0.675111 D 0.223098 0.593948 E 2.000000 2.000000
df2= DataFrame( np. arange( 16 ) . reshape( 4 , 4 ) , index= [ [ 'a' , 'a' , 'b' , 'b' ] , [ 1 , 2 , 1 , 2 ] ] , columns= [ [ 'BJ' , 'BJ' , 'SH' , 'GZ' ] , [ 8 , 9 , 6 , 7 ] ] )
df2
BJ SH GZ 8 9 6 7 a 1 0 1 2 3 2 4 5 6 7 b 1 8 9 10 11 2 12 13 14 15
df2[ "BJ" ] [ 8 ] [ "a" ] [ 2 ]
4
df1[ "c4" ] = { "A" : 1 , "B" : 2 , "C" : 3 , "D" : 4 }
df1
c1 c2 c4 A 0.875310 0.741149 A B 0.872637 0.423594 B C 0.878650 0.546773 C D 0.215950 0.200086 D
map_data= { "A" : 1 , "B" : 2 , "D" : 3 , "C" : 4 }
df1[ 'c3' ] = df1[ 'c4' ] . map ( map_data)
df1
c1 c2 c4 c3 A 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3
df4= df1
df4+ df1
c1 c2 c4 c3 A 1.750620 1.482299 AA 2 B 1.745274 0.847187 BB 4 C 1.757299 1.093546 CC 8 D 0.431901 0.400171 DD 6
df4. sum ( axis= 0 )
c1 2.84255
c2 1.9116
c4 ABCD
c3 10
dtype: object
df4. sum ( axis= 1 )
A 2.616459
B 3.296231
C 5.425423
D 3.416036
dtype: float64
df4. describe( )
c1 c2 c3 count 4.000000 4.000000 4.000000 mean 0.710637 0.477900 2.500000 std 0.329800 0.226696 1.290994 min 0.215950 0.200086 1.000000 25% 0.708465 0.367717 1.750000 50% 0.873973 0.485183 2.500000 75% 0.876145 0.595367 3.250000 max 0.878650 0.741149 4.000000
df4
c1 c2 c4 c3 A 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3
df4[ "c1" ] . sort_values( )
D 0.215950
B 0.872637
A 0.875310
C 0.878650
Name: c1, dtype: float64
df4. sort_values( "c1" )
c1 c2 c4 c3 D 0.215950 0.200086 D 3 B 0.872637 0.423594 B 2 A 0.875310 0.741149 A 1 C 0.878650 0.546773 C 4
df4. sort_index( )
c1 c2 c4 c3 A 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3
df4. index= [ 'a' , 'b' , 'c' , 'd' ]
df4
c1 c2 c4 c3 a 0.875310 0.741149 A 1 b 0.872637 0.423594 B 2 c 0.878650 0.546773 C 4 d 0.215950 0.200086 D 3
df4. index= df4. index. map ( str . upper)
df4
c1 c2 c4 c3 A 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3
df4. rename( index= str . lower, columns= str . upper)
C1 C2 C4 C3 a 0.875310 0.741149 A 1 b 0.872637 0.423594 B 2 c 0.878650 0.546773 C 4 d 0.215950 0.200086 D 3
df4. rename( index= { 'A' : "a" } )
c1 c2 c4 c3 a 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3
[ print ( x) for x in [ 1 , 2 ] ]
1
2
[None, None]
def test_map ( x) :
return x+ "?"
df8= df4. rename( index= test_map)
df5= DataFrame( { "A" : [ 1 , 2 ] , "B" : [ 3 , 4 ] , "D" : [ 5 , 6 ] } , index= [ 'a' , 'b' ] )
df6= DataFrame( { "A" : [ 1 , 4 ] , "B" : [ 3 , 4 ] , "D" : [ 5 , 6 ] } , index= [ 'a' , 'b' ] )
df5
df6
pd. merge( df5, df6)
pd. merge( df5, df6, on= "A" )
pd. merge( df5, df6, on= "A" , how= "left" )
A B_x D_x B_y D_y 0 1 3 5 3.0 5.0 1 2 4 6 NaN NaN
pd. merge( df5, df6, on= "A" , how= "right" )
A B_x D_x B_y D_y 0 1 3.0 5.0 3 5 1 4 NaN NaN 4 6
arr1= np. arange( 9 ) . reshape( 3 , 3 )
arr1
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
arr2= np. arange( 9 ) . reshape( 3 , 3 )
arr2
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
np. concatenate( [ arr1, arr2] , axis= 1 )
array([[0, 1, 2, 0, 1, 2],
[3, 4, 5, 3, 4, 5],
[6, 7, 8, 6, 7, 8]])
np. concatenate( [ arr1, arr2] , axis= 0 )
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8],
[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
df1
c1 c2 c4 c3 A 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3
df8
c1 c2 c4 c3 A? 0.875310 0.741149 A 1 B? 0.872637 0.423594 B 2 C? 0.878650 0.546773 C 4 D? 0.215950 0.200086 D 3
pd. concat( [ df1, df8] )
c1 c2 c4 c3 A 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3 A? 0.875310 0.741149 A 1 B? 0.872637 0.423594 B 2 C? 0.878650 0.546773 C 4 D? 0.215950 0.200086 D 3
df20= pd.concat([df1, df8], axis=0)
# Single label-based assignment via .loc. The original chained form
# (df20["c1"]["D?"] = np.nan) writes through an intermediate Series copy:
# it triggers SettingWithCopyWarning here and silently fails to update
# df20 under pandas copy-on-write (pandas >= 2.x default in 3.0).
df20.loc["D?", "c1"] = np.nan
df20
D:\Anaconda\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c1 c2 c4 c3 A 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3 A? 0.875310 0.741149 A 1 B? 0.872637 0.423594 B 2 C? 0.878650 0.546773 C 4 D? NaN 0.200086 D 3
df20= df20. combine_first( df8)
df20
c1 c2 c4 c3 A 0.875310 0.741149 A 1 A? 0.875310 0.741149 A 1 B 0.872637 0.423594 B 2 B? 0.872637 0.423594 B 2 C 0.878650 0.546773 C 4 C? 0.878650 0.546773 C 4 D 0.215950 0.200086 D 3 D? 0.215950 0.200086 D 3
df20[ 'c4' ] = df20[ 'c4' ] . apply ( str . lower)
df20
c1 c2 c4 c3 A 0.875310 0.741149 a 1 A? 0.875310 0.741149 a 1 B 0.872637 0.423594 b 2 B? 0.872637 0.423594 b 2 C 0.878650 0.546773 c 4 C? 0.878650 0.546773 c 4 D 0.215950 0.200086 d 3 D? 0.215950 0.200086 d 3
def foo(line):
    """Return *line* with the literal suffix '_A' appended (for Series.apply)."""
    return line + "_A"
df20[ 'c4' ] . apply ( foo)
A a_A
A? a_A
B b_A
B? b_A
C c_A
C? c_A
D d_A
D? d_A
Name: c4, dtype: object
del df20[ 'c3' ]
df20
c1 c2 c4 A 0.875310 0.741149 a A? 0.875310 0.741149 a B 0.872637 0.423594 b B? 0.872637 0.423594 b C 0.878650 0.546773 c C? 0.878650 0.546773 c D 0.215950 0.200086 d D? 0.215950 0.200086 d
df20. size
24
df20[ 'c4' ] . unique( )
array(['a', 'b', 'c', 'd'], dtype=object)
len ( df20[ 'c4' ] . unique( ) )
4
df20[ 'c4' ] . duplicated( )
A False
A? True
B False
B? True
C False
C? True
D False
D? True
Name: c4, dtype: bool
df21= df20. drop_duplicates( [ 'c4' ] )
df21
c1 c2 c4 A 0.875310 0.741149 a B 0.872637 0.423594 b C 0.878650 0.546773 c D 0.215950 0.200086 d
df22= df20. drop_duplicates( [ 'c4' ] , keep= 'last' )
df22
c1 c2 c4 A? 0.875310 0.741149 a B? 0.872637 0.423594 b C? 0.878650 0.546773 c D? 0.215950 0.200086 d
from datetime import datetime
t1= datetime( 2020 , 8 , 15 )
t1
datetime.datetime(2020, 8, 15, 0, 0)
df22= df22. rename( index= { "A?" : datetime( 2020 , 8 , 1 ) , "B?" : datetime( 2020 , 8 , 2 ) , "C?" : datetime( 2020 , 9 , 1 ) , "D?" : datetime( 2020 , 9 , 2 ) } )
df22
c1 c2 c4 c3 2020-08-01 0.875310 0.741149 a 2020-08-01 2020-08-02 0.872637 0.423594 b 2020-08-02 2020-09-01 0.878650 0.546773 c 2020-09-01 2020-09-02 0.215950 0.200086 d 2020-09-02
df22[ "2020-09" ]
c1 c2 c4 c3 2020-09-01 0.87865 0.546773 c 2020-09-01 2020-09-02 0.21595 0.200086 d 2020-09-02
df22[ "2020" ]
c1 c2 c4 c3 2020-08-01 0.875310 0.741149 a 2020-08-01 2020-08-02 0.872637 0.423594 b 2020-08-02 2020-09-01 0.878650 0.546773 c 2020-09-01 2020-09-02 0.215950 0.200086 d 2020-09-02
df22[ [ "c1" , "c2" ] ]
c1 c2 2020-08-01 0.875310 0.741149 2020-08-02 0.872637 0.423594 2020-09-01 0.878650 0.546773 2020-09-02 0.215950 0.200086
# .ix was deprecated (the transcript below shows the FutureWarning) and has
# since been removed from pandas; .loc supports the same partial
# date-string selection on a DatetimeIndex.
df22.loc["2020-09"]
D:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
c1 c2 c4 c3 2020-09-01 0.87865 0.546773 c 2020-09-01 2020-09-02 0.21595 0.200086 d 2020-09-02
date_list_new= pd. date_range( "2020-01-01" , "2021-12-31" )
date_list_new
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
'2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
'2020-01-09', '2020-01-10',
...
'2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
'2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
'2021-12-30', '2021-12-31'],
dtype='datetime64[ns]', length=731, freq='D')
ss= Series( np. random. randn( len ( date_list_new) ) , index= date_list_new)
ss
2020-01-01 0.105096
2020-01-02 1.184459
2020-01-03 0.131104
2020-01-04 -1.421972
2020-01-05 -1.078701
...
2021-12-27 -1.696502
2021-12-28 -1.013481
2021-12-29 1.865331
2021-12-30 -0.036189
2021-12-31 -0.294095
Freq: D, Length: 731, dtype: float64
ss_month= ss. resample( "M" ) . mean( )
ss_month
2020-01-31 -0.248221
2020-02-29 -0.095330
2020-03-31 0.166915
2020-04-30 -0.168161
2020-05-31 0.036389
2020-06-30 -0.253172
2020-07-31 0.160701
2020-08-31 0.399025
2020-09-30 0.154235
2020-10-31 -0.017248
2020-11-30 0.132322
2020-12-31 -0.135380
2021-01-31 -0.131289
2021-02-28 0.006788
2021-03-31 -0.342274
2021-04-30 0.008750
2021-05-31 -0.253172
2021-06-30 -0.093458
2021-07-31 0.347516
2021-08-31 -0.334457
2021-09-30 -0.138226
2021-10-31 0.148497
2021-11-30 -0.275131
2021-12-31 0.015029
Freq: M, dtype: float64
ss. resample( "H" ) . ffill( )
2020-01-01 00:00:00 0.105096
2020-01-01 01:00:00 0.105096
2020-01-01 02:00:00 0.105096
2020-01-01 03:00:00 0.105096
2020-01-01 04:00:00 0.105096
...
2021-12-30 20:00:00 -0.036189
2021-12-30 21:00:00 -0.036189
2021-12-30 22:00:00 -0.036189
2021-12-30 23:00:00 -0.036189
2021-12-31 00:00:00 -0.294095
Freq: H, Length: 17521, dtype: float64
ss
2020-01-01 0.105096
2020-01-02 1.184459
2020-01-03 0.131104
2020-01-04 -1.421972
2020-01-05 -1.078701
...
2021-12-27 -1.696502
2021-12-28 -1.013481
2021-12-29 1.865331
2021-12-30 -0.036189
2021-12-31 -0.294095
Freq: D, Length: 731, dtype: float64
score= np. random. randint( 25 , 100 , size= 20 )
score
array([29, 49, 71, 81, 77, 82, 98, 42, 95, 77, 61, 41, 86, 80, 64, 66, 89,
86, 50, 56])
bins= [ 0 , 59 , 70 , 80 , 100 ]
pd. cut( score, bins)
[(0, 59], (0, 59], (70, 80], (80, 100], (70, 80], ..., (59, 70], (80, 100], (80, 100], (0, 59], (0, 59]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
score_cat= pd. cut( score, bins)
pd. value_counts( score_cat)
(80, 100] 7
(0, 59] 6
(70, 80] 4
(59, 70] 3
dtype: int64
score_cat= pd. cut( score, bins, labels= [ 'low' , 'ok' , 'good' , 'great' ] )
score_cat
[low, low, good, great, good, ..., ok, great, great, low, low]
Length: 20
Categories (4, object): [low < ok < good < great]
dfscore= DataFrame( { "score" : score, "d" : score_cat} )
dfscore
score d 0 29 low 1 49 low 2 71 good 3 81 great 4 77 good 5 82 great 6 98 great 7 42 low 8 95 great 9 77 good 10 61 ok 11 41 low 12 86 great 13 80 good 14 64 ok 15 66 ok 16 89 great 17 86 great 18 50 low 19 56 low
g= dfscore. groupby( dfscore[ "d" ] )
g. groups
{'low': Int64Index([0, 1, 7, 11, 18, 19], dtype='int64'),
'ok': Int64Index([10, 14, 15], dtype='int64'),
'good': Int64Index([2, 4, 9, 13], dtype='int64'),
'great': Int64Index([3, 5, 6, 8, 12, 16, 17], dtype='int64')}
g1= g. get_group( "low" )
g1
score d 0 29 low 1 49 low 7 42 low 11 41 low 18 50 low 19 56 low
g1. mean( )
score 44.5
dtype: float64
g1. max ( )
score 56
d low
dtype: object
list ( g)
[('low', score d
0 29 low
1 49 low
7 42 low
11 41 low
18 50 low
19 56 low), ('ok', score d
10 61 ok
14 64 ok
15 66 ok), ('good', score d
2 71 good
4 77 good
9 77 good
13 80 good), ('great', score d
3 81 great
5 82 great
6 98 great
8 95 great
12 86 great
16 89 great
17 86 great)]
gd= dict ( list ( g) )
gd
{'low': score d
0 29 low
1 49 low
7 42 low
11 41 low
18 50 low
19 56 low, 'ok': score d
10 61 ok
14 64 ok
15 66 ok, 'good': score d
2 71 good
4 77 good
9 77 good
13 80 good, 'great': score d
3 81 great
5 82 great
6 98 great
8 95 great
12 86 great
16 89 great
17 86 great}
gd[ "low" ]
score d 0 29 low 1 49 low 7 42 low 11 41 low 18 50 low 19 56 low
def foo(data):
    """Aggregator for .agg(): return the minimum of the passed data."""
    minimum = data.min()
    return minimum
gd[ "low" ] . agg( foo)
score 29
d low
dtype: object
# Raw string keeps the Windows backslashes literal instead of being parsed
# as (invalid) escape sequences (DeprecationWarning now, SyntaxError in
# future Python). The resulting path string is byte-identical.
df= pd.read_csv(r"E:\微云\Python3数据分析与挖掘建模实战\书籍+随堂源码+说明\sample_code\data\HR.csv")
df. head( )
satisfaction_level last_evaluation number_project average_monthly_hours time_spend_company Work_accident left promotion_last_5years department salary 0 0.38 0.53 2 157 3 0 1 0 sales low 1 0.80 0.86 5 262 6 0 1 0 sales medium 2 0.11 0.88 7 272 4 0 1 0 sales medium 3 0.72 0.87 5 223 5 0 1 0 sales low 4 0.37 0.52 2 159 3 0 1 0 sales low
pd. pivot_table( df, index= [ 'satisfaction_level' , 'department' ] )
Work_accident average_monthly_hours last_evaluation left number_project promotion_last_5years time_spend_company satisfaction_level department 0.09 IT 0.000000 271.333333 0.874444 1.0 5.944444 0.000000 4.333333 RandD 0.600000 265.000000 0.954000 1.0 6.600000 0.000000 4.000000 accounting 0.000000 278.250000 0.786250 1.0 6.125000 0.000000 4.000000 hr 0.000000 296.000000 0.852857 1.0 6.214286 0.000000 4.214286 management 0.000000 292.666667 0.863333 1.0 6.666667 0.000000 4.000000 ... ... ... ... ... ... ... ... ... 1.00 marketing 0.000000 222.750000 0.897500 0.0 4.500000 0.000000 2.000000 product_mng 0.428571 209.714286 0.718571 0.0 3.714286 0.000000 3.000000 sales 0.166667 196.733333 0.774667 0.0 3.766667 0.000000 3.500000 support 0.166667 185.611111 0.793333 0.0 3.611111 0.111111 3.388889 technical 0.230769 200.384615 0.708462 0.0 4.076923 0.000000 3.000000
901 rows × 7 columns
# Persist the MultiIndex DataFrame to disk as CSV.
df2.to_csv("E://aa.csv")