from dfply import data
diamonds = data. diamonds
import pandas as pd
print ( diamonds. loc[ : , [ 'cut' , 'price' , 'x' ] ] . head( 3 ) )
print ( '------------------------------------------' )
print ( diamonds[ [ 'cut' , 'price' , 'x' ] ] . head( 3 ) )
print ( diamonds. iloc[ : , [ 1 , 6 , 7 ] ] . head( 3 ) )
cut price x
0 Ideal 326 3.95
1 Premium 326 3.89
2 Good 327 4.05
------------------------------------------
cut price x
0 Ideal 326 3.95
1 Premium 326 3.89
2 Good 327 4.05
cut price x
0 Ideal 326 3.95
1 Premium 326 3.89
2 Good 327 4.05
diamonds. iloc[ [ 10 , 15 ] , : ] . head( 3 )
carat cut color clarity depth table price x y z 10 0.30 Good J SI1 64.0 55.0 339 4.25 4.28 2.73 15 0.32 Premium E I1 60.9 58.0 345 4.38 4.42 2.68
print ( diamonds. iloc[ [ 10 , 15 ] , : ] . head( 3 ) )
carat cut color clarity depth table price x y z
10 0.30 Good J SI1 64.0 55.0 339 4.25 4.28 2.73
15 0.32 Premium E I1 60.9 58.0 345 4.38 4.42 2.68
from dfply import *
diamonds >> drop( 1 , X. price, [ 'x' , 'y' ] ) >> head( 2 )
carat color clarity depth table z 0 0.23 E SI2 61.5 55.0 2.43 1 0.21 E SI1 59.8 61.0 2.31
( diamonds >>
select( X. carat, X. cut, X. color) >>
drop( X. cut) >>
head( 3 ) )
carat color 0 0.23 E 1 0.21 E 2 0.23 E
diamonds. head( )
carat cut color clarity depth table price x y z 0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
diamonds >> drop( 1 , X. price, [ 'x' , 'y' ] ) >> head( 5 )
carat color clarity depth table z 0 0.23 E SI2 61.5 55.0 2.43 1 0.21 E SI1 59.8 61.0 2.31 2 0.23 E VS1 56.9 65.0 2.31 3 0.29 I VS2 62.4 58.0 2.63 4 0.31 J SI2 63.3 58.0 2.75
( diamonds >>
select( X. carat, X. cut, X. color) >>
select( ~ X. cut) >>
head( 3 ) )
carat color 0 0.23 E 1 0.21 E 2 0.23 E
df = ( diamonds >>
select( X. carat, X. cut, X. price) >>
mask( X. carat > 0.21 , X. cut== 'Fair' , X. price < 563 ) )
df
carat cut price 8 0.22 Fair 337 10030 0.30 Fair 416 28270 0.25 Fair 361 31611 0.23 Fair 369 31615 0.27 Fair 371 40591 0.30 Fair 496 40597 0.34 Fair 497 46961 0.37 Fair 527 48630 0.30 Fair 536
df = ( diamonds >>
select( X. carat, X. cut, X. price) >>
mask( X. carat > 0.21 , X. cut== 'Fair' , X. price < 563 ) >>
arrange( X. carat, X. price) )
df
carat cut price 8 0.22 Fair 337 31611 0.23 Fair 369 28270 0.25 Fair 361 31615 0.27 Fair 371 10030 0.30 Fair 416 40591 0.30 Fair 496 48630 0.30 Fair 536 40597 0.34 Fair 497 46961 0.37 Fair 527
( diamonds >>
select( starts_with( 'c' ) ) >>
head( 2 ) )
carat cut color clarity 0 0.23 Ideal E SI2 1 0.21 Premium E SI1
diamonds >> select( ~ starts_with( 'c' ) ) >> head( 2 )
depth table price x y z 0 61.5 55.0 326 3.95 3.98 2.43 1 59.8 61.0 326 3.89 3.84 2.31
( diamonds >>
select( contains( 'c' ) ) >>
head( 2 ) )
carat cut color clarity price 0 0.23 Ideal E SI2 326 1 0.21 Premium E SI1 326
diamonds >> head( 2 )
carat cut color clarity depth table price x y z 0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
diamonds >> select( columns_to( X. price) ) >> head( 2 )
carat cut color clarity depth table 0 0.23 Ideal E SI2 61.5 55.0 1 0.21 Premium E SI1 59.8 61.0
diamonds >> select( columns_from( X. table) ) >> head( 2 )
table price x y z 0 55.0 326 3.95 3.98 2.43 1 61.0 326 3.89 3.84 2.31
diamonds >> select( columns_between( X. carat, X. table) ) >> head( 2 )
carat cut color clarity depth table 0 0.23 Ideal E SI2 61.5 55.0 1 0.21 Premium E SI1 59.8 61.0
diamonds >> row_slice( [ 10 , 15 ] )
carat cut color clarity depth table price x y z 10 0.30 Good J SI1 64.0 55.0 339 4.25 4.28 2.73 15 0.32 Premium E I1 60.9 58.0 345 4.38 4.42 2.68
diamonds >> sample( frac= 0.0001 , replace= False )
carat cut color clarity depth table price x y z 11849 1.04 Premium H VVS1 60.4 58.0 5102 6.58 6.53 3.96 22288 1.66 Ideal F SI2 62.6 59.0 10338 7.53 7.57 4.73 13016 1.20 Very Good I SI1 62.2 55.0 5408 6.75 6.92 4.25 2710 0.31 Ideal G SI1 61.6 55.0 563 4.34 4.37 2.68 25448 1.50 Premium F VS2 61.1 58.0 14199 7.36 7.33 4.49
diamonds >> sample( n= 3 , replace= True )
carat cut color clarity depth table price x y z 13775 1.70 Fair D I1 64.7 56.0 5617 7.46 7.37 4.80 6043 0.32 Ideal E SI2 62.0 55.0 576 4.41 4.36 2.72 33208 0.33 Ideal H IF 61.6 55.0 820 4.44 4.47 2.74
diamonds >> distinct( X. color)
carat cut color clarity depth table price x y z 0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 7 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53 12 0.22 Premium F SI1 60.4 61.0 342 3.88 3.84 2.33 25 0.23 Very Good G VVS2 60.4 58.0 354 3.97 4.01 2.41 28 0.23 Very Good D VS2 60.5 61.0 357 3.96 3.97 2.40
( diamonds
>> filter_by( X. cut == 'Ideal' , X. color == 'E' , X. table < 55 , X. price < 500 )
>> pull( 'carat' ) )
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\subset.py:87: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
return df.ix[:, column]
26683 0.33
32297 0.34
40928 0.30
50623 0.30
50625 0.30
Name: carat, dtype: float64
diamonds >> mutate( x_plus_y= X. x + X. y) >> select( columns_from( 'x' ) ) >> head( 3 )
x y z x_plus_y 0 3.95 3.98 2.43 7.93 1 3.89 3.84 2.31 7.73 2 4.05 4.07 2.31 8.12
diamonds >> mutate( x_plus_y= X. x + X. y, y_div_z= ( X. y / X. z) ) >> select( columns_from( 'x' ) ) >> head( 3 )
x y z x_plus_y y_div_z 0 3.95 3.98 2.43 7.93 1.637860 1 3.89 3.84 2.31 7.73 1.662338 2 4.05 4.07 2.31 8.12 1.761905
diamonds >> transmute( x_plus_y= X. x + X. y, y_div_z= ( X. y / X. z) ) >> head( 3 )
x_plus_y y_div_z 0 7.93 1.637860 1 7.73 1.662338 2 8.12 1.761905
diamonds >> arrange( X. table, ascending= False ) >> head( 5 )
carat cut color clarity depth table price x y z 24932 2.01 Fair F SI1 58.6 95.0 13387 8.32 8.31 4.87 50773 0.81 Fair F SI2 68.8 79.0 2301 5.26 5.20 3.58 51342 0.79 Fair G SI1 65.3 76.0 2362 5.52 5.13 3.35 52860 0.50 Fair E VS2 79.0 73.0 2579 5.21 5.18 4.09 49375 0.70 Fair H VS1 62.0 73.0 2100 5.65 5.54 3.47
( diamonds >> group_by( X. cut) >> arrange( X. price) >>
head( 3 ) >> ungroup( ) >> mask( X. carat < 0.23 ) )
carat cut color clarity depth table price x y z 8 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49 1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 12 0.22 Premium F SI1 60.4 61.0 342 3.88 3.84 2.33
diamonds >> rename( CUT= X. cut, COLOR= 'color' ) >> head( 2 )
carat CUT COLOR clarity depth table price x y z 0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
diamonds >> gather( 'variable' , 'value' , [ 'price' , 'depth' , 'x' , 'y' , 'z' ] ) >> head( 5 )
carat cut color clarity table variable value 0 0.23 Ideal E SI2 55.0 price 326.0 1 0.21 Premium E SI1 61.0 price 326.0 2 0.23 Good E VS1 65.0 price 327.0 3 0.29 Premium I VS2 58.0 price 334.0 4 0.31 Good J SI2 58.0 price 335.0
diamonds >> gather( 'variable' , 'value' ) >> head( 5 )
variable value 0 carat 0.23 1 carat 0.21 2 carat 0.23 3 carat 0.29 4 carat 0.31
, , ,
print ( d)
a b c
0 1 a True
1 2 b False
2 3 c NaN
d >> unite( 'united' , X. a, 'b' , 2 , remove= False , na_action= 'maintain' )
a b c united
0 1 a True 1_a_True
1 2 b False 2_b_False
2 3 c NaN NaN
d >> unite( 'united' , [ 'a' , 'b' , 'c' ] , remove= True , na_action= 'ignore' , sep= '*' )
united
0 1 * a* True
1 2 * b* False
2 3 * c
d >> unite( 'united' , d. columns, remove= True , na_action= 'as_string' )
united
0 1_a_True
1 2_b_False
2 3_c_nan
'' '' ''
File "<ipython-input-84-a4809ea807f5>", line 4
a b c
^
SyntaxError: invalid syntax
a = pd. DataFrame( {
'x1' : [ 'A' , 'B' , 'C' ] ,
'x2' : [ 1 , 2 , 3 ]
} )
b = pd. DataFrame( {
'x1' : [ 'A' , 'B' , 'D' ] ,
'x3' : [ True , False , True ]
} )
a
b
a >> inner_join( b, by= 'x1' )
a >> outer_join( b, by= 'x1' )
x1 x2 x3 0 A 1.0 True 1 B 2.0 False 2 C 3.0 NaN 3 D NaN True
a >> left_join( b, by= 'x1' )
x1 x2 x3 0 A 1 True 1 B 2 False 2 C 3 NaN
a >> right_join( b, by= 'x1' )
x1 x2 x3 0 A 1.0 True 1 B 2.0 False 2 D NaN True
a >> semi_join( b, by= 'x1' )
a >> anti_join( b, by= 'x1' )
a >> bind_rows( b, join= 'inner' )
a >> bind_rows( b, join= 'outer' )
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\join.py:279: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)
x1 x2 x3 0 A 1.0 NaN 1 B 2.0 NaN 2 C 3.0 NaN 0 A NaN True 1 B NaN False 2 D NaN True
a >> bind_cols( b)
x1 x2 x1 x3 0 A 1 A True 1 B 2 B False 2 C 3 D True
diamonds >> summarize( price_mean= X. price. mean( ) , price_std= X. price. std( ) )
price_mean price_std 0 3932.799722 3989.439738
diamonds >> group_by( 'cut' ) >> summarize( price_mean= X. price. mean( ) , price_std= X. price. std( ) )
cut price_mean price_std 0 Fair 4358.757764 3560.386612 1 Good 3928.864452 3681.589584 2 Ideal 3457.541970 3808.401172 3 Premium 4584.257704 4349.204961 4 Very Good 3981.759891 3935.862161
diamonds >> summarize_each( [ np. mean, np. var] , X. price, 'depth' )
price_mean price_var depth_mean depth_var 0 3932.799722 1.591533e+07 61.749405 2.052366
diamonds >> group_by( X. cut) >> summarize_each( [ np. mean, np. var] , X. price, 4 )
cut price_mean price_var depth_mean depth_var 0 Fair 4358.757764 1.266848e+07 64.041677 13.266319 1 Good 3928.864452 1.355134e+07 62.365879 4.705224 2 Ideal 3457.541970 1.450325e+07 61.709401 0.516274 3 Premium 4584.257704 1.891421e+07 61.264673 1.342755 4 Very Good 3981.759891 1.548973e+07 61.818275 1.900466
( diamonds >> mutate( price_lead= lead( X. price, 2 ) , price_lag= lag( X. price, 2 ) ) >>
select( X. price, - 2 , - 1 ) >>
head( 6 ) )
price price_lead price_lag 0 326 327.0 NaN 1 326 334.0 NaN 2 327 335.0 326.0 3 334 336.0 326.0 4 335 336.0 327.0 5 336 337.0 334.0
diamonds >> select( X. price) >> mutate( price_btwn= between( X. price, 330 , 340 ) ) >> head( 6 )
price price_btwn 0 326 False 1 326 False 2 327 False 3 334 True 4 335 True 5 336 True
diamonds >> select( X. price) >> mutate( price_drank= dense_rank( X. price) ) >> head( 6 )
price price_drank 0 326 1.0 1 326 1.0 2 327 2.0 3 334 3.0 4 335 4.0 5 336 5.0
diamonds >> select( X. price) >> mutate( price_mrank= min_rank( X. price) ) >> head( 6 )
price price_mrank 0 326 1.0 1 326 1.0 2 327 3.0 3 334 4.0 4 335 5.0 5 336 6.0
diamonds >> select( X. price) >> mutate( price_cumsum= cumsum( X. price) ) >> head( 6 )
price price_cumsum 0 326 326 1 326 652 2 327 979 3 334 1313 4 335 1648 5 336 1984
diamonds >> select( X. price) >> mutate( price_cummean= cummean( X. price) ) >> head( 6 )
price price_cummean 0 326 326.000000 1 326 326.000000 2 327 326.333333 3 334 328.250000 4 335 329.600000 5 336 330.666667
diamonds >> select( X. price) >> mutate( price_cummax= cummax( X. price) ) >> head( 6 )
price price_cummax 0 326 326.0 1 326 326.0 2 327 327.0 3 334 334.0 4 335 335.0 5 336 336.0
diamonds >> select( X. price) >> mutate( price_cummin= cummin( X. price) ) >> head( 6 )
price price_cummin 0 326 326.0 1 326 326.0 2 327 326.0 3 334 326.0 4 335 326.0 5 336 326.0