dfply

from dfply import data
 
diamonds = data.diamonds
 
import pandas as pd
print(diamonds.loc[:,['cut','price','x']].head(3))
 
print('------------------------------------------')
 
print(diamonds[['cut','price','x']].head(3))
 
print(diamonds.iloc[:,[1,6,7]].head(3))
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
------------------------------------------
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
diamonds.iloc[[10,15],:].head(3)
caratcutcolorclaritydepthtablepricexyz
100.30GoodJSI164.055.03394.254.282.73
150.32PremiumEI160.958.03454.384.422.68
print(diamonds.iloc[[10,15],:].head(3))
    carat      cut color clarity  depth  table  price     x     y     z
10   0.30     Good     J     SI1   64.0   55.0    339  4.25  4.28  2.73
15   0.32  Premium     E      I1   60.9   58.0    345  4.38  4.42  2.68
from dfply import *

diamonds >> drop(1, X.price, ['x', 'y']) >> head(2)
caratcolorclaritydepthtablez
00.23ESI261.555.02.43
10.21ESI159.861.02.31
# 'data' is the original pandas DataFrame
(diamonds >>
select(X.carat, X.cut, X.color) >>
drop(X.cut) >>
head(3))
caratcolor
00.23E
10.21E
20.23E
diamonds.head()
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
20.23GoodEVS156.965.03274.054.072.31
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
diamonds >> drop(1, X.price, ['x', 'y']) >> head(5)
caratcolorclaritydepthtablez
00.23ESI261.555.02.43
10.21ESI159.861.02.31
20.23EVS156.965.02.31
30.29IVS262.458.02.63
40.31JSI263.358.02.75
(diamonds >>
 select(X.carat, X.cut, X.color) >>
 select(~X.cut) >>
 head(3))

caratcolor
00.23E
10.21E
20.23E
df = (diamonds >>
select(X.carat, X.cut, X.price) >>
mask(X.carat > 0.21,  X.cut=='Fair', X.price < 563))
df
caratcutprice
80.22Fair337
100300.30Fair416
282700.25Fair361
316110.23Fair369
316150.27Fair371
405910.30Fair496
405970.34Fair497
469610.37Fair527
486300.30Fair536
df = (diamonds >>
      select(X.carat, X.cut, X.price) >>
      mask(X.carat > 0.21,  X.cut=='Fair', X.price < 563) >>
      arrange(X.carat,X.price))#按carat,然后按price排序
df
caratcutprice
80.22Fair337
316110.23Fair369
282700.25Fair361
316150.27Fair371
100300.30Fair416
405910.30Fair496
486300.30Fair536
405970.34Fair497
469610.37Fair527
(diamonds >> 
select(starts_with('c')) >>
 head(2))
caratcutcolorclarity
00.23IdealESI2
10.21PremiumESI1
diamonds >> select(~starts_with('c')) >> head(2)
depthtablepricexyz
061.555.03263.953.982.43
159.861.03263.893.842.31
(diamonds >> 
     select(contains('c')) >>
      head(2))
caratcutcolorclarityprice
00.23IdealESI2326
10.21PremiumESI1326
diamonds >>head(2)
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
diamonds >> select(columns_to(X.price)) >> head(2)#到某一行之前
caratcutcolorclaritydepthtable
00.23IdealESI261.555.0
10.21PremiumESI159.861.0
diamonds >> select(columns_from(X.table)) >> head(2)
tablepricexyz
055.03263.953.982.43
161.03263.893.842.31
diamonds >> select(columns_between(X.carat,X.table)) >> head(2)
caratcutcolorclaritydepthtable
00.23IdealESI261.555.0
10.21PremiumESI159.861.0
diamonds >> row_slice([10,15])#row_slice()函数选择切片行
caratcutcolorclaritydepthtablepricexyz
100.30GoodJSI164.055.03394.254.282.73
150.32PremiumEI160.958.03454.384.422.68
diamonds >> sample(frac=0.0001, replace=False)
caratcutcolorclaritydepthtablepricexyz
118491.04PremiumHVVS160.458.051026.586.533.96
222881.66IdealFSI262.659.0103387.537.574.73
130161.20Very GoodISI162.255.054086.756.924.25
27100.31IdealGSI161.655.05634.344.372.68
254481.50PremiumFVS261.158.0141997.367.334.49
diamonds >> sample(n=3, replace=True)
caratcutcolorclaritydepthtablepricexyz
137751.70FairDI164.756.056177.467.374.80
60430.32IdealESI262.055.05764.414.362.72
332080.33IdealHIF61.655.08204.444.472.74
diamonds >> distinct(X.color)
caratcutcolorclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
30.29PremiumIVS262.458.03344.204.232.63
40.31GoodJSI263.358.03354.344.352.75
70.26Very GoodHSI161.955.03374.074.112.53
120.22PremiumFSI160.461.03423.883.842.33
250.23Very GoodGVVS260.458.03543.974.012.41
280.23Very GoodDVS260.561.03573.963.972.40
(diamonds
 >> filter_by(X.cut == 'Ideal', X.color == 'E', X.table < 55, X.price < 500)
 >> pull('carat'))
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\subset.py:87: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  return df.ix[:, column]





26683    0.33
32297    0.34
40928    0.30
50623    0.30
50625    0.30
Name: carat, dtype: float64
diamonds >> mutate(x_plus_y=X.x + X.y) >> select(columns_from('x')) >> head(3)
xyzx_plus_y
03.953.982.437.93
13.893.842.317.73
24.054.072.318.12
diamonds >> mutate(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> select(columns_from('x')) >> head(3)
xyzx_plus_yy_div_z
03.953.982.437.931.637860
13.893.842.317.731.662338
24.054.072.318.121.761905
diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3)
x_plus_yy_div_z
07.931.637860
17.731.662338
28.121.761905
diamonds >> arrange(X.table, ascending=False) >> head(5)
caratcutcolorclaritydepthtablepricexyz
249322.01FairFSI158.695.0133878.328.314.87
507730.81FairFSI268.879.023015.265.203.58
513420.79FairGSI165.376.023625.525.133.35
528600.50FairEVS279.073.025795.215.184.09
493750.70FairHVS162.073.021005.655.543.47
(diamonds >> group_by(X.cut) >> arrange(X.price) >>
 head(3) >> ungroup() >> mask(X.carat < 0.23))
caratcutcolorclaritydepthtablepricexyz
80.22FairEVS265.161.03373.873.782.49
10.21PremiumESI159.861.03263.893.842.31
120.22PremiumFSI160.461.03423.883.842.33
diamonds >> rename(CUT=X.cut, COLOR='color') >> head(2)
caratCUTCOLORclaritydepthtablepricexyz
00.23IdealESI261.555.03263.953.982.43
10.21PremiumESI159.861.03263.893.842.31
diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) >> head(5)
#gather()函数将DataFrame中的指定列融合为两个键 :variable和value。
caratcutcolorclaritytablevariablevalue
00.23IdealESI255.0price326.0
10.21PremiumESI161.0price326.0
20.23GoodEVS165.0price327.0
30.29PremiumIVS258.0price334.0
40.31GoodJSI258.0price335.0
diamonds >> gather('variable', 'value') >> head(5)
variablevalue
0carat0.23
1carat0.21
2carat0.23
3carat0.29
4carat0.31
,,,
print(d)

a  b      c
0  1  a   True
1  2  b  False
2  3  c    NaN

d >> unite('united', X.a, 'b', 2, remove=False, na_action='maintain')

   a  b      c     united
0  1  a   True   1_a_True
1  2  b  False  2_b_False
2  3  c    NaN        NaN

d >> unite('united', ['a','b','c'], remove=True, na_action='ignore', sep='*')

      united
0   1*a*True
1  2*b*False
2        3*c

d >> unite('united', d.columns, remove=True, na_action='as_string')

      united
0   1_a_True
1  2_b_False
2    3_c_nan
''''''
  File "<ipython-input-84-a4809ea807f5>", line 4
    a  b      c
       ^
SyntaxError: invalid syntax
a = pd.DataFrame({
        'x1':['A','B','C'],
        'x2':[1,2,3]
    })
b = pd.DataFrame({
    'x1':['A','B','D'],
    'x3':[True,False,True]
})
a
x1x2
0A1
1B2
2C3
b
x1x3
0ATrue
1BFalse
2DTrue
a >> inner_join(b, by='x1')

x1x2x3
0A1True
1B2False
a >> outer_join(b, by='x1')

x1x2x3
0A1.0True
1B2.0False
2C3.0NaN
3DNaNTrue
a >> left_join(b, by='x1')
x1x2x3
0A1True
1B2False
2C3NaN
a >> right_join(b, by='x1')
x1x2x3
0A1.0True
1B2.0False
2DNaNTrue
a >> semi_join(b, by='x1')
#returns all of the rows in the left DataFrame that have a match in the right DataFrame in the by columns.

x1x2
0A1
1B2
a >> anti_join(b, by='x1')
x1x2
2C3
a >> bind_rows(b, join='inner')
x1
0A
1B
2C
0A
1B
2D
a >> bind_rows(b, join='outer')
c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\join.py:279: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)
x1x2x3
0A1.0NaN
1B2.0NaN
2C3.0NaN
0ANaNTrue
1BNaNFalse
2DNaNTrue
a >> bind_cols(b)
x1x2x1x3
0A1ATrue
1B2BFalse
2C3DTrue
diamonds >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
price_meanprice_std
03932.7997223989.439738
diamonds >> group_by('cut') >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
#summarize() 可以很好的搭配group_by函数
cutprice_meanprice_std
0Fair4358.7577643560.386612
1Good3928.8644523681.589584
2Ideal3457.5419703808.401172
3Premium4584.2577044349.204961
4Very Good3981.7598913935.862161
#summarize_each(function_list, *columns)更常用,它可以比较方便的计算数据框中的列的一些属性。
diamonds >> summarize_each([np.mean, np.var], X.price, 'depth')

price_meanprice_vardepth_meandepth_var
03932.7997221.591533e+0761.7494052.052366
diamonds >> group_by(X.cut) >> summarize_each([np.mean, np.var], X.price, 4)
cutprice_meanprice_vardepth_meandepth_var
0Fair4358.7577641.266848e+0764.04167713.266319
1Good3928.8644521.355134e+0762.3658794.705224
2Ideal3457.5419701.450325e+0761.7094010.516274
3Premium4584.2577041.891421e+0761.2646731.342755
4Very Good3981.7598911.548973e+0761.8182751.900466
(diamonds >> mutate(price_lead=lead(X.price, 2), price_lag=lag(X.price, 2)) >>
            select(X.price, -2, -1) >>
            head(6))

#lead(series,n)函数向上推动向量中的值,在末尾位置添加NaN值。 同样,lag函数向下推动值,在初始位置插入NaN值。
priceprice_leadprice_lag
0326327.0NaN
1326334.0NaN
2327335.0326.0
3334336.0326.0
4335336.0327.0
5336337.0334.0
diamonds >> select(X.price) >> mutate(price_btwn=between(X.price, 330, 340)) >> head(6)
priceprice_btwn
0326False
1326False
2327False
3334True
4335True
5336True
#dense_rank() 函数
#计算重复的排序值或者秩
diamonds >> select(X.price) >> mutate(price_drank=dense_rank(X.price)) >> head(6)

priceprice_drank
03261.0
13261.0
23272.0
33343.0
43354.0
53365.0
diamonds >> select(X.price) >> mutate(price_mrank=min_rank(X.price)) >> head(6)
priceprice_mrank
03261.0
13261.0
23273.0
33344.0
43355.0
53366.0
diamonds >> select(X.price) >> mutate(price_cumsum=cumsum(X.price)) >> head(6)
priceprice_cumsum
0326326
1326652
2327979
33341313
43351648
53361984
diamonds >> select(X.price) >> mutate(price_cummean=cummean(X.price)) >> head(6)
#cummean() 函数计算列的累积平均值。
priceprice_cummean
0326326.000000
1326326.000000
2327326.333333
3334328.250000
4335329.600000
5336330.666667
diamonds >> select(X.price) >> mutate(price_cummax=cummax(X.price)) >> head(6)
priceprice_cummax
0326326.0
1326326.0
2327327.0
3334334.0
4335335.0
5336336.0
diamonds >> select(X.price) >> mutate(price_cummin=cummin(X.price)) >> head(6)
priceprice_cummin
0326326.0
1326326.0
2327326.0
3334326.0
4335326.0
5336326.0

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值