dfply

最新推荐文章于 2021-03-01 15:47:26 发布

qq_39306047

最新推荐文章于 2021-03-01 15:47:26 发布

阅读量595

点赞数 1

分类专栏： python

本文链接：https://blog.csdn.net/qq_39306047/article/details/105505129

版权

python 专栏收录该内容

58 篇文章 1 订阅

订阅专栏

from dfply import data
 
diamonds = data.diamonds
 
import pandas as pd

print(diamonds.loc[:,['cut','price','x']].head(3))
 
print('------------------------------------------')
 
print(diamonds[['cut','price','x']].head(3))
 
print(diamonds.iloc[:,[1,6,7]].head(3))

       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
------------------------------------------
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05
       cut  price     x
0    Ideal    326  3.95
1  Premium    326  3.89
2     Good    327  4.05

diamonds.iloc[[10,15],:].head(3)

	carat	cut	color	clarity	depth	table	price	x	y	z
10	0.30	Good	J	SI1	64.0	55.0	339	4.25	4.28	2.73
15	0.32	Premium	E	I1	60.9	58.0	345	4.38	4.42	2.68

print(diamonds.iloc[[10,15],:].head(3))

    carat      cut color clarity  depth  table  price     x     y     z
10   0.30     Good     J     SI1   64.0   55.0    339  4.25  4.28  2.73
15   0.32  Premium     E      I1   60.9   58.0    345  4.38  4.42  2.68

from dfply import *

diamonds >> drop(1, X.price, ['x', 'y']) >> head(2)

	carat	color	clarity	depth	table	z
0	0.23	E	SI2	61.5	55.0	2.43
1	0.21	E	SI1	59.8	61.0	2.31

# 'data' is the original pandas DataFrame
(diamonds >>
select(X.carat, X.cut, X.color) >>
drop(X.cut) >>
head(3))

	carat	color
0	0.23	E
1	0.21	E
2	0.23	E

diamonds.head()

	carat	cut	color	clarity	depth	table	price	x	y	z
0	0.23	Ideal	E	SI2	61.5	55.0	326	3.95	3.98	2.43
1	0.21	Premium	E	SI1	59.8	61.0	326	3.89	3.84	2.31
2	0.23	Good	E	VS1	56.9	65.0	327	4.05	4.07	2.31
3	0.29	Premium	I	VS2	62.4	58.0	334	4.20	4.23	2.63
4	0.31	Good	J	SI2	63.3	58.0	335	4.34	4.35	2.75

diamonds >> drop(1, X.price, ['x', 'y']) >> head(5)

	carat	color	clarity	depth	table	z
0	0.23	E	SI2	61.5	55.0	2.43
1	0.21	E	SI1	59.8	61.0	2.31
2	0.23	E	VS1	56.9	65.0	2.31
3	0.29	I	VS2	62.4	58.0	2.63
4	0.31	J	SI2	63.3	58.0	2.75

(diamonds >>
 select(X.carat, X.cut, X.color) >>
 select(~X.cut) >>
 head(3))

	carat	color
0	0.23	E
1	0.21	E
2	0.23	E

df = (diamonds >>
select(X.carat, X.cut, X.price) >>
mask(X.carat > 0.21,  X.cut=='Fair', X.price < 563))
df

	carat	cut	price
8	0.22	Fair	337
10030	0.30	Fair	416
28270	0.25	Fair	361
31611	0.23	Fair	369
31615	0.27	Fair	371
40591	0.30	Fair	496
40597	0.34	Fair	497
46961	0.37	Fair	527
48630	0.30	Fair	536

df = (diamonds >>
      select(X.carat, X.cut, X.price) >>
      mask(X.carat > 0.21,  X.cut=='Fair', X.price < 563) >>
      arrange(X.carat,X.price))#按carat，然后按price排序
df

	carat	cut	price
8	0.22	Fair	337
31611	0.23	Fair	369
28270	0.25	Fair	361
31615	0.27	Fair	371
10030	0.30	Fair	416
40591	0.30	Fair	496
48630	0.30	Fair	536
40597	0.34	Fair	497
46961	0.37	Fair	527

(diamonds >> 
select(starts_with('c')) >>
 head(2))

	carat	cut	color	clarity
0	0.23	Ideal	E	SI2
1	0.21	Premium	E	SI1

diamonds >> select(~starts_with('c')) >> head(2)

	depth	table	price	x	y	z
0	61.5	55.0	326	3.95	3.98	2.43
1	59.8	61.0	326	3.89	3.84	2.31

(diamonds >> 
     select(contains('c')) >>
      head(2))

	carat	cut	color	clarity	price
0	0.23	Ideal	E	SI2	326
1	0.21	Premium	E	SI1	326

diamonds >>head(2)

	carat	cut	color	clarity	depth	table	price	x	y	z
0	0.23	Ideal	E	SI2	61.5	55.0	326	3.95	3.98	2.43
1	0.21	Premium	E	SI1	59.8	61.0	326	3.89	3.84	2.31

diamonds >> select(columns_to(X.price)) >> head(2)#到某一行之前

	carat	cut	color	clarity	depth	table
0	0.23	Ideal	E	SI2	61.5	55.0
1	0.21	Premium	E	SI1	59.8	61.0

diamonds >> select(columns_from(X.table)) >> head(2)

	table	price	x	y	z
0	55.0	326	3.95	3.98	2.43
1	61.0	326	3.89	3.84	2.31

diamonds >> select(columns_between(X.carat,X.table)) >> head(2)

	carat	cut	color	clarity	depth	table
0	0.23	Ideal	E	SI2	61.5	55.0
1	0.21	Premium	E	SI1	59.8	61.0

diamonds >> row_slice([10,15])#row_slice（）函数选择切片行

	carat	cut	color	clarity	depth	table	price	x	y	z
10	0.30	Good	J	SI1	64.0	55.0	339	4.25	4.28	2.73
15	0.32	Premium	E	I1	60.9	58.0	345	4.38	4.42	2.68

diamonds >> sample(frac=0.0001, replace=False)

	carat	cut	color	clarity	depth	table	price	x	y	z
11849	1.04	Premium	H	VVS1	60.4	58.0	5102	6.58	6.53	3.96
22288	1.66	Ideal	F	SI2	62.6	59.0	10338	7.53	7.57	4.73
13016	1.20	Very Good	I	SI1	62.2	55.0	5408	6.75	6.92	4.25
2710	0.31	Ideal	G	SI1	61.6	55.0	563	4.34	4.37	2.68
25448	1.50	Premium	F	VS2	61.1	58.0	14199	7.36	7.33	4.49

diamonds >> sample(n=3, replace=True)

	carat	cut	color	clarity	depth	table	price	x	y	z
13775	1.70	Fair	D	I1	64.7	56.0	5617	7.46	7.37	4.80
6043	0.32	Ideal	E	SI2	62.0	55.0	576	4.41	4.36	2.72
33208	0.33	Ideal	H	IF	61.6	55.0	820	4.44	4.47	2.74

diamonds >> distinct(X.color)

	carat	cut	color	clarity	depth	table	price	x	y	z
0	0.23	Ideal	E	SI2	61.5	55.0	326	3.95	3.98	2.43
3	0.29	Premium	I	VS2	62.4	58.0	334	4.20	4.23	2.63
4	0.31	Good	J	SI2	63.3	58.0	335	4.34	4.35	2.75
7	0.26	Very Good	H	SI1	61.9	55.0	337	4.07	4.11	2.53
12	0.22	Premium	F	SI1	60.4	61.0	342	3.88	3.84	2.33
25	0.23	Very Good	G	VVS2	60.4	58.0	354	3.97	4.01	2.41
28	0.23	Very Good	D	VS2	60.5	61.0	357	3.96	3.97	2.40

(diamonds
 >> filter_by(X.cut == 'Ideal', X.color == 'E', X.table < 55, X.price < 500)
 >> pull('carat'))

c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\subset.py:87: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  return df.ix[:, column]





26683    0.33
32297    0.34
40928    0.30
50623    0.30
50625    0.30
Name: carat, dtype: float64

diamonds >> mutate(x_plus_y=X.x + X.y) >> select(columns_from('x')) >> head(3)

	x	y	z	x_plus_y
0	3.95	3.98	2.43	7.93
1	3.89	3.84	2.31	7.73
2	4.05	4.07	2.31	8.12

diamonds >> mutate(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> select(columns_from('x')) >> head(3)

	x	y	z	x_plus_y	y_div_z
0	3.95	3.98	2.43	7.93	1.637860
1	3.89	3.84	2.31	7.73	1.662338
2	4.05	4.07	2.31	8.12	1.761905

diamonds >> transmute(x_plus_y=X.x + X.y, y_div_z=(X.y / X.z)) >> head(3)

	x_plus_y	y_div_z
0	7.93	1.637860
1	7.73	1.662338
2	8.12	1.761905

diamonds >> arrange(X.table, ascending=False) >> head(5)

	carat	cut	color	clarity	depth	table	price	x	y	z
24932	2.01	Fair	F	SI1	58.6	95.0	13387	8.32	8.31	4.87
50773	0.81	Fair	F	SI2	68.8	79.0	2301	5.26	5.20	3.58
51342	0.79	Fair	G	SI1	65.3	76.0	2362	5.52	5.13	3.35
52860	0.50	Fair	E	VS2	79.0	73.0	2579	5.21	5.18	4.09
49375	0.70	Fair	H	VS1	62.0	73.0	2100	5.65	5.54	3.47

(diamonds >> group_by(X.cut) >> arrange(X.price) >>
 head(3) >> ungroup() >> mask(X.carat < 0.23))

	carat	cut	color	clarity	depth	table	price	x	y	z
8	0.22	Fair	E	VS2	65.1	61.0	337	3.87	3.78	2.49
1	0.21	Premium	E	SI1	59.8	61.0	326	3.89	3.84	2.31
12	0.22	Premium	F	SI1	60.4	61.0	342	3.88	3.84	2.33

diamonds >> rename(CUT=X.cut, COLOR='color') >> head(2)

	carat	CUT	COLOR	clarity	depth	table	price	x	y	z
0	0.23	Ideal	E	SI2	61.5	55.0	326	3.95	3.98	2.43
1	0.21	Premium	E	SI1	59.8	61.0	326	3.89	3.84	2.31

diamonds >> gather('variable', 'value', ['price', 'depth','x','y','z']) >> head(5)
#gather（）函数将DataFrame中的指定列融合为两个键 ：variable和value。

	carat	cut	color	clarity	table	variable	value
0	0.23	Ideal	E	SI2	55.0	price	326.0
1	0.21	Premium	E	SI1	61.0	price	326.0
2	0.23	Good	E	VS1	65.0	price	327.0
3	0.29	Premium	I	VS2	58.0	price	334.0
4	0.31	Good	J	SI2	58.0	price	335.0

diamonds >> gather('variable', 'value') >> head(5)

	variable	value
0	carat	0.23
1	carat	0.21
2	carat	0.23
3	carat	0.29
4	carat	0.31

,,,
print(d)

a  b      c
0  1  a   True
1  2  b  False
2  3  c    NaN

d >> unite('united', X.a, 'b', 2, remove=False, na_action='maintain')

   a  b      c     united
0  1  a   True   1_a_True
1  2  b  False  2_b_False
2  3  c    NaN        NaN

d >> unite('united', ['a','b','c'], remove=True, na_action='ignore', sep='*')

      united
0   1*a*True
1  2*b*False
2        3*c

d >> unite('united', d.columns, remove=True, na_action='as_string')

      united
0   1_a_True
1  2_b_False
2    3_c_nan
''''''

  File "<ipython-input-84-a4809ea807f5>", line 4
    a  b      c
       ^
SyntaxError: invalid syntax

a = pd.DataFrame({
        'x1':['A','B','C'],
        'x2':[1,2,3]
    })
b = pd.DataFrame({
    'x1':['A','B','D'],
    'x3':[True,False,True]
})

	x1	x2
0	A	1
1	B	2
2	C	3

	x1	x3
0	A	True
1	B	False
2	D	True

a >> inner_join(b, by='x1')

	x1	x2	x3
0	A	1	True
1	B	2	False

a >> outer_join(b, by='x1')

	x1	x2	x3
0	A	1.0	True
1	B	2.0	False
2	C	3.0	NaN
3	D	NaN	True

a >> left_join(b, by='x1')

	x1	x2	x3
0	A	1	True
1	B	2	False
2	C	3	NaN

a >> right_join(b, by='x1')

	x1	x2	x3
0	A	1.0	True
1	B	2.0	False
2	D	NaN	True

a >> semi_join(b, by='x1')
#returns all of the rows in the left DataFrame that have a match in the right DataFrame in the by columns.

	x1	x2
0	A	1
1	B	2

a >> anti_join(b, by='x1')

	x1	x2
2	C	3

a >> bind_rows(b, join='inner')

	x1
0	A
1	B
2	C
0	A
1	B
2	D

a >> bind_rows(b, join='outer')

c:\users\administer\appdata\local\programs\python\python36-32\lib\site-packages\dfply\join.py:279: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  df = pd.concat([df, other], join=join, ignore_index=ignore_index, axis=0)

	x1	x2	x3
0	A	1.0	NaN
1	B	2.0	NaN
2	C	3.0	NaN
0	A	NaN	True
1	B	NaN	False
2	D	NaN	True

a >> bind_cols(b)

	x1	x2	x1	x3
0	A	1	A	True
1	B	2	B	False
2	C	3	D	True

diamonds >> summarize(price_mean=X.price.mean(), price_std=X.price.std())

	price_mean	price_std
0	3932.799722	3989.439738

diamonds >> group_by('cut') >> summarize(price_mean=X.price.mean(), price_std=X.price.std())
#summarize() 可以很好的搭配group_by函数

	cut	price_mean	price_std
0	Fair	4358.757764	3560.386612
1	Good	3928.864452	3681.589584
2	Ideal	3457.541970	3808.401172
3	Premium	4584.257704	4349.204961
4	Very Good	3981.759891	3935.862161

#summarize_each(function_list, *columns)更常用，它可以比较方便的计算数据框中的列的一些属性。
diamonds >> summarize_each([np.mean, np.var], X.price, 'depth')

	price_mean	price_var	depth_mean	depth_var
0	3932.799722	1.591533e+07	61.749405	2.052366

diamonds >> group_by(X.cut) >> summarize_each([np.mean, np.var], X.price, 4)

	cut	price_mean	price_var	depth_mean	depth_var
0	Fair	4358.757764	1.266848e+07	64.041677	13.266319
1	Good	3928.864452	1.355134e+07	62.365879	4.705224
2	Ideal	3457.541970	1.450325e+07	61.709401	0.516274
3	Premium	4584.257704	1.891421e+07	61.264673	1.342755
4	Very Good	3981.759891	1.548973e+07	61.818275	1.900466

(diamonds >> mutate(price_lead=lead(X.price, 2), price_lag=lag(X.price, 2)) >>
            select(X.price, -2, -1) >>
            head(6))

#lead（series，n）函数向上推动向量中的值，在末尾位置添加NaN值。 同样，lag函数向下推动值，在初始位置插入NaN值。

	price	price_lead	price_lag
0	326	327.0	NaN
1	326	334.0	NaN
2	327	335.0	326.0
3	334	336.0	326.0
4	335	336.0	327.0
5	336	337.0	334.0

diamonds >> select(X.price) >> mutate(price_btwn=between(X.price, 330, 340)) >> head(6)

	price	price_btwn
0	326	False
1	326	False
2	327	False
3	334	True
4	335	True
5	336	True

#dense_rank（） 函数
#计算重复的排序值或者秩
diamonds >> select(X.price) >> mutate(price_drank=dense_rank(X.price)) >> head(6)

	price	price_drank
0	326	1.0
1	326	1.0
2	327	2.0
3	334	3.0
4	335	4.0
5	336	5.0

diamonds >> select(X.price) >> mutate(price_mrank=min_rank(X.price)) >> head(6)

	price	price_mrank
0	326	1.0
1	326	1.0
2	327	3.0
3	334	4.0
4	335	5.0
5	336	6.0

diamonds >> select(X.price) >> mutate(price_cumsum=cumsum(X.price)) >> head(6)

	price	price_cumsum
0	326	326
1	326	652
2	327	979
3	334	1313
4	335	1648
5	336	1984

diamonds >> select(X.price) >> mutate(price_cummean=cummean(X.price)) >> head(6)
#cummean() 函数计算列的累积平均值。

	price	price_cummean
0	326	326.000000
1	326	326.000000
2	327	326.333333
3	334	328.250000
4	335	329.600000
5	336	330.666667

diamonds >> select(X.price) >> mutate(price_cummax=cummax(X.price)) >> head(6)

	price	price_cummax
0	326	326.0
1	326	326.0
2	327	327.0
3	334	334.0
4	335	335.0
5	336	336.0

diamonds >> select(X.price) >> mutate(price_cummin=cummin(X.price)) >> head(6)

	price	price_cummin
0	326	326.0
1	326	326.0
2	327	326.0
3	334	326.0
4	335	326.0
5	336	326.0

qq_39306047

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
dfply

from dfply import data diamonds = data.diamonds import pandas as pdprint(diamonds.loc[:,['cut','price','x']].head(3)) print('------------------------------------------') print(diamonds[['cut...
复制链接

扫一扫

专栏目录