import numpy as np
import pandas as pd
Ex1:汽车数据集
现有一份汽车数据集,其中 Brand, Disp., HP
分别代表汽车品牌、发动机排量(Displacement)、发动机马力(Horsepower)。
# Build the three-row sample car frame and persist it for the later exercises.
records = [
    ['Eagle Summit 4', 8895, 'USA', 4.0, 33, 'Small', 2560, 97, 113],
    ['Ford Escort 4', 7402, 'USA', 2.0, 33, 'Small', 2345, 114, 90],
    ['Ford Festiva 4', 6319, 'Korea', 4.0, 37, 'Small', 1845, 81, 63],
]
col_names = ['Brand', 'Price', 'Country', 'Reliability', 'Mileage',
             'Type', 'Weight', 'Disp.', 'HP']
df = pd.DataFrame(data=records, index=np.arange(3), columns=col_names)
df.to_csv('.//car.csv')
1.先过滤出所属Country数超过2个的汽车,即若该汽车的Country在总体数据集中出现次数不超过2则剔除,再按Country分组计算价格均值、价格变异系数、该Country的汽车数量,其中变异系数的计算方法是标准差除以均值,并在结果中把变异系数重命名为CoV。
# Q1: group by Country, then inspect the Price mean, count, and the
# coefficient of variation (std divided by mean) per group.
gb = df.groupby(['Country'])
# NOTE(review): the exercise text says drop countries appearing no more than
# twice; ">= 1" keeps every row of this 3-row sample — confirm the threshold.
gbMoreT = gb.filter(lambda grp: grp.shape[0] >= 1)
gb = gb['Price']
gb.mean()
gb.count()
gb.std() / gb.mean()
Country
Korea NaN
USA 0.129559
Name: Price, dtype: float64
gb. agg( [ ( 'CoV' , lambda x : x. std( ) / x. mean( ) ) ] )
CoV Country Korea NaN USA 0.129559
# Re-run the pipeline, printing the type at each step to show how
# gb.filter changes the object's type.
gb = df.groupby(['Country'])
print(type(gb))  # DataFrameGroupBy
gb = gb.filter(lambda x: x.shape[0] >= 1)
print(type(gb))  # filter returns a plain DataFrame — the grouping is gone
gb = gb['Price']
print(type(gb))  # plain Series, not a SeriesGroupBy
# Raises AttributeError (see traceback below): the [('CoV', func), ...]
# list form is a groupby aggregation spec, invalid on a plain Series.
gb.agg([('CoV', lambda x: x.std() / x.mean()), 'mean', 'count'])
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
AttributeError: 'CoV' is not a valid function for 'Series' object
# Same steps without filter: gb stays a SeriesGroupBy, so the
# (name, func) aggregation list works as intended.
gb = df.groupby(['Country'])
print(type(gb))  # DataFrameGroupBy
gb = gb['Price']
print(type(gb))  # SeriesGroupBy
gb.agg([('CoV', lambda x: x.std() / x.mean()), 'mean', 'count'])
pandas.core.groupby.generic.DataFrameGroupBy
pandas.core.groupby.generic.SeriesGroupBy
CoV mean count Country Korea NaN 6319.0 1 USA 0.129559 8148.5 2
== 注意 == 经过 gb.filter 后,变量的 type() 会改变:DataFrameGroupBy 经
filter 后变为普通的 DataFrame,需要重新 groupby 才能继续做分组聚合。
df. groupby( 'Country' ) . filter ( lambda x: x. shape[ 0 ] >= 1 ) . groupby( 'Country' ) [ 'Price' ] . agg( [ ( 'CoV' , lambda x: x. std( ) / x. mean( ) ) , 'mean' , 'count' ] )
CoV mean count Country Korea NaN 6319.0 1 USA 0.129559 8148.5 2
2. 按照表中位置的前三分之一、中间三分之一和后三分之一分组,统计Price的均值。
df
Brand Price Country Reliability Mileage Type Weight Disp. HP 0 Eagle Summit 4 8895 USA 4.0 33 Small 2560 97 113 1 Ford Escort 4 7402 USA 2.0 33 Small 2345 114 90 2 Ford Festiva 4 6319 Korea 4.0 37 Small 1845 81 63
# Q2: split rows into first/middle/last thirds by position. Each boolean
# array flags the rows past one third-boundary, so grouping by the three
# arrays jointly yields one group per third.
past_first_cut = df.index > 0
past_one_third = df.index > len(df) / 3
past_two_thirds = df.index > len(df) / 3 * 2
df.groupby([past_first_cut, past_one_third, past_two_thirds])['Price'].mean()
False False False 8895
True False False 7402
True False 6319
Name: Price, dtype: int64
# Q2 with explicit labels: reload the saved csv, tag each positional third
# with a name, and group by that label list.
df = pd.read_csv('.//car.csv')
# one row per third in this 3-row sample; scale the repeat counts for larger frames
part_labels = ['Head'] * 1 + ['Mid'] * 1 + ['Tail'] * 1
df.groupby(part_labels)['Price'].mean()
Head 8895
Mid 7402
Tail 6319
Name: Price, dtype: int64
3.对类型Type分组,对Price和HP分别计算最大值和最小值,结果会产生多级索引,请用下划线把多级列索引合并为单层索引。
# Q3: per-Type max/min of Price and HP, flattening the resulting
# MultiIndex columns with '_'. Indexing the groupby with a bare tuple
# ('Price', 'HP') is deprecated (FutureWarning below) — the list
# spelling is shown further down.
gb = df.groupby('Type')['Price', 'HP']
gb = gb.agg(['max', 'min'])
# each column label is a (column, stat) pair; join the pair with '_'
gbcol = gb.columns.map(lambda x: x[0] + '_' + x[1])
gb.columns = gbcol
gb
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
This is separate from the ipykernel package so we can avoid doing imports until
Price_max Price_min HP_max HP_min Type Small 8895 6319 113 63
# Dict-agg variant: one stat per column, then join the two-level labels.
res = df.groupby('Type').agg({'Price': ['max'], 'HP': ['min']})
res.columns = res.columns.map('_'.join)
res
Price_max HP_min Type Small 8895 63
gb = df.groupby('Type')['Price','HP']
给出警告的原因:gb = df.groupby('Type')['Price','HP'] 用元组同时索引多列已被弃用,应改用列表:gb = df.groupby('Type')[['Price','HP']]。
# Q3 done properly: select the columns with a list, aggregate, and
# collapse the (column, stat) MultiIndex into single '_'-joined labels.
gb = df.groupby('Type')[['Price', 'HP']]
gb = gb.agg(['max', 'min'])
gbcol = ['_'.join(pair) for pair in gb.columns]
gb.columns = gbcol
gb
Price_max Price_min HP_max HP_min Type Small 8895 6319 113 63
4.对类型Type分组,对HP进行组内的min-max归一化。
# Q4 first attempt: agg with a per-group min-max scaler.
gb = df.groupby('Type')['HP']
gb = gb.agg(lambda x: (x - x.min()) / (x.max() - x.min()))
gb
# (raises ValueError "Must produce aggregated value" — agg expects each
# group to reduce to a scalar, but this lambda returns a group-length Series)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
ValueError: Must produce aggregated value
因为 agg 要求每组必须返回聚合后的标量("Must produce aggregated value"),
而这里的函数返回与组等长的序列,所以会报错;这类组内逐元素的操作应使用 transform。
# Q4: transform broadcasts the group-wise min-max scaling back onto the rows.
gb = df.groupby('Type')['HP']
gb = gb.transform(lambda s: (s - s.min()) / (s.max() - s.min()))
gb
def normalize(s):
    """Min-max scale *s* into [0, 1] relative to its own extremes."""
    lo, hi = s.min(), s.max()
    return (s - lo) / (hi - lo)
df. groupby( 'Type' ) [ 'HP' ] . transform( normalize) . head( )
0 1.00
1 0.54
2 0.00
Name: HP, dtype: float64
5.对类型Type分组,计算Disp.与HP的相关系数。
gb = df. groupby( 'Type' )
def Correlation(s):
    """Pearson correlation of s['Disp.'] and s['HP'].

    Computed as covariance over the product of the two standard
    deviations (np.cov and Series.std both use N-1 normalization).
    """
    covariance = np.cov(s['Disp.'], s['HP'])[0, 1]
    return covariance / (s['Disp.'].std() * s['HP'].std())
gb. apply ( Correlation)
Type
Small 0.524613
dtype: float64
df. groupby( 'Type' ) [ [ 'HP' , 'Disp.' ] ] . apply ( lambda x: np. corrcoef( x[ 'HP' ] . values, x[ 'Disp.' ] . values) [ 0 , 1 ] )
Type
Small 0.524613
dtype: float64
协方差:np.cov;相关系数:np.corrcoef。
$$r(X,Y)=\frac{\mathrm{Cov}(X,Y)}{\sqrt{\mathrm{Var}(X)\,\mathrm{Var}(Y)}}$$
Ex2:实现transform函数
groupby对象的构造方法是my_groupby(df, group_cols) 支持单列分组与多列分组 支持带有标量广播的my_groupby(df)[col].transform(my_func)功能 pandas的transform不能跨列计算,请支持此功能,即仍返回Series但col参数为多列 无需考虑性能与异常处理,只需实现上述功能,在给出测试样例的同时与pandas中的transform对比结果是否一致
def f(s):
    """Example aggregation for the transform tests: the minimum of *s*."""
    return s.min()
class my_groupby:
    """Minimal re-implementation of groupby(...)[col(s)].transform(func).

    `my_df` is the frame to group; `group_cols` is the grouping key — a
    single column name or a list of names. Unlike pandas' transform it
    also accepts several selected columns, and it broadcasts a scalar
    result over each group's rows.
    """

    def __init__(self, my_df, group_cols):
        # work on a copy so transform never mutates the caller's frame
        self.my_df = my_df.copy()
        # unique key combinations, one row per group
        self.groups = my_df[group_cols].drop_duplicates()
        if isinstance(self.groups, pd.Series):
            # single-column key: normalize to a one-column frame
            self.groups = self.groups.to_frame()
        self.group_cols = self.groups.columns.tolist()
        # {key column -> list holding that column's value for each group}
        self.groups = {i: self.groups[i].values.tolist() for i in self.groups.columns}
        self.transform_col = None

    def __getitem__(self, col):
        # remember which column(s) to transform; always stored as a list
        self.pr_col = [col] if isinstance(col, str) else list(col)
        return self

    def transform(self, my_func):
        """Apply my_func per group; return a Series aligned to my_df's rows."""
        # number of distinct groups
        self.num = len(self.groups[self.group_cols[0]])
        # L_order collects original row positions, L_value the per-row results
        L_order, L_value = np.array([]), np.array([])
        for i in range(self.num):
            # reset_index exposes positional order for the final realignment
            group_df = self.my_df.reset_index().copy()
            # narrow to the i-th group's rows, one key column at a time
            for col in self.group_cols:
                group_df = group_df[group_df[col] == self.groups[col][i]]
            group_df = group_df[self.pr_col]
            if group_df.shape[1] == 1:
                # single selected column: hand the function a Series
                group_df = group_df.iloc[:, 0]
            group_res = my_func(group_df)
            print(type(group_res))
            if not isinstance(group_res, pd.Series):
                # scalar result: broadcast it over the group's row index
                # NOTE(review): group_df.name assumes a Series here — a
                # multi-column selection returning a scalar would fail; confirm
                group_res = pd.Series(group_res, index=group_df.index, name=group_df.name)
            L_order = np.r_[L_order, group_res.index]
            L_value = np.r_[L_value, group_res.values]
        # sort the collected values back into the frame's row order
        self.res = pd.Series(pd.Series(L_value, index=L_order).sort_index().values, index=self.my_df.reset_index().index, name=my_func.__name__)
        return self.res