import numpy as np
import pandas as pd
df = pd. read_csv( r'D:\study\pandas\data\table.csv' )
df. head( )
School Class ID Gender Address Height Weight Math Physics 0 S_1 C_1 1101 M street_1 173 63 34.0 A+ 1 S_1 C_1 1102 F street_2 192 73 32.5 B+ 2 S_1 C_1 1103 M street_2 186 82 87.2 B+ 3 S_1 C_1 1104 F street_2 167 81 80.4 B- 4 S_1 C_1 1105 F street_4 159 64 84.8 B+
一、透视表
df. pivot( index= 'ID' , columns= 'Gender' , values= 'Height' ) . head( )
Gender F M ID 1101 NaN 173.0 1102 192.0 NaN 1103 NaN 186.0 1104 167.0 NaN 1105 159.0 NaN
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-bbaf656d9dae> in <module>
1 #pivot函数具有很强的局限性,除了功能上较少之外,还不允许values中出现重复的行列索引对(pair)
2 #例如下面的语句会报错
----> 3 df.pivot(index='School',columns='Gender',values='Height').head()
D:\study\anaconda\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5917 from pandas.core.reshape.pivot import pivot
5918
-> 5919 return pivot(self, index=index, columns=columns, values=values)
5920
5921 _shared_docs[
D:\study\anaconda\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
428 else:
429 indexed = data._constructor_sliced(data[values].values, index=index)
--> 430 return indexed.unstack(columns)
431
432
D:\study\anaconda\lib\site-packages\pandas\core\series.py in unstack(self, level, fill_value)
3743 from pandas.core.reshape.reshape import unstack
3744
-> 3745 return unstack(self, level, fill_value)
3746
3747 # ----------------------------------------------------------------------
D:\study\anaconda\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
421 level=level,
422 fill_value=fill_value,
--> 423 constructor=obj._constructor_expanddim,
424 )
425 return unstacker.get_result()
D:\study\anaconda\lib\site-packages\pandas\core\reshape\reshape.py in __init__(self, values, index, level, value_columns, fill_value, constructor)
140
141 self._make_sorted_values_labels()
--> 142 self._make_selectors()
143
144 def _make_sorted_values_labels(self):
D:\study\anaconda\lib\site-packages\pandas\core\reshape\reshape.py in _make_selectors(self)
178
179 if mask.sum() < len(self.index):
--> 180 raise ValueError("Index contains duplicate entries, " "cannot reshape")
181
182 self.group_index = comp_index
ValueError: Index contains duplicate entries, cannot reshape
pd. pivot_table( df, index= 'ID' , columns= 'Gender' , values= 'Height' ) . head( )
Gender F M ID 1101 NaN 173.0 1102 192.0 NaN 1103 NaN 186.0 1104 167.0 NaN 1105 159.0 NaN
% timeit df. pivot( index= 'ID' , columns= 'Gender' , values= 'Height' )
% timeit pd. pivot_table( df, index= 'ID' , columns= 'Gender' , values= 'Height' )
1.42 ms ± 15.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
6.13 ms ± 123 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
pd. pivot_table( df, index= 'School' , columns= 'Gender' , values= 'Height' , aggfunc= [ 'mean' , 'sum' ] ) . head( )
mean sum Gender F M F M School S_1 173.125000 178.714286 1385 1251 S_2 173.727273 172.000000 1911 1548
pd. pivot_table( df, index= 'School' , columns= 'Gender' , values= 'Height' , aggfunc= [ 'mean' , 'sum' ] , margins= True ) . head( )
mean sum Gender F M All F M All School S_1 173.125000 178.714286 175.733333 1385 1251 2636 S_2 173.727273 172.000000 172.950000 1911 1548 3459 All 173.473684 174.937500 174.142857 3296 2799 6095
pd. pivot_table( df, index= [ 'School' , 'Class' ] ,
columns= [ 'Gender' , 'Address' ] ,
values= [ 'Height' , 'Weight' ] )
Height ... Weight Gender F M ... F M Address street_1 street_2 street_4 street_5 street_6 street_7 street_1 street_2 street_4 street_5 ... street_4 street_5 street_6 street_7 street_1 street_2 street_4 street_5 street_6 street_7 School Class S_1 C_1 NaN 179.5 159.0 NaN NaN NaN 173.0 186.0 NaN NaN ... 64.0 NaN NaN NaN 63.0 82.0 NaN NaN NaN NaN C_2 NaN NaN 176.0 162.0 167.0 NaN NaN NaN NaN 188.0 ... 94.0 63.0 63.0 NaN NaN NaN NaN 68.0 53.0 NaN C_3 175.0 NaN NaN 187.0 NaN NaN NaN 195.0 161.0 NaN ... NaN 69.0 NaN NaN NaN 70.0 68.0 NaN NaN 82.0 S_2 C_1 NaN NaN NaN 159.0 161.0 NaN NaN NaN 163.5 NaN ... NaN 97.0 61.0 NaN NaN NaN 71.0 NaN NaN 84.0 C_2 NaN NaN NaN NaN NaN 188.5 175.0 NaN 155.0 193.0 ... NaN NaN NaN 76.5 74.0 NaN 91.0 100.0 NaN NaN C_3 NaN NaN 157.0 NaN 164.0 190.0 NaN NaN 187.0 171.0 ... 78.0 NaN 81.0 99.0 NaN NaN 73.0 88.0 NaN NaN C_4 NaN 176.0 NaN NaN 175.5 NaN NaN NaN NaN NaN ... NaN NaN 57.0 NaN NaN NaN NaN NaN NaN 82.0
7 rows × 24 columns
pd. crosstab( index= df[ 'Address' ] , columns= df[ 'Gender' ] )
Gender F M Address street_1 1 2 street_2 4 2 street_4 3 5 street_5 3 3 street_6 5 1 street_7 3 3
pd. crosstab( index= df[ 'Address' ] , columns= df[ 'Gender' ] ,
values= np. random. randint( 1 , 20 , df. shape[ 0 ] ) , aggfunc= 'min' )
Gender F M Address street_1 1 6 street_2 6 3 street_4 6 9 street_5 6 2 street_6 15 10 street_7 9 6
pd. crosstab( index= df[ 'Address' ] , columns= df[ 'Gender' ] , normalize= 'all' , margins= True )
Gender F M All Address street_1 0.028571 0.057143 0.085714 street_2 0.114286 0.057143 0.171429 street_4 0.085714 0.142857 0.228571 street_5 0.085714 0.085714 0.171429 street_6 0.142857 0.028571 0.171429 street_7 0.085714 0.085714 0.171429 All 0.542857 0.457143 1.000000
二、其他变形方法
df_m = df[ [ 'ID' , 'Gender' , 'Math' ] ]
df_m. head( )
ID Gender Math 0 1101 M 34.0 1 1102 F 32.5 2 1103 M 87.2 3 1104 F 80.4 4 1105 F 84.8
df. pivot( index= 'ID' , columns= 'Gender' , values= 'Math' ) . head( )
Gender F M ID 1101 NaN 34.0 1102 32.5 NaN 1103 NaN 87.2 1104 80.4 NaN 1105 84.8 NaN
pivoted = df. pivot( index= 'ID' , columns= 'Gender' , values= 'Math' )
result = pivoted. reset_index( ) . melt( id_vars= [ 'ID' ] , value_vars= [ 'F' , 'M' ] , value_name= 'Math' ) \
. dropna( ) . set_index( 'ID' ) . sort_index( )
result. equals( df_m. set_index( 'ID' ) )
True
df_s = pd. pivot_table( df, index= [ 'Class' , 'ID' ] , columns= 'Gender' , values= [ 'Height' , 'Weight' ] )
df_s. groupby( 'Class' ) . head( 2 )
Height Weight Gender F M F M Class ID C_1 1101 NaN 173.0 NaN 63.0 1102 192.0 NaN 73.0 NaN C_2 1201 NaN 188.0 NaN 68.0 1202 176.0 NaN 94.0 NaN C_3 1301 NaN 161.0 NaN 68.0 1302 175.0 NaN 57.0 NaN C_4 2401 192.0 NaN 62.0 NaN 2402 NaN 166.0 NaN 82.0
df_stacked = df_s. stack( )
df_stacked. groupby( 'Class' ) . head( 2 )
Height Weight Class ID Gender C_1 1101 M 173.0 63.0 1102 F 192.0 73.0 C_2 1201 M 188.0 68.0 1202 F 176.0 94.0 C_3 1301 M 161.0 68.0 1302 F 175.0 57.0 C_4 2401 F 192.0 62.0 2402 M 166.0 82.0
df_stacked = df_s. stack( 0 )
df_stacked. groupby( 'Class' ) . head( 2 )
Gender F M Class ID C_1 1101 Height NaN 173.0 Weight NaN 63.0 C_2 1201 Height NaN 188.0 Weight NaN 68.0 C_3 1301 Height NaN 161.0 Weight NaN 68.0 C_4 2401 Height 192.0 NaN Weight 62.0 NaN
df_stacked. head( )
Gender F M Class ID C_1 1101 Height NaN 173.0 Weight NaN 63.0 1102 Height 192.0 NaN Weight 73.0 NaN 1103 Height NaN 186.0
result = df_stacked. unstack( ) . swaplevel( 1 , 0 , axis= 1 ) . sort_index( axis= 1 )
result. equals( df_s)
True
三、哑变量与因子化
df_d = df[ [ 'Class' , 'Gender' , 'Weight' ] ]
df_d. head( )
Class Gender Weight 0 C_1 M 63 1 C_1 F 73 2 C_1 M 82 3 C_1 F 81 4 C_1 F 64
pd. get_dummies( df_d[ [ 'Class' , 'Gender' ] ] ) . join( df_d[ 'Weight' ] ) . head( )
Class_C_1 Class_C_2 Class_C_3 Class_C_4 Gender_F Gender_M Weight 0 1 0 0 0 0 1 63 1 1 0 0 0 1 0 73 2 1 0 0 0 0 1 82 3 1 0 0 0 1 0 81 4 1 0 0 0 1 0 64
codes, uniques = pd. factorize( [ 'b' , None , 'a' , 'c' , 'b' ] , sort= True )
display( codes)
display( uniques)
array([ 1, -1, 0, 2, 1], dtype=int64)
array(['a', 'b', 'c'], dtype=object)
问题 【问题一】 上面提到了许多变形函数,如melt/crosstab/pivot/pivot_table/stack/unstack函数,请总结它们各自的使用特点。 【问题二】 变形函数和多级索引是什么关系?哪些变形函数会使得索引维数变化?具体如何变化? 【问题三】 请举出一个除了上文提过的关于哑变量方法的例子。 【问题四】 使用完stack后立即使用unstack一定能保证变化结果与原始表完全一致吗? 【问题五】 透视表中涉及了三个函数,请分别使用它们完成相同的目标(任务自定)并比较哪个速度最快。 【问题六】 既然melt起到了stack的功能,为什么再设计stack函数?
df = pd. read_csv( r'D:\study\pandas\data\Drugs.csv' )
df. head( )
YYYY State COUNTY SubstanceName DrugReports 0 2010 VA ACCOMACK Propoxyphene 1 1 2010 OH ADAMS Morphine 9 2 2010 PA ADAMS Methadone 2 3 2010 VA ALEXANDRIA CITY Heroin 5 4 2010 PA ALLEGHENY Hydromorphone 5
result = pd. pivot_table( df, index= [ 'State' , 'COUNTY' , 'SubstanceName' ] , columns= 'YYYY'
, values= 'DrugReports' , fill_value= '-' ) . reset_index( ) . rename_axis( columns= { 'YYYY' : '' } )
result. head( )
State COUNTY SubstanceName 2010 2011 2012 2013 2014 2015 2016 2017 0 KY ADAIR Buprenorphine - 3 5 4 27 5 7 10 1 KY ADAIR Codeine - - 1 - - - - 1 2 KY ADAIR Fentanyl - - 1 - - - - - 3 KY ADAIR Heroin - - 1 2 - 1 - 2 4 KY ADAIR Hydrocodone 6 9 10 10 9 7 11 3
result_melted = result. melt( id_vars= result. columns[ : 3 ] , value_vars= result. columns[ - 8 : ]
, var_name= 'YYYY' , value_name= 'DrugReports' ) . query( 'DrugReports !="-"' )
result2 = result_melted. sort_values( by= [ 'State' , 'COUNTY' , 'YYYY' , 'SubstanceName' ] ) . \
reset_index( ) . drop( columns= 'index' )
cols = list ( result2. columns)
a, b= cols. index( 'SubstanceName' ) , cols. index( 'YYYY' )
cols[ b] , cols[ a] = cols[ a] , cols[ b]
reslut = result2[ cols] . astype( { 'DrugReports' : 'int' , 'YYYY' : 'int' } )
result2. head( )
State COUNTY SubstanceName YYYY DrugReports 0 KY ADAIR Hydrocodone 2010 6 1 KY ADAIR Methadone 2010 1 2 KY ADAIR Buprenorphine 2011 3 3 KY ADAIR Hydrocodone 2011 9 4 KY ADAIR Morphine 2011 2
df_tidy = df. reset_index( ) . sort_values( by= result2. columns[ : 4 ] . tolist( ) ) . reset_index( ) . drop( columns= 'index' )
df_tidy. head( )
level_0 YYYY State COUNTY SubstanceName DrugReports 0 2731 2011 KY ADAIR Buprenorphine 3 1 5319 2012 KY ADAIR Buprenorphine 5 2 8782 2013 KY ADAIR Buprenorphine 4 3 12163 2014 KY ADAIR Buprenorphine 27 4 13645 2015 KY ADAIR Buprenorphine 5
df_tidy. equals( result2)
False
df = pd. read_csv( r'D:\study\pandas\data\Earthquake.csv' )
df = df. sort_values( by= df. columns. tolist( ) [ : 3 ] ) . sort_index( axis= 1 ) . reset_index( ) . drop( columns= 'index' )
df. head( )
方向 日期 时间 深度 烈度 经度 维度 距离 0 south_east 1912.08.09 12:29:00 AM 16.0 6.7 27.2 40.6 4.3 1 south_west 1912.08.10 12:23:00 AM 15.0 6.0 27.1 40.6 2.0 2 south_west 1912.08.10 12:30:00 AM 15.0 5.2 27.1 40.6 2.0 3 south_east 1912.08.11 12:19:04 AM 30.0 4.9 27.2 40.6 4.3 4 south_west 1912.08.11 12:20:00 AM 15.0 4.5 27.1 40.6 2.0
result = pd. pivot_table( df, index= [ '日期' , '时间' , '维度' , '经度' ] , columns= '方向'
, values= [ '烈度' , '深度' , '距离' ] , fill_value= '-' ) \
. stack( level= 0 ) . rename_axis( index= { None : '地震参数' } )
result. head( 6 )
方向 east north north_east north_west south south_east south_west west 日期 时间 维度 经度 地震参数 1912.08.09 12:29:00 AM 40.6 27.2 深度 - - - - - 16 - - 烈度 - - - - - 6.7 - - 距离 - - - - - 4.3 - - 1912.08.10 12:23:00 AM 40.6 27.1 深度 - - - - - - 15 - 烈度 - - - - - - 6 - 距离 - - - - - - 2 -
df_result = result. unstack( ) . stack( 0 ) [ ( ~ ( result. unstack( ) . stack( 0 ) == '-' ) ) . any ( 1 ) ] . reset_index( )
df_result. columns. name= None
df_result = df_result. sort_index( axis= 1 ) . astype( { '深度' : 'float64' , '烈度' : 'float64' , '距离' : 'float64' } )
df_result. head( )
方向 日期 时间 深度 烈度 经度 维度 距离 0 south_east 1912.08.09 12:29:00 AM 16.0 6.7 27.2 40.6 4.3 1 south_west 1912.08.10 12:23:00 AM 15.0 6.0 27.1 40.6 2.0 2 south_west 1912.08.10 12:30:00 AM 15.0 5.2 27.1 40.6 2.0 3 south_east 1912.08.11 12:19:04 AM 30.0 4.9 27.2 40.6 4.3 4 south_west 1912.08.11 12:20:00 AM 15.0 4.5 27.1 40.6 2.0
df_result. astype( { '深度' : 'float64' , '烈度' : 'float64' , '距离' : 'float64' } , copy= False ) . dtypes
方向 object
日期 object
时间 object
深度 float64
烈度 float64
经度 float64
维度 float64
距离 float64
dtype: object
df. equals( df_result)
True