import numpy as np
import pandas as pd
一、长宽表的变形
长表可以理解为某个特征单独作为列索引;而宽表可以以分组的思想去理解,值是其他特征的,但是这个值还包含着某个特征的影响在里面。
1.pivot
df = pd. DataFrame( { 'Class' : [ 1 , 1 , 2 , 2 ] ,
'Name' : [ 'San Zhang' , 'San Zhang' , 'Si Li' , 'Si Li' ] ,
'Subject' : [ 'Chinese' , 'Math' , 'Chinese' , 'Math' ] ,
'Grade' : [ 80 , 75 , 90 , 85 ] } )
df
Class Name Subject Grade 0 1 San Zhang Chinese 80 1 1 San Zhang Math 75 2 2 Si Li Chinese 90 3 2 Si Li Math 85
目标是将语文和数学分数作为列来展示。其实也就是将subject列的值作为索引然后来将grade进行对应填值。对于长变宽,要有变形后的行索引、需要转到列索引的列,以及行列索引对应的某个特征(Grade)的值。
df. pivot( index= 'Name' , columns= 'Subject' , values= 'Grade' )
Subject Chinese Math Name San Zhang 80 75 Si Li 90 85
df. pivot( index= 'Name' , columns= 'Subject' , values= 'Grade' )
Subject Chinese Math Name San Zhang 80 75 Si Li 90 85
df = pd. DataFrame( { 'Class' : [ 1 , 1 , 2 , 2 , 1 , 1 , 2 , 2 ] ,
'Name' : [ 'San Zhang' , 'San Zhang' , 'Si Li' , 'Si Li' ,
'San Zhang' , 'San Zhang' , 'Si Li' , 'Si Li' ] ,
'Examination' : [ 'Mid' , 'Final' , 'Mid' , 'Final' ,
'Mid' , 'Final' , 'Mid' , 'Final' ] ,
'Subject' : [ 'Chinese' , 'Chinese' , 'Chinese' , 'Chinese' ,
'Math' , 'Math' , 'Math' , 'Math' ] ,
'Grade' : [ 80 , 75 , 85 , 65 , 90 , 85 , 92 , 88 ] ,
'rank' : [ 10 , 15 , 21 , 15 , 20 , 7 , 6 , 2 ] } )
pivot_multi = df. pivot( index = [ 'Class' , 'Name' ] ,
columns = [ 'Subject' , 'Examination' ] ,
values = [ 'Grade' , 'rank' ] )
pivot_multi
Grade rank Subject Chinese Math Chinese Math Examination Mid Final Mid Final Mid Final Mid Final Class Name 1 San Zhang 80 75 90 85 10 15 20 7 2 Si Li 85 65 92 88 21 15 6 2
2. pivot_table
pivot_table的使用是用于解决pivot只能对唯一性做变形。这个可以通过聚合将行列组合的多个值变为一个值。
df = pd. DataFrame( { 'Name' : [ 'San Zhang' , 'San Zhang' ,
'San Zhang' , 'San Zhang' ,
'Si Li' , 'Si Li' , 'Si Li' , 'Si Li' ] ,
'Subject' : [ 'Chinese' , 'Chinese' , 'Math' , 'Math' ,
'Chinese' , 'Chinese' , 'Math' , 'Math' ] ,
'Grade' : [ 80 , 90 , 100 , 90 , 70 , 80 , 85 , 95 ] } )
df
Name Subject Grade 0 San Zhang Chinese 80 1 San Zhang Chinese 90 2 San Zhang Math 100 3 San Zhang Math 90 4 Si Li Chinese 70 5 Si Li Chinese 80 6 Si Li Math 85 7 Si Li Math 95
df. pivot_table( index = 'Name' ,
columns= 'Subject' ,
values = 'Grade' ,
aggfunc = 'mean' )
Subject Chinese Math Name San Zhang 85 95 Si Li 75 90
pivot_table还具有边际汇总,利用margins=True来实现。也就是将每一行再执行一次aggfunc
df. pivot_table( index = 'Name' ,
columns= 'Subject' ,
values = 'Grade' ,
aggfunc = 'median' ,
margins = True )
Subject Chinese Math All Name San Zhang 85 95.0 90.0 Si Li 75 90.0 82.5 All 80 92.5 87.5
总体的汇总为新表中四个元素的平均值.如果是sum或者median,那么sum或者median的是原表的四个值
3.melt
melt可以理解为pivot的相应的逆操作,把宽表转为长表。
df = pd. DataFrame( { 'Class' : [ 1 , 2 ] ,
'Name' : [ 'San Zhang' , 'Si Li' ] ,
'Chinese' : [ 80 , 90 ] ,
'Math' : [ 80 , 75 ] } )
df_melted = df. melt( id_vars= [ 'Class' , 'Name' ] ,
value_vars= [ 'Chinese' , 'Math' ] ,
var_name= 'Subject' ,
value_name= 'Grade' )
df_melted
Class Name Subject Grade 0 1 San Zhang Chinese 80 1 2 Si Li Chinese 90 2 1 San Zhang Math 80 3 2 Si Li Math 75
df_unmelted = df_melted. pivot( index = [ 'Class' , 'Name' ] ,
columns= 'Subject' ,
values= 'Grade' )
df_unmelted
Subject Chinese Math Class Name 1 San Zhang 80 80 2 Si Li 90 75
df_unmelted. reset_index( ) . rename_axis( columns= { 'Subject' : '' } )
Class Name Chinese Math 0 1 San Zhang 80 80 1 2 Si Li 90 75
4.wide_to_long
二、索引的变形
1.stack与unstack
将行列索引进行交换,维度会发生变化,这就属于变形。
df = pd. DataFrame( np. ones( ( 4 , 2 ) ) ,
index = pd. Index( [ ( 'A' , 'cat' , 'big' ) ,
( 'A' , 'dog' , 'small' ) ,
( 'B' , 'cat' , 'big' ) ,
( 'B' , 'dog' , 'small' ) ] ) ,
columns= [ 'col_1' , 'col_2' ] )
df. unstack( )
col_1 col_2 big small big small A cat 1.0 NaN 1.0 NaN dog NaN 1.0 NaN 1.0 B cat 1.0 NaN 1.0 NaN dog NaN 1.0 NaN 1.0
df. unstack( [ 0 , 2 ] )
col_1 col_2 A B A B big small big small big small big small cat 1.0 NaN 1.0 NaN 1.0 NaN 1.0 NaN dog NaN 1.0 NaN 1.0 NaN 1.0 NaN 1.0
三、其他变形函数
1.crosstab
对于crosstab,感觉还是用pivot_table方便。
df = pd. read_csv( 'data/learn_pandas.csv' )
df. pivot_table( index = 'School' ,
columns = 'Transfer' ,
values = 'Name' ,
aggfunc = 'count' )
Transfer N Y School Fudan University 38.0 1.0 Peking University 28.0 2.0 Shanghai Jiao Tong University 53.0 NaN Tsinghua University 62.0 4.0
2.explode
df_ex = pd. DataFrame( { 'A' : [ [ 1 , 2 ] ,
'my_str' ,
{ 1 , 2 } ,
pd. Series( [ 3 , 4 ] ) ] ,
'B' : 1 } )
df_ex
A B 0 [1, 2] 1 1 my_str 1 2 {1, 2} 1 3 0 3 1 4 dtype: int64 1
df_ex. explode( 'A' )
A B 0 1 1 0 2 1 1 my_str 1 2 {1, 2} 1 3 3 1 3 4 1
pd. get_dummies( df. Grade) . head( )
Freshman Junior Senior Sophomore 0 1 0 0 0 1 1 0 0 0 2 0 0 1 0 3 0 0 0 1 4 0 0 0 1
四、练习
1.EX1:美国非法药物数据集
df = pd. read_csv( 'data/drugs.csv' ) . sort_values( [ 'State' , 'COUNTY' , 'SubstanceName' ] , ignore_index= True )
df. head( )
YYYY State COUNTY SubstanceName DrugReports 0 2011 KY ADAIR Buprenorphine 3 1 2012 KY ADAIR Buprenorphine 5 2 2013 KY ADAIR Buprenorphine 4 3 2014 KY ADAIR Buprenorphine 27 4 2015 KY ADAIR Buprenorphine 5
res = df. pivot( index= [ 'State' , 'COUNTY' , 'SubstanceName' ] ,
columns= 'YYYY' ,
values= 'DrugReports' )
res = res. reset_index( ) . rename_axis( columns= { 'YYYY' : '' } )
res. head( )
转换数据格式这里直接想到前面的pivot方法。(虽然想到了,但是应用起来还是不熟练,不得不返回去再看一遍)。而且还包含着要去对索引层名字的修改与重置。
State COUNTY SubstanceName 2010 2011 2012 2013 2014 2015 2016 2017 0 KY ADAIR Buprenorphine NaN 3.0 5.0 4.0 27.0 5.0 7.0 10.0 1 KY ADAIR Codeine NaN NaN 1.0 NaN NaN NaN NaN 1.0 2 KY ADAIR Fentanyl NaN NaN 1.0 NaN NaN NaN NaN NaN 3 KY ADAIR Heroin NaN NaN 1.0 2.0 NaN 1.0 NaN 2.0 4 KY ADAIR Hydrocodone 6.0 9.0 10.0 10.0 9.0 7.0 11.0 3.0
res_melted = res. melt( id_vars = [ 'State' , 'COUNTY' , 'SubstanceName' ] ,
value_vars = res. columns[ - 8 : ] ,
var_name = 'YYYY' ,
value_name = 'DrugReports' ) . dropna(
subset= [ 'DrugReports' ] )
res_melted. head( )
State COUNTY SubstanceName YYYY DrugReports 4 KY ADAIR Hydrocodone 2010 6.0 6 KY ADAIR Methadone 2010 1.0 13 KY ALLEN Hydrocodone 2010 10.0 15 KY ALLEN Methadone 2010 4.0 17 KY ALLEN Oxycodone 2010 15.0
res_melted = res_melted[ df. columns] . sort_values( [
'State' , 'COUNTY' , 'SubstanceName' ] , ignore_index= True
) . astype( { 'YYYY' : 'int64' , 'DrugReports' : 'int64' } )
res_melted. head( )
YYYY State COUNTY SubstanceName DrugReports 0 2011 KY ADAIR Buprenorphine 3 1 2012 KY ADAIR Buprenorphine 5 2 2013 KY ADAIR Buprenorphine 4 3 2014 KY ADAIR Buprenorphine 27 4 2015 KY ADAIR Buprenorphine 5
恢复的过程也就是对melt方法的应用。(这里对melt的参数运用还没有理解到位,于是就只能照猫画虎将实例的参数设置对应着搬过来用)。因为第1问的DrugReports格式被改为了float64,所以答案用astype方法将类型重置为int64.