# 1 长宽表变形
# 01 长表变宽表pivot
# 02 不满足唯一条件,用pivot_table聚合
# 03 宽表变长表 melt
# 04 交叉类别 wide_to_long
# 2 索引的变形
# 01 unstack行→列 stack列→行
# 3 其他变形函数
# 01 crosstab
# 02 explode 对列元素纵向展开 索引不变
# 03 get_dummies 特征构建 类别特征→指示变量
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 3 10:08:43 2022
@author: lenovo
"""
import pandas as pd
import numpy as np
# 1 长宽表变形
# 长表:把性别存储在某一列中,就是关于性别的长表
pd.DataFrame({'Gender':['F','F','M','M'],'Height':[163,160,175,180]})
# 宽表:把性别作为列名,列中元素是某一其他的相关特征数值,就是关于性别的宽表
pd.DataFrame({'Height:F':[163,160],'Height:M':[175,180]})
# 01 长表变宽表pivot
# index 变形后的索引,columns 需要转到列索引的列,values 对应的数值
# index和columns对应两个列的行组合必须唯一
df = pd.DataFrame({'Class':[1,1,2,2],
'Name':['LQ','LQ','YY','YY'],
'Subject':['Chinese','Math','Chinese','Math'],
'Grade':[80,75,90,85]})
df.pivot(index='Name',
columns='Subject',
values='Grade')
df = pd.DataFrame({'Class':[1,1,2,2,1,1,2,2],
'Name':['LQ','LQ','YY','YY','LQ','LQ','YY','YY'],
'Examination':['mid','final','mid','final','mid','final','mid','final'],
'Subject':['Chinese','Chinese','Chinese','Chinese','Math','Math','Math','Math'],
'Grade':[80,75,85,65,90,85,92,88],
'rank':[10,15,21,15,20,7,6,2]})
pivot_multi = df.pivot(index=['Class','Name'],
columns=['Subject','Examination'],
values=['Grade','rank'])
# 02 不满足唯一条件,用pivot_table聚合
df = pd.DataFrame({'Name':['LQ','LQ','LQ','LQ','YY','YY','YY','YY'],
'Subject':['chinese','chinese','math','math','chinese','chinese','math','math'],
'Grade':[80,75,85,65,90,85,92,88]})
df.pivot_table(index='Name',
columns='Subject',
values='Grade',
aggfunc='mean')
df.pivot_table(index='Name',
columns='Subject',
values='Grade',
aggfunc=lambda x:x.mean())
df.pivot_table(index='Name',
columns='Subject',
values='Grade',
aggfunc='mean',
margins=True) # 边际汇总,聚合方式为aggfunc
# 03 宽表变长表 melt
df = pd.DataFrame({'Class':[1,2],
'Name':['LQ','YY'],
'Chinese':[80,90],
'Math':[80,75]})
df_melted = df.melt(id_vars=['Class','Name'],
value_vars=['Chinese','Math'],
var_name='Subject',
value_name='Grade')
df_unmelted = df_melted.pivot(index=['Class','Name'],
columns='Subject',
values='Grade')
df_unmelted = df_unmelted.reset_index().rename_axis(columns={'Subject':''})
df_unmelted.equals(df)
# 04 交叉类别 wide_to_long
df = pd.DataFrame({'Class':[1,2],'Name':['LQ','YY'],
'Chinese_mid':[80,75],'Math_mid':[90,85],
'Chinese_mid':[80,75],'Math_mid':[90,85]})
pd.wide_to_long(df,
stubnames=['Chinese','Math'], # 连字符前面的内容,成为新表数值列的列名
i=['Class','Name'], # 行索引
j='Examination', # 对连字符后面的内容重新命名,使之成为次级行索引
sep='_',
suffix='.+') # 正则后缀?
res = pivot_multi.copy()
res.columns = res.columns.map(lambda x:'_'.join(x))
res = res.reset_index()
res = pd.wide_to_long(res,
stubnames=['Grade','rank'],
i=['Class','Name'],
j='Subject_Examination',
sep='_',
suffix='.+')
res = res.reset_index()
res[['Subject','Examination']] = res['Subject_Examination'].str.split('_',expand=True)
res = res[['Class','Name','Examination','Subject','Grade','rank']].sort_values('Subject')
res = res.reset_index(drop=True)
# 2 索引的变形
# 01 unstack行→列 stack列→行
# unstack 行索引转换为列索引
# unstack 必须保证 被转为列索引的行索引层 和被保留的行索引层 构成的组合唯一
df = pd.DataFrame(np.ones((4,2)),
index=pd.Index([('A','cat','big'),
('A','dog','small'),
('B','cat','big'),
('B','dog','small')]),
columns=['col_1','col_2'])
df.unstack() # 默认转化最内层,移动到列索引的最内层
df.unstack([0,2])
# stack 将列索引压入行索引
df = pd.DataFrame(np.ones((4,2)),
index=pd.Index([('A','cat','big'),
('A','dog','small'),
('B','cat','big'),
('B','dog','small')]),
columns=['index_1','index_2']).T
df.stack()
df.stack([1,2])
# 变形不会改变values个数,只是呈现形式发生改变;分组聚合改变values个数
# 3 其他变形函数
# 01 crosstab
# 不推荐使用 pivot_table速度更快
path = r'C:\Users\lenovo\Desktop\最近要用\pandas\joyful-pandas\data'
df = pd.read_csv('{}/learn_pandas.csv'.format(path))
pd.crosstab(index=df.School, columns=df.Transfer)
pd.crosstab(index=df.School, columns=df.Transfer,
values=[1]*df.shape[0], aggfunc='count')
df.pivot_table(index='School',columns='Transfer',
values='Name',aggfunc='count')
# 以上三个结果一样,但pd.crosstab传入具体的序列,df.pivot_table传入列名
pd.crosstab(index=df.School, columns=df.Transfer,
values=df.Height, aggfunc='mean')
# 02 explode 对列元素纵向展开 索引不变
df_ex = pd.DataFrame({'A':[[1,2],'str',{1,2},pd.Series([3,4])],
'B':1})
df_ex.shape
df_ex.iloc[3,1]
df_ex.explode('A')
# 03 get_dummies 特征构建 类别特征→指示变量
pd.get_dummies(df.Grade).head()
# Ex1 美国非法药物数据集
df = pd.read_csv('{}/drugs.csv'.format(path)).sort_values(['State','COUNTY','SubstanceName'],ignore_index=True)
df.head(6)
# 1.
df1 = df.pivot(index=['State','COUNTY','SubstanceName'],
columns='YYYY',
values='DrugReports').reset_index().rename_axis(columns={'YYYY':''})
# 2.
df2 = df1.melt(id_vars=['State','COUNTY','SubstanceName'],
value_vars=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017],
var_name='YYYY',
value_name='DrugReports').dropna(subset='DrugReports')
# value_vars=df1.columns[-8:]
df3 = df2[df.columns].sort_values(['State','COUNTY','SubstanceName'],ignore_index=True) # 重置索引为0123...
df4 = df3.astype({'YYYY':'int64','DrugReports':'int64'})
df4.equals(df)
#dropna(axis = 0, how = ‘any’, thresh = None, subset = None, inplace = False)
#axis 默认为零,当某行出现缺失值时,将行删除。
#how = ‘any’,只要有缺失值就丢弃该行/列;how = ‘all’,全部为缺失值才丢弃该行/列。
#thresh 当行列中非缺失值少于给定值时,丢弃该行/列。
#subset 丢弃子列中含有缺失值的行/列。
#inplace 默认False,为True时对原数据操作。
#对于Series,dropna返回一个仅含索引值和非空数据的Series,索引值仍保留删除之前的值。
# 3.1
df.pivot_table(index='YYYY',
columns='State',
values='DrugReports',
aggfunc='sum')
gb = df.groupby(['YYYY','State'])['DrugReports'].sum()
gb.unstack(1)
# res = df.groupby(['State', 'YYYY'])['DrugReports'].sum(1).to_frame().unstack(0).droplevel(0,axis=1)
# Ex2 特殊的 wide_to_long 方法
# 从功能上看,melt()方法应当属于wide_to_long()的一种特殊情况,即stubnames只有一类。
# 请使用wide_to_long()生成melt()一节中的df_melted。(提示:对列名增加适当的前缀)
df = pd.DataFrame({'Class':[1,2],'Name':['LQ','YY'],
'Chinese':[80,90],'Math':[80,75]})
df_melted = df.melt(id_vars=['Class','Name'],
value_vars=['Chinese','Math'],
var_name='Subject',
value_name='Grade')
df1 = df.copy()
df1.rename(columns={'Chinese':'Score_Chinese','Math':'Score_Math'},inplace=True)
df2 = pd.wide_to_long(df1,
stubnames=['Score'], # 连字符前面的内容,成为新表数值列的列名
i=['Class','Name'], # 行索引
j='Subject', # 对连字符后面的内容重新命名,使之成为次级行索引
sep='_',
suffix='.+')
df3 = df2.reset_index()