数据拆分(整个文件级别的操作)
标记数据拆分组
import pandas as pd
df1 = pd.read_csv('123.csv',encoding = 'utf-8')
‘’’
df.groupby(
by:用于分组的变量名/函数
axis = 0
level = None :相应的轴存在多层索引时,指定用于分组的级别
as_index = True :在结果中将组标签作为索引
sort = True :结果是否按照分组关键词进行排序
)# 生成的是分组标引标记,不是DF,可以映射到原DF 上进行相关操作
数据分组之后,许多数据处理/分析/绘图命令都可以在各组间单独执行
‘’’
df1_group1 = df1.groupby('评分')
print(df1_group1)#本质上是一个字典结构
print(df1_group1.groups)
print(df1_group1.describe())
df1_group2 = df1.groupby(['评分','地址'])
print(df1_group2.describe())
print(df1_group2.mean())
基于拆分进行筛选
df1_group2.get_group((5.0,'丰台')).mean() #返回series
print(df1_group2['评分'].describe())
print(df1_group2['名次'].max())
分组汇总
使用aggregate 可以简写为agg,可用axis指定汇总维度
‘’’
常见的汇总函数:count(),size(),sum(),mean(),median(),min(),max()
std(),var(),skew(),kurt(),quantile(),apply(),cov(),corr()
‘’’
print('使用aggregate 可以简写为agg,可用axis指定汇总维度')
print(df1.agg('median'))
print(df1_group1.地址.agg('count'))
print(df1_group1.agg(['count','mean','median']))
print(df1.agg(['count','mean','median']))
import numpy as np
print(df1.评分.agg(np.sum)) #引用非内置函数
print(df1_group1.评分.agg(np.sum))
def mymean(x):
return x.mean()
print(df1_group1.名次.agg(mymean))
def mydouble(x):
return x.sum() * 2
print(df1_group1.评分.agg(mydouble))
其他汇总方法
‘’’
df.pivot_table()
pd.crosstab()
‘’’
print(pd.crosstab(df1.地址,df1.评分))
长宽数据格式的相互转换(基于多重索引)
转换为最简格式
‘’’
df.stack(
level = -1 :需要处理的索引级别,默认是全部int/string/list
dropna = True :是否删除为缺失值的行
)#转换后的结果可能为series
‘’’
df3 = pd.read_excel('123.xlsx',index_col = [0,2])
print(df3)
df3_stack = df3.stack()
print(df3_stack)
长宽型互转
‘’’
df.unstack(
level = :需要处理的索引级别,默认全部 int/string/list
fill_value:用于填充缺失值得数据
)
‘’’
df3_unstack = df3_stack.unstack(1)
print(df3_unstack)
df3_unstack = df3_stack.unstack([1,2])
print(df3_unstack)
数据转至 df.T
print(df3.T)
长宽互转的其他命令
‘’’
df.melt()
df.pivot()
df.povot_table()
‘’’
数据纵向合并
‘’’
df.append(
other:希望添加的DF/series/字典/上述对象的列表
使用列表方式,就可以一次合并多个新对象
ignore_index = False:添加时是否忽略索引
verify_integrity = False :是否检查索引值的唯一性,有重复时报错
)
‘’’
dfa1 = pd.read_excel('123.xlsx',sheet_name = 1)
dfa2 = pd.read_excel('123.xlsx',sheet_name = 2)
dfa3 = pd.read_excel('123.xlsx',sheet_name = 3)
dfa = dfa1.append([dfa2,dfa3[10:]],ignore_index = True)#避免了索引重复问题
print(dfa)
数据横向索引
‘’’
df.merge()
‘’’
dfa1 = pd.read_excel('123.xlsx',sheet_name = 1)
dfa2 = pd.read_excel('123.xlsx',sheet_name = 2)
print(pd.merge(dfa1,dfa2)) #自己寻找相同变量,进行内连接
print(pd.merge(dfa1,dfa2[:10]))#只有10条
dfm_i1 = dfa1.set_index('店家')
dfm_i2 = dfa2.set_index('店家')
print(pd.merge(dfm_i1,dfm_i2,left_index = True,right_index = True))
dfm_i2.reset_index(inplace = True)
print(pd.merge(dfm_i1,dfm_i2,left_index = True,right_on = '店家'))
dfm_i1.reset_index(inplace = True)
print(pd.merge(dfm_i1,dfm_i2,left_on = '店家',right_on = '店家'))
Concat 命令 典型的pandas命令
‘’’ 同时支持横向和纵向合并’’’
‘’’
pd.concat(
objs:需要合并的对象,列表形式提供
axis = 0 对行还是列合并
(0:index;1:colums)
join = ‘outer’ :对另一个轴向索引值如何进行处理
(‘inner’,‘outer’)
ignore_index = False
keys = None: 为不同数据源的提供合并后的索引值
verify_integrity = False
copy = True
)
‘’’
dfa1_3 = pd.concat([dfa1,dfa3]) #纵向合并
print(dfa1_3)
dfa1_2 = pd.concat([dfa1,dfa2]) #纵向合并
print(dfa1_2)
dfa1_3 = pd.concat([dfa1,dfa3],keys=['cat1','cat2']) #纵向合并
print(dfa1_3)
dfa1_2 = pd.concat([dfa1,dfa2],axis = 1) #横向合并 是基于索引的拼接有重复的‘店家列’
print(dfa1_2)
''' 正确做法应该先建索引'''
dfm_i11 = dfa1.set_index('店家')
dfm_i22 = dfa2.set_index('店家')
dfa11_22 = pd.concat([dfm_i11,dfm_i22],axis = 1) #
print(dfa11_22)