目录
合并 concat 、merge
import pandas as pd
import numpy as np
df1 = pd.DataFrame(data=np.random.randint(0,100,size=(5,3)),columns=['A','B','C'])
df1
df2 = pd.DataFrame(data=np.random.randint(0,100,size=(5,3)),columns=['A','D','C'])
df2
# 合并列 根据行索引对齐
pd.concat((df1,df2),axis=1)
# 默认是合并行 根据列索引对齐
pd.concat((df1,df2))
df1 = pd.DataFrame({'employee':['Bob','Jake','Lisa'],'group':['Accounting','Enginerring','Enginerring']})
df1
df2 = pd.DataFrame({'employee':['Lisa','Bob','Jake'],'hire_date':[2004,2008,2012]})
df2
# 合并列,按 employee 对齐
pd.merge(df1,df2,on='employee')
df3 = pd.DataFrame({'employee':['Lisa','Jake'],'group':['Accounting','Engineering'],'hire_date':[2004,2016]})
df3
df4 = pd.DataFrame({'group':['Accounting','Engineering','Engineering'],'supervisor':['Carly','Guido','Steve']})
df4
# 没有指定对齐的列,会根据相同的列名进行对齐
pd.merge(df3,df4)
df5 = pd.DataFrame({'animal':['pig','monkey','cat','dog'],'number':[10,15,20,8]})
df5
df6 = pd.DataFrame({'pet':['dog','cat','bird'],'people':['Jake','Rose','Joe']})
df6
# 两个数据中要对齐的列名不同的情况
pd.merge(df5,df6,left_on='animal',right_on='pet')
df7 = pd.DataFrame({'date':['2022-01-01','2022-01-02','2022-01-03'],'val':[12,34,5]})
df7.set_index('date',inplace=True)
df7
df8 = pd.DataFrame({'date_00':['2022-02-01','2022-01-02','2022-01-03'],'val_00':[2,39,90]})
df8.set_index('date_00',inplace=True)
df8
# 两个数据按索引对齐
pd.merge(df7,df8,left_index=True,right_index=True)
替换 replace
# 替换操作
df = pd.DataFrame(data=np.random.randint(0,100,size=(5,6)))
df
# 将 数值为92 的值替换为 Nity-Two
df.replace(to_replace=92,value='Nity-Two')
# 将 数值为92 的值替换为 Nine_two
df.replace(to_replace={92:'Nine_two'})
df1 = pd.DataFrame({'A':[0,3,4,0,6],'B':[1,1,2,2,6]})
df1
# 将 A 列中的0 替换为 -2
df1['A'].replace(0,-2)
映射 map
# 映射操作 map只能用被Series调用
dic = {
'name':['张三','李四','张三'],
'salary':[15000,20000,15000]
}
df = pd.DataFrame(data=dic)
df
dic = {
'张三':'tom',
'李四':'jack'
}
df['e_name'] = df['name'].map(dic)
df
# 运算工具 map只能用被Series调用
# 超过3000部分的钱缴纳50%的税,计算每个人的税后薪资
def after_sal(s):
return s-(s-3000)*0.5
df['after_sal']=df['salary'].map(after_sal)
df
打乱(随机抽样) take
# 随机抽样 take
df = pd.DataFrame(data=np.random.randint(0,100,size=(100,3)),columns=['A','B','C'])
df
# 生成乱序的随机序列
np.random.permutation(10)
# out: array([9, 1, 3, 8, 0, 5, 4, 2, 7, 6])
# 将原始数据打乱
df.take(np.random.permutation(3),axis=1).take(np.random.permutation(100),axis=0)
分组 group
# 分类 group
df = pd.DataFrame({'item':['Apple','Banana','Orange','Banana','Orange','Apple'],
'price':[4,3,3,2.5,4,2],
'color':['red','yellow','yellow','green','green','green'],
'weight':[12,20,50,30,20,44]})
df
# 查看详细的分组情况
df.groupby(by='item').groups
# 计算每一种水果的平均价格
df.groupby(by='item')['price'].mean()
# 计算每一种颜色对应水果的平均重量
df.groupby(by='color')['weight'].mean()
# 将计算出的平均重量汇总到源数据
dic = df.groupby(by='color')['weight'].mean().to_dict()
df['mean_w'] = df['color'].map(dic)
df
def my_mean(s):
m_sum = 0
for i in s:
m_sum += i
return m_sum/len(s)
# 每组的数据各自计算并将结果打到该组的每行数据上
df.groupby(by='item')['price'].transform(my_mean)