1.数据整合
import pandas as pd
df1 = pd.DataFrame({'id':[1,2,3,4,5],'name':['张三','李四','王五','丁一','赵二'],'age':[27,25,16,34,16],'gender':['男','女','女','男','男']})
df2 = pd.DataFrame({'Id':[1,2,2,4,5,6,7],'kemu':['科目1','科目1','科目1','科目2','科目3','科目2','科目2'],'score':[83,81,67,92,86,79,99]})
df3 = pd.DataFrame({'id':[1,3,5],'name':['张三','李四','王五'],'income':[8000,40001,20000]})
merge1 = pd.concat([df1,df2],axis = 1,join = 'inner')
merge2 = pd.concat([df1,df2],axis = 1,join = 'outer')
import os
os.chdir(r'C:\Users\wangyaqian\Desktop\Excel学习包')
order1 = pd.read_excel('1.xlsx',sheet_name = 0,header = 0,encoding='gbk')
order2 = pd.read_excel('1.xlsx',sheet_name = 2,header = 0,encoding='gbk')
order3 = pd.read_excel('1.xlsx',sheet_name = 3,header = 0,encoding='gbk')
data = pd.concat([order1,order2,order3],axis = 0,ignore_index = True)
data.reset_index(drop = True, inplace = True)
merge1 = pd.merge(left = df1, right = df2, how = 'right', left_on = 'id',right_on = 'Id')
merge2 = pd.merge(left = df1, right = df2, how = 'left', left_on = 'id',right_on = 'Id')
merge3 = pd.merge(left = df1, right = df2, how = 'outer', left_on = 'id',right_on = 'Id')
merge4 = pd.merge(left = df1, right = df2, how = 'inner', left_on = 'id',right_on = 'Id')
2.层次化索引
3.排序
import pandas as pd
import numpy as np
import os
os.chdir(r'C:\Users\wangyaqian\Desktop\Excel学习包')
order = pd.read_excel('test.xlsx',header = 0,sheet_name = 0,\
encoding = 'utf-8')
order.isnull()
np.sum(order.isnull(),axis = 0)
order.sort_values('语文',ascending=True,\
na_position='last',inplace=True)
order.reset_index(drop = True, inplace = True)
order.sort_values('语文','数学')
4.分类聚合
import pandas as pd
import numpy as np
import os
os.chdir(r'C:\Users\wangyaqian\Desktop\Excel学习包')
order = pd.read_excel('test.xlsx',header = 0,sheet_name = 0,\
encoding = 'utf-8')
var_name=['语文','数学','英语']
np.min(order[var_name],axis=0)
order[var_name].quantile([0,0.2,0.5,1])
order[var_name].describe()
order.describe(include = ['object'])
groupe = order.groupby('语文')
print(type(groupe))
groupe.count()
groupe.cumcount()
grouped = order[var_name].groupby(by = ['语文','数学'])
b = grouped.median().loc[([95,86]),:]
import pandas as pd
import numpy as np
import os
os.chdir(r'C:\Users\wangyaqian\Desktop\Excel学习包')
order = pd.read_excel('test.xlsx',header = 0,sheet_name = 0,\
encoding = 'utf-8')
var_name=['语文','数学','英语']
grouped = order[var_name].groupby('语文')
a = grouped.agg([np.mean,np.sum])
b = grouped.mean()
c = grouped.agg([np.mean,np.sum]).loc[[60,77],['数学']]
d = grouped.agg({'数学':np.mean,'英语':np.sum})
e = grouped.agg({'数学':[np.mean,np.median],'英语':np.sum})
def DoubleSUM(data):
s = data.sum()*2
return s
f = grouped.agg({'数学':DoubleSUM})
g = order.agg({'数学':DoubleSUM})
h = order[var_name].apply(np.sum,axis = 0)
j = order[var_name].apply(lambda x:x[2]-x[1],axis = 1)