- 导入csv文件
- 填充空值
- 区间分类
- 透视表
- 透视结果做百分比
- 指定列按category排序
- 导出excel
- 对导出文件的特定列设置百分比格式
import pandas as pd
import interval
import numpy as np
def data_divide(x):
if x in interval.Interval(0,18,upper_closed=False):
return "0)不满18岁"
elif x in interval.Interval(18,30,upper_closed=False):
return "1)30岁以下"
elif x in interval.Interval(30,40,upper_closed=False):
return "2)40岁以下"
elif x in interval.Interval(40,50,upper_closed=False):
return "3)50岁以下"
elif x in interval.Interval(50,100,upper_closed=False):
return "4)50岁以上"
else:
return "5)100+"
pathstart='/Users/kangyongqing/Documents/kangyq/202303/分析模版/教师档案分析/'
path1=pathstart+'20230306_105852.csv'
df=pd.read_csv(path1)
df['地域'].fillna('未知',inplace=True)
# print(df.isnull().sum())
# toushi=df.pivot_table(index='学历',values='教师id',aggfunc='count',margins=True,margins_name='总计')
# print(toushi)
tongji=df.groupby('学历')['教师id'].count()
print('在岗总教师人数:',df.shape[0])
print("学历未知的人数:",tongji['未知'])
print("学历未知占比:",round(tongji['未知']/df.shape[0],2))
tongji1=df['地域'].isnull().sum()
print("地域未知人数:",tongji1)
print("地域未知占比:",round(tongji['未知']/df.shape[0],2))
#学历分布
xueli_ts=df.pivot_table(index='学历',values='教师id',aggfunc='count',margins=True,margins_name='总计')
xueli_ts['占比']=np.round(xueli_ts['教师id']/xueli_ts.loc['总计','教师id'],2)
xueli_ts.reset_index(inplace=True)
xueli_ts['学历']=xueli_ts['学历'].astype('category')
list_xueli=['未知','专科','本科','硕士','博士','总计']
xueli_ts['学历'].cat.reorder_categories(list_xueli,inplace=True)
xueli_ts.sort_values('学历',inplace=True)
xueli_ts.set_index('学历',inplace=True)
print(xueli_ts)
#性别分布
xingbie_ts=df.pivot_table(index='性别',values='教师id',aggfunc='count',margins=True,margins_name='总计')
xingbie_ts['占比']=np.round(xingbie_ts['教师id']/xingbie_ts.loc['总计','教师id'],2)
print(xingbie_ts)
#年龄分布
df['年龄区间']=df['年龄'].apply(data_divide)
nianling_ts=df.pivot_table(index='年龄区间',values='教师id',aggfunc='count',margins=True,margins_name='总计')
nianling_ts['占比']=np.round(nianling_ts['教师id']/nianling_ts.loc['总计','教师id'],2)
print(nianling_ts)
#全兼职分布
quanjianzhi_ts=df.pivot_table(index='是否全职',values='教师id',aggfunc='count',margins=True,margins_name='总计')
quanjianzhi_ts['占比']=np.round(quanjianzhi_ts['教师id']/quanjianzhi_ts.loc['总计','教师id'],2)
print(quanjianzhi_ts)
#开放状态分布
kaifang_ts=df.pivot_table(index='开放状态',values='教师id',aggfunc='count',margins=True,margins_name='总计')
kaifang_ts['占比']=np.round(kaifang_ts['教师id']/kaifang_ts.loc['总计','教师id'],2)
print(kaifang_ts)
#地域分布
df['地域截取']=df['地域'].str[0:2]
diyu_ts=df.pivot_table(index='地域截取',values='教师id',aggfunc='count',margins=True,margins_name='总计')
diyu_ts['占比']=np.round(diyu_ts['教师id']/diyu_ts.loc['总计','教师id'],2)
print(diyu_ts)
writer=pd.ExcelWriter(pathstart+'教师档案分析.xlsx')
xueli_ts.to_excel(writer,sheet_name='学历')
xingbie_ts.to_excel(writer,sheet_name='性别')
nianling_ts.to_excel(writer,sheet_name='年龄')
quanjianzhi_ts.to_excel(writer,sheet_name='全兼职')
kaifang_ts.to_excel(writer,sheet_name='开放状态')
diyu_ts.to_excel(writer,sheet_name='地域')
formatobj=writer.book.add_format({'num_format':'0.00%'})
for sheet in writer.sheets:
writer.book.sheetnames[sheet].set_column('C:C',cell_format=formatobj)
writer.save()