import pandas as pd
def getEmpDataFrame(num, *, young_age=35, new_hire_years=2, short_tenure_years=5):
    """Build a reproducible synthetic employee table.

    Every column is derived deterministically from the row position, so the
    same ``num`` always yields the same frame.

    NOTE(review): in the version of this file that was reviewed, the four
    comparisons that use ``young_age``, ``new_hire_years`` and
    ``short_tenure_years`` were truncated (only the left operand survived).
    The default values below, and the exact shape of the first attrition
    rule, are reconstructions guided by the original comments -- confirm
    them against the author's intent.

    Args:
        num: Number of employees (rows) to generate.
        young_age: Ages strictly below this count as "young" when assigning
            the education code.
        new_hire_years: A university graduate (``edu == 2``) with fewer
            tenure years than this is flagged as an attrition risk.
        short_tenure_years: An employee with ``edu == 1``, older than 50 and
            with fewer tenure years than this is flagged as an attrition
            risk.

    Returns:
        pandas.DataFrame indexed by zero-padded employee number, with
        columns ``sex``, ``age``, ``lvl``, ``yrs``, ``edu``, ``sal``, ``ris``.
    """
    # Employee numbers: 1-based, zero-padded to the width of ``num``.
    id_width = len(str(num))
    emp = [str(i + 1).zfill(id_width) for i in range(num)]

    # Sex: the first ~70% of rows are 1, the rest 0 (original comment:
    # more men than women).
    male_count = int(round(num * 0.7))
    sex = [1] * male_count + [0] * (num - male_count)

    # Age: cycles through 33 consecutive values, starting at 28 for sex == 1
    # and 22 for sex == 0.
    min_age = {1: 28, 0: 22}
    age = [min_age[s] + i % 33 for i, s in enumerate(sex)]

    # Level: rarer the higher it is; depends only on the 1-based position.
    # The first matching divisor wins, so 33 takes precedence over 23, etc.
    def _level(position):
        for divisor, level in ((33, 5), (23, 4), (13, 3), (3, 2)):
            if position % divisor == 0:
                return level
        return 1

    lvl = [_level(i + 1) for i in range(num)]

    # Tenure: four years per level, capped by the years elapsed since the
    # minimum age for that sex.
    yrs = [min(l * 4, a - min_age[s]) for s, a, l in zip(sex, age, lvl)]

    # Education code: levels 4-5 get 4 (young; original comment: doctorate)
    # or 3 (older; master's); lower levels get 2 (young; university) or
    # 1 (older; junior college).
    edu = []
    for a, l in zip(age, lvl):
        is_young = a < young_age
        if l >= 4:
            edu.append(4 if is_young else 3)
        else:
            edu.append(2 if is_young else 1)

    # Salary: rule-based sum plus a position-dependent wobble (i % 7),
    # scaled up by 1% per year of tenure.
    sal = [
        round(
            (3000
             + yrs[i] * 200 + edu[i] * 1000 + (lvl[i] - 1) * 3000 + sex[i] * 1000
             + i % 7 * 300)
            * (1 + yrs[i] / 100),
            3,
        )
        for i in range(num)
    ]

    # Attrition risk (1 = flagged). Original comment: young university
    # graduates leave easily; low-education, older, short-tenure staff are
    # easily let go.
    ris = [
        int(
            (y < new_hire_years and e == 2)
            or (e == 1 and a > 50 and y < short_tenure_years)
        )
        for a, y, e in zip(age, yrs, edu)
    ]

    return pd.DataFrame(
        {
            "sex": sex,
            "age": age,
            "lvl": lvl,
            "yrs": yrs,
            "edu": edu,
            "sal": sal,
            "ris": ris,
        },
        index=emp,
    )
# print(getEmpDataFrame(60))