1、数据生成
# Faker 是 Python 中功能强大的 Mock 数据生成库。开发和测试项目时,经常需要虚拟用户、订单、地址等虚拟数据;使用 Faker 可以快速生成这些 Mock 数据,从而简化开发和测试过程。
import pandas as pd
import numpy as np
import calendar
import random
from faker import Faker
from faker.providers import BaseProvider, internet
# Custom enumerations, exposed as extra methods on the shared Faker instance.
fake = Faker()


class MyProvider(BaseProvider):
    """Faker provider that picks values from small fixed category lists.

    Each method returns one element of its list, chosen uniformly at random
    with the standard-library ``random`` module.
    """

    # The option lists never change, so they are defined once on the class
    # instead of being rebuilt on every call.
    _CITY_LEVELS = ("一线", "二线", "三线", "四线+城市")
    _POSTS = ('数据分析师', '算法工程师', '产品经理', '数据开发',
              '前端开发', '用户运营', '设计师', '其他')
    _GENDERS = ('F', 'M')

    def myCityLevel(self):
        """Return a randomly chosen city tier."""
        return random.choice(self._CITY_LEVELS)

    def myPost(self):
        """Return a randomly chosen job title."""
        return random.choice(self._POSTS)

    def myGender(self):
        """Return 'F' or 'M', chosen at random."""
        return random.choice(self._GENDERS)


fake.add_provider(MyProvider)
# Build the fake data set: 1000 records, generated one row at a time.
# Each tuple is evaluated left to right, so the generator calls happen in the
# order month, city level, gender, post, income, cost for every row.
records = [
    (
        user_id,
        # int() turns the value from fake.month() into an index for month_abbr
        calendar.month_abbr[int(fake.month())],
        fake.myCityLevel(),
        fake.myGender(),
        fake.myPost(),
        fake.random_int(min=3000, max=100000),
        fake.random_int(min=600, max=80000),
    )
    for user_id in range(1000)
]

# Transpose the rows into one list per column.
uid, month, cityLevel, gender, post, income, cost = (
    list(column) for column in zip(*records)
)

raw_data = pd.DataFrame(
    {
        'uid': uid,
        'month': month,
        'cityLevel': cityLevel,
        'gender': gender,
        'post': post,
        'income': income,
        'cost': cost,
    }
)
raw_data.head()
2、数据处理
# Build the edge table in the shape: category 1 - category 2 - values.
# Every pair of adjacent dimensions (month->cityLevel, cityLevel->gender,
# gender->post) is aggregated separately and the results are stacked.
# Indexing raw_data first also fails early if one of the columns is missing.
lis = raw_data[['month', 'cityLevel', 'gender', 'post']].columns.tolist()  # ['month', 'cityLevel', 'gender', 'post']
lis1 = lis[:-1]  # ['month', 'cityLevel', 'gender']
lis2 = lis[1:]  # ['cityLevel', 'gender', 'post']

frames = []
for pair in zip(lis1, lis2):
    # String aggregator names replace np.sum, which newer pandas deprecates
    # as an aggfunc; the computed sums are the same.
    data_ = raw_data.pivot_table(
        index=list(pair),
        values=['uid', 'income', 'cost'],
        aggfunc={'uid': 'count', 'income': 'sum', 'cost': 'sum'},
    ).reset_index()
    # Fix the column order explicitly before renaming, so that
    # 2 = cost sum, 3 = income sum, 4 = uid count regardless of how
    # pivot_table happens to order its value columns.
    data_ = data_[[*pair, 'cost', 'income', 'uid']]
    data_.columns = [0, 1, 2, 3, 4]
    frames.append(data_)

# Concatenate once instead of growing a frame inside the loop; this also
# avoids concatenating onto an empty DataFrame.
df = pd.concat(frames, axis=0, ignore_index=True)
df
3、事件流分析-桑基图
from pyecharts import options as opts
from pyecharts.charts import Sankey
def plot_sankey(df, orient=None, title="岗位人数桑基图"):
    '''
    Build a Sankey diagram from an edge table and return its notebook output.

    df: data set whose first three columns are, by position, category 1
        (link source), category 2 (link target) and the link value.
        Column labels are not used.
    orient: 'vertical' places the node labels on top; any other value places
        them on the right. The value is forwarded to Sankey.add unchanged
        (the original note documents 'horizontal' as the default direction).
    title: chart title; defaults to the title that was previously hard-coded.

    Returns the result of render_notebook() for the chart.
    '''
    position = 'top' if orient == 'vertical' else 'right'

    # Plain Python rows, read by position for both nodes and links.
    rows = df.values.tolist()

    # Nodes: every distinct name appearing as a source or a target.
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # node list is identical on every run (iterating a set of strings can
    # give a different order from one interpreter run to the next).
    names = dict.fromkeys(
        [row[0] for row in rows] + [row[1] for row in rows]
    )
    nodes = [{'name': name} for name in names]

    # Links: one flow per row of the edge table.
    links = [
        {'source': row[0], 'target': row[1], 'value': row[2]}
        for row in rows
    ]

    # Draw the chart.
    sk = (
        Sankey(init_opts=opts.InitOpts(width="1200px", height="800px", theme='westeros'))
        .add(
            "",
            nodes=nodes,
            links=links,
            orient=orient,
            layout_iterations=0,
            linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
            label_opts=opts.LabelOpts(position=position),
        )
        .set_global_opts(title_opts=opts.TitleOpts(title=title, pos_left='40%'))
    )
    return sk.render_notebook()
# Plot head-count flows only: keep the two category columns (0 and 1) plus
# column 4, which the original note identifies as the head count.
df_sk = df.loc[:, [0, 1, 4]]
plot_sankey(df_sk)