import numpy as np
def main():
    """Placeholder entry point; it performs no work and returns ``None``."""
    return None
def draw(data, label_list, whole, picture_name):
    """Plot per-category click ratios together with the visitor count.

    Each row of ``data`` is drawn as one scatter series on the left y-axis and
    ``whole`` is drawn as a black line on a twin right y-axis.  The x tick
    labels are read from the file ``class.56`` in the working directory (the
    third ``::###::``-separated field of every line).  The figure is saved to
    ``picture_name``, shown, and then closed.

    Args:
        data: Sequence of series; every series holds one y value per x
            position (same length as ``whole``).
        label_list: Legend label for each series in ``data``.
        whole: One value per x position, plotted as the line on the right
            axis.  Its length defines the number of x positions.
        picture_name: Output path handed to ``savefig``.

    Raises:
        OSError: If ``class.56`` cannot be opened.
        IndexError: If a line of ``class.56`` has fewer than three fields or
            ``label_list`` is shorter than ``data``.
    """
    # Local import: importing this module does not require matplotlib.
    from matplotlib import pyplot as plt

    plt.rcParams['font.sans-serif'] = ['SimHei']

    # Read the tick labels before creating the figure so that a missing file
    # does not leave an orphaned figure behind.  The labels are stripped
    # because the third field may be the last one on the line and would
    # otherwise carry the trailing newline into the tick text.
    with open("class.56", "r", encoding="utf8") as fin:
        class_names = [row.split("::###::")[2].strip() for row in fin]

    # One x position per entry of ``whole`` (56 for the data built in this file).
    positions = list(range(len(whole)))

    # Series style: fixed palette (reused cyclically) and a constant marker size.
    palette = ['green', 'red', 'blue', "orange", "pink", "gray", "brown", "aqua", "blueviolet"]
    marker_size = 10

    # Canvas size.
    fig = plt.figure(figsize=(40, 8), dpi=200)
    try:
        ax1 = fig.add_subplot(111)

        # Set the ticks on the plotting axes explicitly.  Going through
        # ``plt.xticks`` before the axes exist targets the implicit current
        # axes, which (depending on the Matplotlib version) is not necessarily
        # the axes returned by ``add_subplot``.
        ax1.set_xticks(positions)
        ax1.set_xticklabels(class_names, fontproperties='SimHei')

        for idx, series in enumerate(data):
            ax1.scatter(
                positions,
                series,
                c=palette[idx % len(palette)],
                s=marker_size,
                label=label_list[idx],
            )
        ax1.set_ylabel('点击比例')
        ax1.legend(loc=2)

        # Second y-axis sharing the same x-axis for the visitor counts.
        ax2 = ax1.twinx()
        ax2.plot(positions, whole, "-", color="black", label="人数")
        ax2.set_ylabel('访问人数')
        ax2.legend(loc=0)

        fig.savefig(picture_name)
        plt.show()
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def find_age_period(age, seg_list):
    """Return the index of the age bucket that ``age`` belongs to.

    ``seg_list`` holds the *inclusive* upper bound of each bucket and is
    expected to be in ascending order; ``[12, 19, 25]`` therefore describes
    the buckets 0-12, 13-19 and 20-25.

    Args:
        age: The age to classify.
        seg_list: Inclusive upper bounds of the buckets.

    Returns:
        The index of the first bound that is greater than or equal to
        ``age``.  When ``age`` exceeds every bound (or ``seg_list`` is empty)
        the string ``"error"`` is returned; this sentinel is kept for
        backward compatibility with existing callers.
    """
    for index, upper_bound in enumerate(seg_list):
        # ``<=`` (not ``<``): an age equal to a bound belongs to that bucket,
        # matching bucket labels such as "0-12" / "13-19".
        if age <= upper_bound:
            return index
    return "error"
def get_date(sample_path_list, feature_name, num_classes=56):
    """Aggregate click statistics per title class, split by one user feature.

    Every sample file is read line by line.  A line is tab-separated and is
    interpreted as follows:

    * third column: lines whose value starts with ``"ERROR"`` are skipped;
    * fourth column from the end: ``"||"``-joined user features with age at
      index 1, gender at index 2, grade at index 3 and city level at index 4;
    * third column from the end: the click label (parsed via ``float`` then
      ``int``) used as index 0/1 into the per-bucket counter;
    * last column: comma-separated floats; the position of the maximum
      selects the title class.

    Only the feature selected by ``feature_name`` is parsed from the user
    feature column; for ``"all"`` that column is not read at all.  Blank
    lines are ignored.

    Args:
        sample_path_list: Paths of the sample files to read.
        feature_name: One of ``"gender"``, ``"grade"``, ``"city_level"``,
            ``"age"`` or ``"all"``.
        num_classes: Number of title classes (length of the vector in the
            last column).  Defaults to 56.

    Returns:
        ``None`` (after printing ``"wrong feature"``) when ``feature_name`` is
        unknown.  Otherwise a tuple ``(click_ratio, label_list, whole)``:
        ``click_ratio[bucket][title_class]`` is the share of samples with
        label 1 (0 when there are no samples), ``label_list`` holds the
        display label of every bucket, and ``whole[title_class]`` is the
        number of samples counted for that title class.
    """
    # Display labels per feature; the number of buckets is the list length.
    # NOTE(review): "40-50" overlaps "31-40" and is probably meant to be
    # "41-50" - left unchanged because it is user-visible output.
    bucket_labels = {
        "gender": ["未知", "男", "女"],
        "grade": ["unknow", "doctor", "硕士", "本科", "高中", "初中", "小学", "大专"],
        "city_level": ["北上广深", "二线城市", "三线城市", "四线城市", "五线城市", "港澳台", "国外", "未知"],
        "age": ["0-12", "13-19", "20-25", "26-30", "31-40", "40-50", "51-60", "61-70", "70+"],
        "all": ["click-ratio"],
    }
    # Position of each feature inside the "||"-joined user feature column.
    field_index = {"age": 1, "gender": 2, "grade": 3, "city_level": 4}
    age_seg_list = [12, 19, 25, 30, 40, 50, 60, 70, 100000]

    if feature_name not in bucket_labels:
        print("wrong feature")
        return None

    label_list = bucket_labels[feature_name]
    class_num = len(label_list)

    # counts[title_class][bucket] == [samples with label 0, samples with label 1]
    counts = [[[0, 0] for _ in range(class_num)] for _ in range(num_classes)]

    for sample_path in sample_path_list:
        with open(sample_path, "r", encoding="utf8") as fin:
            for raw_line in fin:
                if not raw_line.strip():
                    # A blank (e.g. trailing) line carries no sample.
                    continue
                fields = raw_line.rstrip("\n").split("\t")
                if fields[2].startswith("ERROR"):
                    continue

                clicked = int(float(fields[-3]))
                title_vec = [float(v) for v in fields[-1].split(",")]
                class_index = int(np.argmax(title_vec))

                if feature_name == "all":
                    bucket = 0
                else:
                    user_features = fields[-4].split("||")
                    raw_value = int(user_features[field_index[feature_name]])
                    if feature_name == "age":
                        bucket = find_age_period(raw_value, age_seg_list)
                    elif feature_name == "city_level":
                        # City levels are shifted down by one to index buckets.
                        # NOTE(review): a raw value of 0 becomes -1 and is
                        # counted in the last bucket - confirm this is intended.
                        bucket = raw_value - 1
                    else:
                        bucket = raw_value

                counts[class_index][bucket][clicked] += 1

    click_ratio = [[0 for _ in range(num_classes)] for _ in range(class_num)]
    whole = [0 for _ in range(num_classes)]
    for title_class, per_bucket in enumerate(counts):
        for bucket, pair in enumerate(per_bucket):
            total = sum(pair)
            whole[title_class] += total
            if total > 0:
                click_ratio[bucket][title_class] = pair[1] / total
    return click_ratio, label_list, whole
if __name__ == "__main__":
    # Script entry: aggregate three daily sample files by gender and plot the
    # result into "<feature>.png".
    feature = "gender"
    sample_list = [
        "sample.20210115",
        "sample.20210116",
        "sample.20210117",
    ]
    stats = get_date(sample_list, feature)
    click_ratio, label_list, whole = stats
    picture_name = feature + ".png"
    draw(click_ratio, label_list, whole, picture_name)
# 效果如下: the resulting figure is shown below.