# Dataset link: example.txt
import json
import pandas
import seaborn
import numpy
def main():
    """Load the Bitly USA.gov records and draw both summary charts.

    Reads ``./example.txt`` as one JSON document per line, builds a
    DataFrame from the decoded records, and passes it to ``statistic_tz``
    and ``statistic_a``.

    Raises:
        OSError: if the data file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Read the JSON-lines file (Bitly USA.gov data).
    path = "./example.txt"
    # The context manager closes the handle deterministically, and the
    # explicit encoding keeps decoding independent of the platform locale.
    with open(path, encoding="utf-8") as source:
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # instead of being handed to json.loads, which would raise on them.
        records = [json.loads(line) for line in source if line.strip()]

    # Materialise the records as a DataFrame.
    frame = pandas.DataFrame(records)

    # Count time zones and visualise them with seaborn.
    statistic_tz(frame)
    # Count browser / operating-system information and visualise it.
    # NOTE(review): nothing in the visible code shows or saves a figure, so
    # running this as a plain script may display nothing - confirm how the
    # charts are meant to be rendered.
    statistic_a(frame)
def statistic_tz(frame):
    """Draw a horizontal bar chart of the ten most frequent time zones.

    Values missing from ``frame["tz"]`` are counted as "[Missing]" and
    empty strings as "[Unknown]".
    """
    # fillna returns a new Series, so relabelling below leaves `frame` alone.
    tz_labels = frame["tz"].fillna("[Missing]")
    tz_labels.loc[tz_labels == ""] = "[Unknown]"

    # value_counts sorts by descending frequency; keep the first ten entries.
    top_ten = tz_labels.value_counts().iloc[:10]

    seaborn.set(rc={'figure.figsize': (16, 10)})
    seaborn.barplot(x=top_ten.values, y=top_ten.index)
def statistic_a(frame):
    """Plot the Windows / non-Windows share for the ten busiest time zones.

    Rows whose ``a`` (user-agent) field is null are dropped. Each remaining
    row is labelled "Windows" when its user agent contains the substring
    "Windows", otherwise "Not Windows". Rows are counted per (tz, os), the
    ten time zones with the largest total counts are kept, and each count is
    divided by its time zone's total before being drawn as a grouped
    horizontal bar chart.
    """
    # Copy so that adding the "os" column does not write into the caller's frame.
    cframe = frame[frame["a"].notna()].copy()

    # na=False: a non-string user agent yields NaN from .str.contains, and
    # NaN is truthy inside numpy.where, which would label it "Windows".
    # regex=False: "Windows" is a plain substring, not a pattern.
    is_windows = cframe["a"].str.contains('Windows', regex=False, na=False)
    cframe["os"] = numpy.where(is_windows, 'Windows', 'Not Windows')

    # Count rows per (tz, os); unstack moves "os" into the columns and
    # combinations that never occur become 0.
    agg_counts = cframe.groupby(['tz', 'os']).size().unstack().fillna(0)

    # argsort gives positions in ascending order of per-tz totals, so the
    # last ten positions are the ten most frequent time zones.
    indexer = agg_counts.sum(axis=1).argsort()
    count_subset = agg_counts.take(indexer.iloc[-10:].to_numpy())

    # Back to long form: one row per (tz, os) with the count in "total".
    count_subset = count_subset.stack()
    count_subset.name = 'total'
    count_subset = count_subset.reset_index()

    # Normalise within each time zone. transform("sum") returns a Series
    # aligned with count_subset, so row order and columns stay as they are;
    # groupby.apply with a function that mutates its group does not give a
    # stable index / row order across pandas releases.
    tz_totals = count_subset.groupby('tz')['total'].transform('sum')
    results = count_subset.copy()
    results['normed_total'] = results['total'] / tz_totals

    seaborn.set(rc={'figure.figsize': (16, 10)})
    seaborn.barplot(x='normed_total', y='tz', hue='os', data=results)
def norm_total(group):
    """Return a copy of ``group`` with a ``normed_total`` column added.

    ``normed_total`` is each row's ``total`` divided by the sum of ``total``
    over all rows of ``group``.

    Args:
        group: DataFrame with a numeric ``total`` column.

    Returns:
        A new DataFrame; ``group`` itself is left unmodified.
    """
    # Work on a copy: mutating the object handed in by groupby.apply is not
    # supported by pandas and would also alter the caller's data.
    result = group.copy()
    result['normed_total'] = group['total'] / group['total'].sum()
    return result
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()