#!/usr/bin/python
#coding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
def get_counts(sequence):
    """Tally how many times each item occurs in ``sequence``.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its occurrence count,
        with keys in order of first appearance.
    """
    tally = {}
    for item in sequence:
        # A missing key counts as zero, so first and repeated sightings
        # go through the same expression.
        tally[item] = tally.get(item, 0) + 1
    return tally
def top_counts(count_dict, n=10):
    """Return the ``n`` largest ``(count, key)`` pairs from ``count_dict``.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of pairs to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        pair with the largest count comes last. The list is empty when ``n``
        is zero or negative.
    """
    if n <= 0:
        # ``pairs[-0:]`` would return the whole list and a negative ``n``
        # would slice from the front, so non-positive sizes are handled
        # explicitly.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    # Note: the slice keeps ascending order (smallest of the top n first).
    return value_key_pairs[-n:]
# NOTE: machine-specific absolute path to the input file (one JSON document
# per line, judging by how it is parsed below).
path = r'C:\Users\chenzhuo02\Desktop\硕士自学\pydata-book-2nd-edition\datasets\bitly_usagov\example.txt'

# Read the file once inside a context manager so the handle is closed
# promptly, and decode explicitly as UTF-8 rather than relying on the
# platform's locale default (the file is assumed to be UTF-8 encoded JSON).
with open(path, encoding='utf-8') as f:
    lines = f.readlines()

# Show the first raw line (an empty string if the file has no lines).
print(lines[0] if lines else '')

# Parse every line from a JSON string into a Python dict.
records = [json.loads(line) for line in lines]
print(records[0]['tz'])

# Collect the 'tz' value of every record that actually has a 'tz' key.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
print(time_zones[:10])

# Tally the time zones with the pure-Python helpers.
counts = get_counts(time_zones)
print(counts["America/New_York"])
print(top_counts(counts, 10))
from collections import Counter
counts = Counter(time_zones) # Counter tallies how many times each value occurs
print(counts.most_common(10)) # most_common(10) returns the ten most frequent entries
# Count the time zones with pandas
print('用pandas对时区进行计数')
from pandas import DataFrame, Series
import pandas as pd; import numpy as np
# Convert the list of record dicts into a pandas DataFrame.
frame = DataFrame(records)
print(frame['tz'].value_counts().head(10))

# Label absent values as 'Missing' and empty strings as 'Unknown' so both
# show up as their own categories in the counts.
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
print(tz_counts.head(10))

# Horizontal bar chart of the ten most frequent time zones.
tz_counts.head(10).plot(kind='barh', rot=0)
# When run as a plain script the figure is never rendered unless it is
# shown explicitly.
plt.show()
# Count with pandas again, this time split into two categories per time zone.

# First whitespace-separated token of every non-null value in column 'a'
# (presumably the user-agent string -- confirm against the data).
results = Series([agent.split()[0] for agent in frame['a'].dropna()])
print(results.head(5))

# Keep only the rows where column 'a' is present.
cframe = frame[frame['a'].notna()]

# Label each remaining row by whether its 'a' value mentions 'Windows'.
is_windows = cframe['a'].str.contains('Windows')
operating_system = np.where(is_windows, 'Windows', 'Not Windows')
print(operating_system[:5])

# Group by time zone and the Windows / Not Windows label, then pivot the
# group sizes into one column per label (absent combinations become 0).
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts.head(10))

# Order the rows by their total count (ascending) and keep the last ten,
# i.e. the ten time zones with the largest totals.
index = agg_counts.sum(axis=1).argsort()
count_subset = agg_counts.take(index).iloc[-10:]
print(count_subset)