统计元素出现次数的几种方法:
# coding=utf-8
__author__ = 'mac'

import json
import os
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Hard-coded location of the sample data; the script parses each line of this
# file as one JSON record.
path = '/Users/mac/PycharmProjects/python2.7/BigData/pydata-book-2nd-edition/datasets/bitly_usagov/example.txt'


# Method 1: plain Python.
def get_counts(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its occurrence count.

    Example:
        >>> get_counts(['a', 'b', 'a']) == {'a': 2, 'b': 1}
        True
    """
    counts = {}
    for x in sequence:
        counts[x] = counts.get(x, 0) + 1
    return counts


# Method 2: collections.defaultdict, whose missing keys start at 0.
def get_counts2(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its count.
        Looking up an item that never occurred yields 0 (and, as with any
        defaultdict, inserts that key).
    """
    counts = defaultdict(int)  # every missing key is initialised to 0
    for x in sequence:
        counts[x] += 1
    return counts


def top_counts(count_dict, n=10):
    """Return the *n* most frequent entries of a counts mapping.

    Args:
        count_dict: mapping of key -> count (e.g. the result of get_counts).
        n: number of entries to return; defaults to 10.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent entry is last. Fewer than *n* tuples are returned when
        the mapping is smaller than *n*; an empty list when ``n <= 0``.
    """
    # Guard explicitly: pairs[-0:] is pairs[0:], which would return every
    # entry instead of none.
    if n <= 0:
        return []
    # Ascending sort on (count, key); the largest counts end up at the tail.
    value_key_pairs = sorted(
        (count, tz) for tz, count in count_dict.items()
    )
    return value_key_pairs[-n:]


if __name__ == '__main__':
    # The plotting libraries are only relevant to the optional chart
    # mentioned near the end of the script, so they are imported here rather
    # than being required just to import the helpers above.
    import matplotlib.pyplot as plt
    import pylab as pl

    # Parse each line of the data file as one JSON record; the with-block
    # closes the file handle deterministically.
    with open(path) as f:
        records = [json.loads(line) for line in f]
    print(records[0]['tz'])

    # Only records that carry a 'tz' key contribute a time zone.
    time_zone = [rec['tz'] for rec in records if 'tz' in rec]
    print(time_zone[:10])

    # Methods 1 and 2: count the time zones with the helpers defined above.
    counts = get_counts(time_zone)
    print(counts['America/New_York'])
    print(len(time_zone))

    # Ten most frequent time zones (ascending, most frequent last).
    print(top_counts(counts))

    # Method 3: collections.Counter; most_common(10) returns the ten most
    # frequent entries ordered from largest to smallest count.
    counts = Counter(time_zone)
    print(counts.most_common(10))

    # Method 4: pandas. A DataFrame column is a Series, which carries an index.
    frame = DataFrame(records)
    print(frame['tz'])
    tz_counts = frame['tz'].value_counts()

    # fillna replaces missing values (NaN); unknown values (empty strings)
    # are replaced through boolean-mask indexing.
    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts = clean_tz.value_counts()
    print(tz_counts[:10])
    print(frame['a'][1])

    # NOTE: to chart the ten most frequent zones, plot tz_counts[:10] as a
    # horizontal bar chart (kind='barh', rot=0) and then call pl.show(),
    # which is what actually displays the figure.

    # First whitespace-separated token of every non-missing 'a' value;
    # dropna() discards the missing entries.
    results = Series([x.split()[0] for x in frame.a.dropna()])
    print(results[:5])