# Barely understood any of this; just copied the code from the book verbatim, which is frankly laughable.
#!-*-coding:utf-8-*-
import json
# Raw string: the Windows path is full of backslashes, and a sequence such as
# "\u" is rejected as a malformed escape by Python 3 in a normal literal.
path = r"C:\Python learn\pydata-book-master\ch02\usagov_bitly_data2012-03-16-1331923249.txt"
# Each line of the file is parsed as its own JSON document. The with-block
# closes the file handle as soon as parsing finishes; blank lines are skipped
# because json.loads('') raises instead of returning a record.
with open(path) as data_file:
    records = [json.loads(line) for line in data_file if line.strip()]
import pandas as pd
import numpy as np
# Build a table from the parsed records.
frame = pd.DataFrame(records)
# print(frame)
# print(frame['tz'][:10])
# Frequencies of the raw 'tz' values (recomputed on cleaned data below).
tz_counts = frame['tz'].value_counts()
# print(tz_counts[0:10])
print('')  # blank separator line; parenthesized so it is valid on Python 2 and 3
# fillna() substitutes a placeholder label for missing (NA) values.
clean_tz = frame['tz'].fillna('Missing')
# Entries that are present but empty get their own label.
clean_tz[clean_tz == ''] = 'Unknown'
# Store the counts of the cleaned tz values in tz_counts.
tz_counts = clean_tz.value_counts()
# print(tz_counts[:10])
# tz_counts[:10].plot(kind='barh', rot=0)  # this command must be run under pylab (cmd-ipython-pylab)
# Look at the browser / user-agent information held in the 'a' field.
print(frame['a'][:10])
print(frame['a'][0])  # Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11
print(frame['a'][50])
# Keep the first whitespace-separated token of every non-null agent string.
# Note how split() is used: with no argument it splits on runs of whitespace,
# so an empty or all-blank agent yields [] and [0] would raise IndexError --
# such entries are skipped.
results = pd.Series([x.split()[0] for x in frame.a.dropna() if x.strip()])
print(results[:5])
print(results.value_counts()[:10])
# Keep only the rows whose agent field 'a' is not null.
cframe = frame[frame.a.notnull()]
# np.where works a bit like Excel's IF(): label each row by whether its agent
# string contains 'Windows'.
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
print(operating_system[:5])
# Group by time zone together with the Windows / Not Windows label.
by_tz_os = cframe.groupby(['tz', operating_system])
# size() counts rows per group; unstack() reshapes the counts into a table that
# is easier to read; fillna(0) replaces the blank cells with 0.
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])
# Sort positions by the per-row totals (ascending) and keep the last ten rows,
# i.e. the ten largest totals.
indexer = agg_counts.sum(axis=1).argsort()
count_subset = agg_counts.take(indexer)[-10:]
# Stacked horizontal bar chart of the counts.
# NOTE(review): nothing here displays the figure when run as a plain script;
# presumably this is meant for an interactive pylab session -- confirm.
count_subset.plot(kind='barh', stacked=True)
# Divide every row by its own total so each bar shows proportions.
normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
normed_subset.plot(kind='barh', stacked=True)