来自bit.ly的1.usa.gov数据
导入json格式文件
import json
path='D:/BaiduYunDownload/python/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records=[json.loads(line) for line in open(path)]
records[0]['tz'] #Python索引从0开始,而R从1开始
Out[4]: u'America/New_York'
print records[0]['tz']
Out [5]: America/New_York
对时区进行计数
time_zones=[rec['tz'] for rec in records if 'tz' in rec] #不是所有记录都有时区,所以加上if条件后缀
time_zones[:10]
Out[10]:
[u'America/New_York',
u'America/Denver',
u'America/New_York',
u'America/Sao_Paulo',
u'America/New_York',
u'America/New_York',
u'Europe/Warsaw',
u'',
u'',
u'']
计数有两种方式,一种只用标准Python库,一种用pandas。
(1) 第一种
def get_counts(sequence):
    """Tally how many times each distinct value occurs in *sequence*.

    Returns a plain dict mapping value -> occurrence count.
    """
    tally = {}
    for item in sequence:
        # dict.get with a default replaces the explicit membership test.
        tally[item] = tally.get(item, 0) + 1
    return tally
更简洁:
from collections import defaultdict
def get_counts2(sequence):
    """Tally occurrences of each value in *sequence* via defaultdict.

    Missing keys start at 0, so no membership check is needed.
    Returns a defaultdict(int) mapping value -> count.
    """
    tally = defaultdict(int)
    for item in sequence:
        tally[item] += 1
    return tally
counts=get_counts(time_zones)
counts['America/New_York']
Out[14]: 1251
len(time_zones)
Out[15]: 3440
取前十位的时区及其计数值:
def top_counts(count_dict, n=10):
    """Return the *n* largest (count, key) pairs, smallest first.

    Pairs are built count-first so the plain tuple sort orders by
    count (ties broken by key), exactly as the original did.
    """
    pairs = sorted((count, key) for key, count in count_dict.items())
    return pairs[-n:]
top_counts(counts)
Out[21]:
[(33, u'America/Sao_Paulo'),
(35, u'Europe/Madrid'),
(36, u'Pacific/Honolulu'),
(37, u'Asia/Tokyo'),
(74, u'Europe/London'),
(191, u'America/Denver'),
(382, u'America/Los_Angeles'),
(400, u'America/Chicago'),
(521, u''),
(1251, u'America/New_York')]
可用collections.Counter简化:
from collections import Counter
counts=Counter(time_zones)
counts.most_common(10)
Out[24]:
[(u'America/New_York', 1251),
(u'', 521),
(u'America/Chicago', 400),
(u'America/Los_Angeles', 382),
(u'America/Denver', 191),
(u'Europe/London', 74),
(u'Asia/Tokyo', 37),
(u'Pacific/Honolulu', 36),
(u'Europe/Madrid', 35),
(u'America/Sao_Paulo', 33)]
(2) 用pandas计数
Dataframe是pandas中最重要的数据结构。
from pandas import DataFrame,Series
import pandas as pd; import numpy as np
frame=DataFrame(records)
tz_counts=frame['tz'].value_counts()
tz_counts[:10]
Out[30]:
America/New_York 1251
521 #该处为空值
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Name: tz, dtype: int64
找到空值并加名称:
clean_tz=frame['tz'].fillna('Missing')
clean_tz[clean_tz=='']='Unknown'
tz_counts=clean_tz.value_counts()
tz_counts[:10]
Out[34]:
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
Name: tz, dtype: int64
画图:
tz_counts[:10].plot(kind='barh',rot=0)
其他数据处理
· 把一部分数据分离出来
frame['a'][1]
Out[36]: u'GoogleMaps/RochesterNY'
frame['a'][50]
Out[37]: u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
results=Series([x.split()[0] for x in frame.a.dropna()]) #把字符串的第一节分离出来
results[:5]
Out[39]:
0 Mozilla/5.0
1 GoogleMaps/RochesterNY
2 Mozilla/4.0
3 Mozilla/5.0
4 Mozilla/5.0
dtype: object
· 区分Windows和非Windows用户
cframe=frame[frame.a.notnull()] #去掉缺失值
operating_system=np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows') #含有Windows字符的作为Windows用户
operating_system[:5]
Out[42]:
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'], dtype='|S11')
by_tz_os=cframe.groupby(['tz', operating_system]) #根据条件分组
agg_counts=by_tz_os.size().unstack().fillna(0) #利用size对分组结果计算,并用unstack对计数结果进行重塑
agg_counts[:10]
Out[47]:
Not Windows Windows
tz
245.0 276.0
Africa/Cairo 0.0 3.0
Africa/Casablanca 0.0 1.0
Africa/Ceuta 0.0 2.0
Africa/Johannesburg 0.0 1.0
Africa/Lusaka 0.0 1.0
America/Anchorage 4.0 1.0
America/Argentina/Buenos_Aires 1.0 0.0
America/Argentina/Cordoba 0.0 1.0
America/Argentina/Mendoza 0.0 1.0
· 选取最常出现时区
indexer=agg_counts.sum(1).argsort()
count_subset=agg_counts.take(indexer)[-10:]
count_subset
Out[54]:
Not Windows Windows
tz
America/Sao_Paulo 13.0 20.0
Europe/Madrid 16.0 19.0
Pacific/Honolulu 0.0 36.0
Asia/Tokyo 2.0 35.0
Europe/London 43.0 31.0
America/Denver 132.0 59.0
America/Los_Angeles 130.0 252.0
America/Chicago 115.0 285.0
245.0 276.0
America/New_York 339.0 912.0
画图:
count_subset.plot(kind='barh',stacked=True)
normed_subset=count_subset.div(count_subset.sum(1),axis=0)
normed_subset.plot(kind='barh',stacked=True)
MovieLens 1M数据集
读取数据
import pandas as pd
unames=['user_id', 'gender', 'age', 'occupation', 'zip']
users=pd.read_table('D:/BaiduYunDownload/python/pydata-book-master/ch02/movielens/users.dat', sep='::',header=None,names=unames)
rnames=['user_id','movie_id','rating','timestamp']
ratings=pd.read_table('D:/BaiduYunDownload/python/pydata-book-master/ch02/movielens/ratings.dat', sep='::',header=None,names=rnames)
mnames=['movie_id','title','genres']
movies=pd.read_table('D:/BaiduYunDownload/python/pydata-book-master/ch02/movielens/movies.dat', sep='::',header=None,names=mnames)
pandas的merge将三组数据合并
data=pd.merge(pd.merge(ratings, users), movies)
计算平均得分
mean_ratings=data.pivot_table(values='rating',index='title',columns='gender',aggfunc='mean')
mean_ratings[:5]
Out[82]:
gender F M
title
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024
滤掉评分数据不够250条的电影
先对title进行分组,再利用size()得到分组大小
# Group by title and use size() to get the number of ratings per film.
ratings_by_title = data.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
# .loc replaces .ix (deprecated in pandas 0.20, removed in 1.0):
# keep only the rows for films with enough ratings.
mean_ratings = mean_ratings.loc[active_titles]

# Sort by the F column, descending, to see films favored by women.
# sort_values replaces sort_index(by=...), which was removed in pandas 0.20.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

# Measure rating disagreement between the sexes.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff[::-1][:15]  # reversed: films men rated far higher than women
利用评分方差/标准差找到分歧最大的电影
# Standard deviation of ratings per title: high std = most disagreement.
# BUG FIX: the column is named 'rating' (see rnames), not 'ratings' —
# the original line raised KeyError.
rating_std_by_title = data.groupby('title')['rating'].std()
# .loc replaces .ix (removed in modern pandas).
rating_std_by_title = rating_std_by_title.loc[active_titles]
# Series.order was removed in pandas 0.20; sort_values is the replacement.
rating_std_by_title.sort_values(ascending=False)[:10]
1880-2010年间全美婴儿姓名
import pandas as pd
names1880=pd.read_csv('D:/BaiduYunDownload/python/pydata-book-master/ch02/names/yob1880.txt',names=['name','sex','births'])
names1880.groupby('sex').births.sum()
Out[90]:
sex
F 90993
M 110493
Name: births, dtype: int64
数据合并
years=range(1880,2011)
pieces=[]
columns=['name','sex','births']
for year in years:
path='D:/BaiduYunDownload/python/pydata-book-master/ch02/names/yob%d.txt' % year
frame=pd.read_csv(path,names=columns)
frame['year']=year
pieces.append(frame)
names=pd.concat(pieces, ignore_index=True) #不必返回csv自带行号
#在year和sex上进行分类
total_births=names.pivot_table(index='year',columns='sex',aggfunc=sum)
total_births.tail()
Out[101]:
births
sex F M
year
2006 1896468 2050234
2007 1916888 2069242
2008 1883645 2032310
2009 1827643 1973359
2010 1759010 1898382
total_births.plot(title='Total births by sex and year')
插入一个prop列
def add_prop(group):
    """Attach a 'prop' column: each row's share of the group's total births.

    The float cast guards against Python 2 integer division (this
    transcript targets Python 2). Mutates *group* in place and returns it.
    """
    totals = group['births'].astype(float)
    group['prop'] = totals / totals.sum()
    return group
names=names.groupby(['year','sex']).apply(add_prop)
np.allclose(names.groupby(['year','sex']).prop.sum(),1) #验证prop列和是否为1
Out[106]: True
取其中子集分析命名趋势
每对sex/year组合的前1000个名字
def get_top1000(group):
    """Return the 1000 rows of *group* with the most births.

    Uses sort_values: sort_index(by=...) was deprecated in pandas 0.17
    and removed in 0.20, so the original call fails on modern pandas.
    """
    return group.sort_values(by='births', ascending=False)[:1000]
grouped=names.groupby(['year','sex'])
top1000=grouped.apply(get_top1000)
boys=top1000[top1000.sex=='M']
girls=top1000[top1000.sex=='F']
total_births=top1000.pivot_table(values='births',index='year',columns='name',aggfunc=sum)
subset=total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True, figsize=(12,10),grid=False,title="Number of births per year")
看起来这几个名字风光不再,但事实并非如此简单
评估命名多样性的增长
table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
yticks=np.linspace(0,1.2,13),xticks=range(1880,2020,10))
计算名字多样性:统计各年中,累计出生比例达到50%所需要的热门名字数量
def get_quantile_count(group,q=0.5):
group=group.sort_index(by='prop',ascending=False)
return group.prop.cumsum().searchsorted(q)+1
diversity=top1000.groupby(['year','sex']).apply(get_quantile_count)
diversity=diversity.unstack('sex')
diversity['M']=diversity['M'].astype('int')
diversity.head()
Out[139]:
sex F M
year
1880 38 14
1881 38 14
1882 38 15
1883 39 15
1884 39 16
diversity.plot(title="Number of popular names in top 50%")
女孩的多样性高于男孩。
最后一个字母分布变化
从name列取出最后一个字母
get_last_letter=lambda x:x[-1]
last_letters=names.name.map(get_last_letter)
last_letters.name='last_letter'
table=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)
subtable=table.reindex(columns=[1910,1960,2010],level='year')
subtable.head()
Out[146]:
sex F M
year 1910 1960 2010 1910 1960 2010
last_letter
a 108376.0 691247.0 670605.0 977.0 5204.0 28438.0
b NaN 694.0 450.0 411.0 3912.0 38859.0
c 5.0 49.0 946.0 482.0 15476.0 23125.0
d 6750.0 3729.0 2607.0 22111.0 262112.0 44398.0
e 133569.0 435013.0 313833.0 28655.0 178823.0 129012.0
subtable.sum()
Out[147]:
sex year
F 1910 396416.0
1960 2022062.0
2010 1759010.0
M 1910 194198.0
1960 2132588.0
2010 1898382.0
dtype: float64
import matplotlib.pyplot as plt

# FIX: normalize counts to per-year proportions BEFORE plotting — the
# original transcript used letter_prop two lines before defining it.
letter_prop = table / table.sum().astype(float)

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)

# Time series for a few last letters of boys' names
# (.loc replaces .ix, removed in modern pandas):
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()
Out[159]:
last_letter d n y
year
1880 0.083055 0.153213 0.075760
1881 0.083247 0.153214 0.077451
1882 0.085340 0.149560 0.077537
1883 0.084066 0.151646 0.079144
1884 0.086120 0.149915 0.080405
dny_ts.plot()
变成女孩名字的男孩名
例如Lesley或Leslie,取“lesl”开头名字走势:
all_names=top1000.name.unique()
mask=np.array(['lesl' in x.lower() for x in all_names])
lesley_like=all_names[mask]
lesley_like
Out[165]: array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
filtered=top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()
Out[168]:
name
Leslee 1082
Lesley 35022
Lesli 929
Leslie 370429
Lesly 10067
Name: births, dtype: int64
table=filtered.pivot_table('births',index='year',columns='sex',aggfunc='sum')
table=table.div(table.sum(1),axis=0)
table.tail()
Out[171]:
sex F M
year
2006 1.0 NaN
2007 1.0 NaN
2008 1.0 NaN
2009 1.0 NaN
2010 1.0 NaN
table.plot(style={'M':'k-','F':'k--'})