In [1]:
import json
# Parse the click log: every line of the file is decoded as one JSON document.
path = 'usagov_bitly_data2012-03-16-1331923249.txt'
# A context manager closes the file handle as soon as parsing is done; a bare
# open() inside the comprehension leaves it open until garbage collection.
with open(path) as log_file:
    records = [json.loads(line) for line in log_file]
records[0]
Out[1]:
In [5]:
# Gather the 'tz' value of every record that has one; records without the
# key are skipped.
time_zone = []
for entry in records:
    if 'tz' in entry:
        time_zone.append(entry['tz'])
time_zone[:10]  # first ten time zones
Out[5]:
In [17]:
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
# Build a table from the parsed records and preview its first 15 rows.
frame = DataFrame(data=records)
frame.head(n=15)
Out[17]:
In [8]:
# Count how often each value of the 'tz' column occurs.
tz_column = frame['tz']
tz_counts = tz_column.value_counts()
tz_counts
Out[8]:
In [23]:
# Handle missing data: NaN becomes 'Missing' and the empty string
# becomes 'Unknown' before the values are counted again.
clean_tz = frame['tz'].fillna('Missing')
is_blank = clean_tz == ''
clean_tz[is_blank] = 'Unknown'
print(clean_tz)
tz_counts = clean_tz.value_counts()
tz_counts.head(10)
Out[23]:
In [32]:
# Import pyplot explicitly rather than through the %pylab magic, which
# star-imports numpy and pylab into the notebook namespace and thereby
# shadows builtins such as sum.
import matplotlib.pyplot as plt

# Horizontal bar chart of the first ten entries of tz_counts.
tz_counts[:10].plot(kind='barh', rot=0)
plt.show()
Out[32]:
In [37]:
# Take the first whitespace-separated token of every non-null 'a' value.
first_tokens = []
for agent in frame.a.dropna():
    first_tokens.append(agent.split()[0])
results = Series(first_tokens)
print(results[:5])
results.value_counts()[:6]  # how many times each token occurs
Out[37]:
In [38]:
# Count Windows users.
# Keep only the rows that have an 'a' value, then label each row by whether
# that value contains the substring 'Windows'.
has_agent = frame.a.notnull()
cframe = frame[has_agent]
is_windows = cframe['a'].str.contains('Windows')
operating_system = np.where(is_windows, 'Windows', 'Not Windows')
operating_system[:5]
Out[38]:
In [47]:
# Group by time zone and by the Windows / Not Windows label, then pivot the
# group sizes into one column per label; combinations that never occur
# are shown as 0.
by_tz_os = cframe.groupby(['tz', operating_system])
size_table = by_tz_os.size().unstack()
size_table.fillna(0)
Out[47]:
In [7]:
# MovieLens 1M data set.
# The '::' separator is longer than one character, which pandas' C parser
# does not support. Requesting the Python engine explicitly avoids the
# ParserWarning that pandas emits when it has to fall back on its own.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('users.dat', sep='::', header=None, names=unames,
                      engine='python')
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ratings.dat', sep='::', header=None, names=rnames,
                        engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames,
                       engine='python')
In [8]:
# Preview the first five rows of the users table.
users[:5]
Out[8]:
In [9]:
# Preview the first five rows of the ratings table.
ratings.head()
Out[9]:
In [10]:
# Preview the first five rows of the movies table.
movies.head()
Out[10]:
In [12]:
# Join ratings to users, then the result to movies. No keys are given, so
# each merge uses the column names the two frames have in common.
data = ratings.merge(users).merge(movies)
data
Out[12]:
In [13]:
# Mean rating per title, with one column for each gender value.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head()
Out[13]:
In [23]:
# Group by movie title and count the rating rows in each group.
title_groups = data.groupby('title')
ratings_by_title = title_groups.size()
ratings_by_title.head(10)
Out[23]:
In [31]:
# Titles that have at least 250 rating rows.
has_enough_ratings = ratings_by_title >= 250
active_titles = ratings_by_title.index[has_enough_ratings]
active_titles
Out[31]:
In [39]:
# Restrict the mean ratings to the rows whose title is in active_titles.
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings
Out[39]:
In [45]:
# Order the titles by the 'F' column, highest first, and show the top ten
# (the ten movies rated highest by women).
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)
Out[45]:
In [51]:
# Find the movies on which men and women disagree the most.
# 'diff' is the 'M' mean minus the 'F' mean, so after the ascending sort the
# most negative values (rated higher by F) come first and the most positive
# values (rated higher by M) come last.
mean_ratings['diff'] = mean_ratings['M'].sub(mean_ratings['F'])
sorted_by_diff = mean_ratings.sort_values(by='diff')
print(sorted_by_diff.head(5))
print(sorted_by_diff.iloc[::-1].head(5))
sorted_by_diff.tail(5)  # large gap, and rated higher by men
Out[51]:
In [57]:
# Ignoring gender, find the movies whose ratings are the most spread out.
rating_groups = data.groupby('title')['rating']
rating_std_by_title = rating_groups.std()
print(rating_std_by_title.head(5))  # standard deviation of ratings per title
rating_std_by_title = rating_std_by_title.loc[active_titles]
print(type(rating_std_by_title))
rating_std_by_title.sort_values(ascending=False).head(5)
Out[57]:
In [1]:
# US baby names analysis.
# Read yob1880.txt, assigning the column names name, sex and births.
names1880 = pd.read_csv(
    'yob1880.txt',
    names=['name', 'sex', 'births'],
)
names1880.head()
Out[1]:
In [62]:
# Sum of births for each value of 'sex'.
births_by_sex = names1880.groupby('sex')['births']
births_by_sex.sum()
Out[62]:
In [2]:
# Read one yobYYYY.txt file per year from 1880 through 2010, tag every row
# with its year, and stack the pieces into a single table.
# The per-year frames are built inside a comprehension so that this cell does
# not rebind the notebook-level names `frame` and `path`, which hold the
# click-log table and its file name in the earlier cells.
years = range(1880, 2011)
columns = ['names', 'sex', 'births']
pieces = [
    pd.read_csv('yob%d.txt' % year, names=columns).assign(year=year)
    for year in years
]
# ignore_index=True discards the per-file row numbers in favour of one
# continuous index.
names = pd.concat(pieces, ignore_index=True)
In [78]:
# Preview the first five rows of the combined table.
names[:5]
Out[78]:
In [3]:
# Total births per year, with one column per sex.
# The aggregation is named by string: pandas resolves 'sum' to its own grouped
# sum, whereas passing the builtin `sum` callable is deprecated in recent
# pandas releases and depends on whatever `sum` is bound to in the notebook.
total_births = names.pivot_table('births', index='year', columns='sex',
                                 aggfunc='sum')
total_births.head()
Out[3]:
In [87]:
import matplotlib.pyplot as plt
# Line chart of the yearly totals; pandas draws one line per column.
total_births.plot.line(title='Total births by sex and year')
plt.show()
In [29]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group, i.e. the share of the group's births held by that row.

    Args:
        group: DataFrame with a numeric ``births`` column.

    Returns:
        A new DataFrame with the extra ``prop`` column. ``group`` itself is
        not modified, so the function does not write into the frame that
        ``groupby(...).apply`` hands to it.
    """
    # Cast to float first so the division is never an integer division.
    births = group['births'].astype(float)
    return group.assign(prop=births / births.sum())
# Add the per-(year, sex) share of births to every row.
# group_keys=False keeps the original row index. Without it, recent pandas
# prepends the (year, sex) keys to the index of the result, leaving 'year' and
# 'sex' as both index levels and columns, which makes later groupby and
# pivot_table calls on those names ambiguous.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
names.head()
Out[29]:
In [31]:
def get_top1000(group, n=1000):
    """Return the ``n`` rows of ``group`` that have the most births.

    Args:
        group: DataFrame with a ``births`` column.
        n: Maximum number of rows to keep. Defaults to 1000, so the function
            can still be passed to ``groupby(...).apply`` without arguments.

    Returns:
        The rows of ``group`` sorted by ``births`` in descending order and
        cut off after the first ``n``.
    """
    return group.sort_values(by='births', ascending=False).iloc[:n]
# Keep the top rows of every (year, sex) group.
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
# apply() puts the group keys into the index of its result while 'year' and
# 'sex' are still present as columns; drop that index so the later
# pivot_table(index='year', ...) calls refer to the columns unambiguously.
top1000 = top1000.reset_index(drop=True)
top1000[:10]
Out[31]:
In [32]:
# Split the top-1000 table by sex.
boys = top1000[top1000['sex'] == 'M']
girls = top1000[top1000['sex'] == 'F']
# Births per year for each name (rows: year, columns: names).
# The aggregation is given as the string 'sum' because passing the builtin
# `sum` callable is deprecated in recent pandas releases.
total_births = top1000.pivot_table('births', index='year', columns='names',
                                   aggfunc='sum')
total_births.head()  # number of births with each name, per year
Out[32]:
In [33]:
import matplotlib.pyplot as plt
# One panel per selected name, each plotting that name's column of
# total_births.
selected_names = ['John', 'Harry', 'Mary', 'Marilyn']
subset = total_births[selected_names]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=True,
    title='Number of births per year',
)
plt.show()
In [39]:
# Share of births covered by the top-1000 rows, per year and sex.
# The aggregation is given as the string 'sum' because passing the builtin
# `sum` callable is deprecated in recent pandas releases.
table = top1000.pivot_table('prop', index='year', columns='sex',
                            aggfunc='sum')
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
# Author's reading of the chart: the share held by the top 1000 names shrinks
# over the years, which suggests naming has become more diverse.
plt.show()
In [44]:
# Rows of the boys table whose year is 2010.
df = boys[boys['year'] == 2010]
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the preview is printed explicitly, as other cells in this notebook do
# for intermediate results.
print(df[:5])
df.info()
In [52]:
# How the last letter of names has changed.
def get_last_letter(x):
    """Return the final character of ``x``."""
    return x[-1]


# Apply the helper to every value of the 'names' column.
last_letters = names['names'].map(get_last_letter)
last_letters.name = 'last_letter'
last_letters.head()
Out[52]:
In [55]:
# Births summed by last letter (rows) and by sex and year (columns).
# The aggregation is given as the string 'sum' because passing the builtin
# `sum` callable is deprecated in recent pandas releases.
table = names.pivot_table('births', index=last_letters,
                          columns=['sex', 'year'], aggfunc='sum')
# Keep only the 1910, 1960 and 2010 columns of the 'year' level.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()
Out[55]:
In [56]:
# Column totals: births for each (sex, year) pair kept in subtable.
subtable.sum(axis=0)
Out[56]:
In [59]:
# Turn the counts into proportions by dividing every column by its own total,
# then draw one bar chart for the 'M' columns and one for the 'F' columns.
column_totals = subtable.sum().astype(float)
letter_prop = subtable / column_totals
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)
plt.show()