In [1]:
import graphlab
In [2]:
# Load the song dataset stored at 'song_data.gl' into an SFrame.
song_data = graphlab.SFrame('song_data.gl')
# For importing data in other formats, see https://dato.com/learn/userguide/sframe/sframe-intro.html
In [3]:
# Preview the first rows of the dataset.
song_data.head()
Out[3]:
In [4]:
graphlab.canvas.set_target('ipynb') # Set the output target for Canvas views
# With the 'ipynb' target, calling .show() renders its output in a cell of the IPython Notebook
In [5]:
song_data['song'].show() # Show how often each song occurs and its share of the data
In [6]:
# Number of rows in song_data.
len(song_data)
Out[6]:
In [7]:
# Distinct values of the "user_id" column.
users = song_data["user_id"].unique()
In [8]:
# Number of distinct user ids.
len(users)
Out[8]:
In [9]:
# Randomly split the rows into a training set (80%) and a test set (the rest).
# The seed is fixed so the split is the same on every run.
train_data, test_data = song_data.random_split(0.8, seed=0)
In [10]:
# Train a popularity-based recommender on the training set, keyed by
# the 'user_id' and 'song' columns.
popularity_model = graphlab.popularity_recommender.create(
    train_data, user_id='user_id', item_id='song')
In [11]:
popularity_model.recommend(users=[users[0]]) # Ranked list of recommended songs for the given user; the top 10 are returned by default
Out[11]:
In [12]:
# Same query for the second entry of `users`.
popularity_model.recommend(users=[users[1]])
Out[12]:
In [13]:
# Train an item-similarity recommender on the same training set, keyed by
# the 'user_id' and 'song' columns.
personalized_model = graphlab.item_similarity_recommender.create(
    train_data, user_id='user_id', item_id='song')
In [14]:
# Recommendations from the item-similarity model for the first entry of `users`.
personalized_model.recommend(users=[users[0]])
Out[14]:
In [15]:
# Recommendations from the item-similarity model for the second entry of `users`.
personalized_model.recommend(users=[users[1]])
Out[15]:
In [16]:
# Ask the item-similarity model for the items similar to this song.
personalized_model.get_similar_items(['With Or Without You - U2'])
Out[16]:
In [17]:
# Ask the item-similarity model for the items similar to this song.
personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])
Out[17]:
In [18]:
%matplotlib inline
model_performance = graphlab.recommender.util.compare_models(test_data,
[popularity_model, personalized_model],
user_sample=0.05)
# 利用测试集对比不同推荐模型的预测或推荐效果,user_sample 设置进行效果预测的数据的抽样比例
In [19]:
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots()
pr_curves_by_model = [res['precision_recall_overall'] for res in model_performance]
pr_curve = pr_curves_by_model[0].sort('recall')
ax.plot(list(pr_curve['recall']), list(pr_curve['precision']),
'blue', label='M1')
pr_curve = pr_curves_by_model[1].sort('recall')
ax.plot(list(pr_curve['recall']), list(pr_curve['precision']),
'green', label='M2')
ax.set_title('Precision-Recall Averaged Over Users')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
fig.show()
In [20]:
dir(song_data) # List all attributes of song_data (an SFrame)
Out[20]:
In [21]:
users_foo = song_data[song_data["artist"]=="Foo Fighters"]
# Using the band "Foo Fighters" as an example: keep the rows whose "artist" is "Foo Fighters" in users_foo.
# This starts building the set of "Foo Fighters" listeners; the other artists below are handled the same way.
# NOTE(review): at this point users_foo holds whole rows of song_data, not user ids.
In [22]:
# Drop duplicates from users_foo.
# NOTE(review): users_foo is a row subset of song_data here, so this presumably
# de-duplicates whole rows rather than user ids — confirm.
users_foo = users_foo.unique()
In [23]:
# Display the current contents of users_foo.
users_foo
Out[23]:
In [24]:
# Rows of song_data whose "artist" is "Kanye West".
users_kanye = song_data[song_data["artist"]=="Kanye West"]
In [25]:
# Drop duplicates from users_kanye.
# NOTE(review): presumably de-duplicates whole rows, not user ids — confirm.
users_kanye = users_kanye.unique()
In [26]:
# Rows of song_data whose "artist" is "Taylor Swift".
users_taylor = song_data[song_data["artist"]=="Taylor Swift"]
In [27]:
# Drop duplicates from users_taylor.
# NOTE(review): presumably de-duplicates whole rows, not user ids — confirm.
users_taylor = users_taylor.unique()
In [28]:
# Rows of song_data whose "artist" is "Lady GaGa".
users_gaga = song_data[song_data["artist"]=="Lady GaGa"]
In [29]:
# Drop duplicates from users_gaga.
# NOTE(review): presumably de-duplicates whole rows, not user ids — confirm.
users_gaga = users_gaga.unique()
In [30]:
# Size of users_gaga. NOTE(review): in a top-to-bottom run this is still the
# row-level subset, so this is presumably a row count, not a user count.
len(users_gaga)
Out[30]:
In [31]:
# Size of users_foo. NOTE(review): in a top-to-bottom run this is still the
# row-level subset, so this is presumably a row count, not a user count.
len(users_foo)
Out[31]:
In [32]:
# Size of users_taylor. NOTE(review): in a top-to-bottom run this is still the
# row-level subset, so this is presumably a row count, not a user count.
len(users_taylor)
Out[32]:
In [33]:
# Size of users_kanye. NOTE(review): in a top-to-bottom run this is still the
# row-level subset, so this is presumably a row count, not a user count.
len(users_kanye)
Out[33]:
In [34]:
# Inspect the entry at index 1 of song_data.
song_data[1]
Out[34]:
In [35]:
# Distinct ids of the users who listened to Lady GaGa.
# Derived directly from song_data instead of from the previous value of
# users_gaga, so the cell gives the same result however often it is re-run.
# The old form indexed users_gaga by "user_id", which only works while
# users_gaga still holds rows, i.e. only the first time the cell runs.
users_gaga = song_data[song_data["artist"] == "Lady GaGa"]["user_id"].unique()
In [36]:
# Number of entries in users_gaga after reducing it to distinct "user_id" values.
len(users_gaga)
Out[36]:
In [37]:
# Distinct ids of the users who listened to Kanye West / Foo Fighters.
# Derived directly from song_data instead of from the previous values of
# users_kanye / users_foo, so the cell can be re-run safely. The old form
# indexed those variables by "user_id", which fails once they hold ids
# instead of rows.
users_kanye = song_data[song_data["artist"] == "Kanye West"]["user_id"].unique()
users_foo = song_data[song_data["artist"] == "Foo Fighters"]["user_id"].unique()
In [38]:
# Distinct ids of the users who listened to Taylor Swift.
# Derived directly from song_data instead of from the previous value of
# users_taylor, so the cell can be re-run safely. The old form indexed
# users_taylor by "user_id", which fails once it holds ids instead of rows.
users_taylor = song_data[song_data["artist"] == "Taylor Swift"]["user_id"].unique()
In [39]:
# Number of entries in users_kanye after reducing it to distinct "user_id" values.
len(users_kanye)
Out[39]:
In [40]:
# Number of entries in users_foo after reducing it to distinct "user_id" values.
len(users_foo)
Out[40]:
In [41]:
# Number of entries in users_taylor after reducing it to distinct "user_id" values.
len(users_taylor)
Out[41]:
In [42]:
# Group by 'artist' and sum 'listen_count' within each group, giving the
# total number of listens per artist as 'total_count'.
total_listens = graphlab.aggregate.SUM('listen_count')
listen_counts = song_data.groupby(key_columns='artist',
                                  operations={'total_count': total_listens})
In [43]:
listen_counts.sort('total_count', ascending=False) # Highest totals first, so the displayed preview is the top 10
Out[43]:
In [44]:
listen_counts.sort('total_count', ascending=True) # Lowest totals first, so the displayed preview is the bottom 10
Out[44]:
In [45]:
# Take at most the first 10000 distinct user ids that occur in the test set.
test_user_ids = test_data['user_id'].unique()
subset_test_users = test_user_ids[0:10000]
In [46]:
# Ask the item-similarity model for k=1 recommendation per user in subset_test_users.
subset_recommendations = personalized_model.recommend(subset_test_users,k=1)
In [47]:
# Preview the first rows of the recommendations.
subset_recommendations.head()
Out[47]:
In [48]:
# Group the recommendations by 'song' and count the rows in each group,
# i.e. how many times each song was recommended, as 'total_count'.
times_recommended = graphlab.aggregate.COUNT()
most_recommended = subset_recommendations.groupby(
    key_columns='song', operations={'total_count': times_recommended})
In [49]:
most_recommended.sort('total_count', ascending=False) # Highest counts first, so the displayed preview is the top 10
Out[49]:
In [50]:
most_recommended.sort('total_count', ascending=True) # Lowest counts first, so the displayed preview is the bottom 10
Out[50]: