Music Recommendation System - 七月在线 (JulyEdu) Course Notes

This post walks through crawling playlist data from NetEase Cloud Music, parsing it into the format supported by the surprise library, and using it to build a recommendation system. It covers data acquisition, parsing the raw playlist data, saving the key id-to-name mappings, and making predictions with collaborative filtering.

1. Raw Data

Playlists were crawled from NetEase Cloud Music and stored as JSON. The compressed full dataset is about 16 GB and expands to over 50 GB. Because it is so large, and the code below never touches the raw dump directly, the raw data is not provided here; only its format is illustrated:
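A hypothetical sketch of a single playlist record, written as a Python dict. The field names are inferred from the parsing code in the sections below, not copied from the actual crawl output, so the real schema may differ:

# a hypothetical single playlist record (field names inferred from the
# parsers below; the real crawl schema may differ)
playlist = {
    "name": "华语流行精选",          # playlist name
    "tags": ["华语", "流行"],        # categories the playlist belongs to
    "id": 78874301,                  # playlist id (placeholder value)
    "subscribed_count": 12036,       # number of users who collected it
    "tracks": [
        {"id": 185811, "name": "成全", "artist": "刘若英", "popularity": 95.0},
    ],
}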
2. Parsing the Raw Playlist Data

Extract 4 playlist-level fields: playlist name, playlist id, subscribed count, and category.
Extract 4 song-level fields: song id, song name, artist, and song popularity.
The parsed playlist file 163_music_playlist.txt is only an intermediate product: it ultimately has to be converted into the format supported by the surprise library, so it is not used directly and is not provided here. Its format is the same as that of the parsed Chinese-pop playlist data, shown below:
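A minimal sketch of one line of this format, reconstructed from the parsing code in the next section (<TAB> stands for a literal tab character; the concrete field names are placeholders): a playlist header delimited by ##, followed by tab-separated songs delimited by :::.

playlist_name##tag1,tag2##playlist_id##subscribed_count<TAB>song_id:::song_name:::artist:::popularity<TAB>song_id:::song_name:::artist:::popularity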

3. Converting the Playlist Data into the surprise Format

# Parse each playlist line into "userid,itemid,rating,timestamp" rows

def is_valid(s):
    # keep only well-formed rows (more than 2 comma-separated fields)
    return len(s.split(",")) > 2

def parse_song_info(song_info):
    try:
        song_id, name, artist, popularity = song_info.split(":::")
        # every song in a playlist gets an implicit rating of 1.0
        # and a dummy timestamp (surprise expects a timestamp column)
        return ",".join([song_id, "1.0", "1300000"])
    except Exception:
        # malformed song entry; drop it
        return ""

def parse_playlist_line(in_line):
    try:
        contents = in_line.strip().split("\t")
        name, tags, playlist_id, subscribed_count = contents[0].split("##")
        songs_info = map(lambda x: playlist_id + "," + parse_song_info(x), contents[1:])
        songs_info = filter(is_valid, songs_info)
        return "\n".join(songs_info)
    except Exception as e:
        print(e)
        return False

def parse_file(in_file, out_file):
    with open(out_file, 'w') as out:
        for line in open(in_file):
            result = parse_playlist_line(line)
            if result:
                out.write(result.strip() + "\n")
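A minimal usage sketch, assuming the parsed Chinese-pop playlist file is named popular.playlist under ./data/ (a hypothetical path; substitute your own). The output file name matches the one loaded in Lesson 4:

parse_file("./data/popular.playlist",
           "./data/output/popular/popular_music_suprise_format.txt")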

4. Saving the playlist id => playlist name and song id => song name Dictionaries

import pickle

def parse_playlist_get_info(in_line, playlist_dic, song_dic):
    contents = in_line.strip().split("\t")
    name, tags, playlist_id, subscribed_count = contents[0].split("##")
    playlist_dic[playlist_id] = name
    for song in contents[1:]:
        try:
            song_id, song_name, artist, popularity = song.split(":::")
            song_dic[song_id] = song_name + "\t" + artist
        except:
            print("song format error")
            print(song + "\n")

def parse_file(in_file, out_playlist, out_song):
    # mapping from playlist id to playlist name
    playlist_dic = {}
    # mapping from song id to song name
    song_dic = {}
    for line in open(in_file):
        parse_playlist_get_info(line, playlist_dic, song_dic)
    # persist both dictionaries as binary pickle files;
    # reload later via playlist_dic = pickle.load(open("playlist.pkl", "rb"))
    pickle.dump(playlist_dic, open(out_playlist, "wb"))
    pickle.dump(song_dic, open(out_song, "wb"))
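A usage sketch, again assuming the hypothetical input file ./data/popular.playlist; the two output names match the pickle files loaded in Lesson 4:

parse_file("./data/popular.playlist",
           "./data/output/popular/popular_playlist.pkl",
           "./data/output/popular/popular_song.pkl")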

Lesson2
Basic usage is as follows:

# any of the recommendation algorithms mentioned above can be used here
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# load the built-in movielens dataset
data = Dataset.load_builtin('ml-100k')
# try SVD matrix factorization
algo = SVD()
# evaluate on the dataset with 3-fold cross-validation and print RMSE/MAE
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Loading your own dataset

import os
from surprise import Dataset, Reader
from surprise.model_selection import KFold

# path to the data file
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
# tell the reader what the text format looks like
reader = Reader(line_format='user item rating timestamp', sep='\t')
# load the data
data = Dataset.load_from_file(file_path, reader=reader)
# split into 5 folds by hand (handy for cross-validation)
kf = KFold(n_splits=5)
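A short sketch of iterating over those folds by hand (an alternative to simply passing cv=5 to cross_validate); accuracy.rmse prints the RMSE on each held-out fold:

from surprise import SVD, accuracy

algo = SVD()
for trainset, testset in kf.split(data):
    algo.fit(trainset)                 # train on this fold
    predictions = algo.test(testset)   # predict on the held-out fold
    accuracy.rmse(predictions, verbose=True)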

Hyperparameter tuning (for a better recommender)
The algorithms implemented here are trained with SGD and similar optimizers, so they have hyperparameters that affect the final result. Just as in sklearn, we can use grid search with cross-validation (GridSearchCV) to pick the best parameters. A simple example:

from surprise import SVD, Dataset
from surprise.model_selection import GridSearchCV

# define the parameter grid to search over
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
# grid search with 3-fold cross-validation
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'fcp'], cv=3)
# find the best parameters on the dataset
data = Dataset.load_builtin('ml-100k')
grid_search.fit(data)

# best RMSE
print(grid_search.best_score['rmse'])
# >>> 0.96117566386

# parameters that achieved the best RMSE
print(grid_search.best_params['rmse'])
# >>> {'reg_all': 0.4, 'lr_all': 0.005, 'n_epochs': 10}

# best FCP
print(grid_search.best_score['fcp'])
# >>> 0.702279736531

# parameters that achieved the best FCP
print(grid_search.best_params['fcp'])
# >>> {'reg_all': 0.6, 'lr_all': 0.005, 'n_epochs': 10}
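Once the search finishes, GridSearchCV also exposes the algorithm instance that achieved the best score, which can then be refit on the full trainset; a short sketch:

# take the estimator with the best RMSE and train it on all the data
algo = grid_search.best_estimator['rmse']
algo.fit(data.build_full_trainset())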


Lesson3
Build a model with collaborative filtering on the movielens dataset and make predictions

from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import cross_validate

# load the built-in movielens dataset
data = Dataset.load_builtin('ml-100k')

algo = KNNWithMeans()
result = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
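cross_validate returns a dict of per-fold scores, which the snippet above stores but never reads; a quick follow-up showing what is inside:

# mean RMSE / MAE across the 3 folds
print(result['test_rmse'].mean(), result['test_mae'].mean())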
"""
以下的程序段告诉大家如何在协同过滤算法建模以后,根据一个item取回相似度最高的item,主要是用到algo.get_neighbors()这个函数
"""
import os

from surprise import KNNBaseline
from surprise import Dataset


def read_item_names():
    """
    获取电影名到电影id 和 电影id到电影名的映射
    """

    file_name = (os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, fit the algorithm so it computes item-item similarities
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)
# build the movie id => name and name => movie id mappings
rid_to_name, name_to_rid = read_item_names()
# look up the raw item id for the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
# find its 10 nearest neighbors
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
# map the neighbors' inner ids back to movie names
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)

print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)

Lesson4
Build a model with collaborative filtering on the NetEase Cloud Music data and make predictions

import os
import pickle
from surprise import KNNBaseline, Reader
from surprise import Dataset

path = "./data/output/popular/"

# rebuild the playlist id => playlist name mapping
id_name_dic = pickle.load(open(path + "popular_playlist.pkl", "rb"))
print("Loaded the playlist id => playlist name mapping...")
# rebuild the playlist name => playlist id mapping
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("Loaded the playlist name => playlist id mapping...")

file_path = os.path.expanduser(path + "popular_music_suprise_format.txt")
# specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read the data from file
music_data = Dataset.load_from_file(file_path, reader=reader)
# build the trainset on which song-song similarities will be computed
print("Building the trainset...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}

User-based collaborative filtering

Main idea: find users whose tastes are close to the current user's. For the NetEase Cloud Music playlist data, the "users" here are playlists (and the "items" are songs).

print("开始训练模型...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNBaseline()

algo.fit(trainset)

current_playlist = list(name_id_dic.keys())[39]
print("歌单名称", current_playlist)

# 取出近邻
# 映射名字到id
playlist_id = name_id_dic[current_playlist]
print("歌单id", playlist_id)
# 取出来对应的内部user id => to_inner_uid
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("内部id", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)

# 把歌曲id转成歌曲名字
# to_raw_uid映射回去
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
                       for inner_id in playlist_neighbors)
playlist_neighbors = (id_name_dic[playlist_id]
                       for playlist_id in playlist_neighbors)

print()
print("和歌单 《", current_playlist, "》 最接近的10个歌单为:\n")
for playlist in playlist_neighbors:
    print(playlist, algo.trainset.to_inner_uid(name_id_dic[playlist]))

Rating prediction with collaborative filtering

import pickle
# rebuild the song id => song name mapping
song_id_name_dic = pickle.load(open(path + "popular_song.pkl", "rb"))
print("Loaded the song id => song name mapping...")
# rebuild the song name => song id mapping
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("Loaded the song name => song id mapping...")
# the user (playlist) whose inner id is 4
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x: x[0], user_rating)
for song in items:
    # predict() expects raw ids, so convert the inner ids back first
    print(algo.predict(algo.trainset.to_raw_uid(user_inner_id),
                       algo.trainset.to_raw_iid(song), r_ui=1),
          song_id_name_dic[algo.trainset.to_raw_iid(song)])
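For completeness, the commented-out sim_options earlier hint at an item-based variant: with user_based=False, KNNBaseline computes song-song similarities, so get_neighbors returns similar songs instead of similar playlists. A sketch under that assumption (the inner id 4 is an arbitrary example):

# item-based variant: similarities between songs rather than playlists
sim_options = {'user_based': False}
algo_items = KNNBaseline(sim_options=sim_options)
algo_items.fit(trainset)

song_inner_id = 4  # an arbitrary inner item id, for illustration
song_neighbors = algo_items.get_neighbors(song_inner_id, k=10)
for inner_id in song_neighbors:
    raw_id = algo_items.trainset.to_raw_iid(inner_id)
    print(song_id_name_dic[raw_id])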

Rating prediction with matrix factorization

## using NMF
from surprise import NMF
from surprise import Dataset

file_path = os.path.expanduser(path + 'popular_music_suprise_format.txt')
# specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read the data from file
music_data = Dataset.load_from_file(file_path, reader=reader)
# build the trainset and fit the model
algo = NMF()
trainset = music_data.build_full_trainset()
algo.fit(trainset)
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x:x[0], user_rating)
for song in items:
    print(algo.predict(algo.trainset.to_raw_uid(user_inner_id),
                       algo.trainset.to_raw_iid(song), r_ui=1),
          song_id_name_dic[algo.trainset.to_raw_iid(song)])

Saving and loading the model

import surprise
surprise.dump.dump('./model/recommendation.model', algo=algo)
# reload later as follows; dump.load returns a (predictions, algo) tuple
_, algo = surprise.dump.load('./model/recommendation.model')
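A quick sanity check with the reloaded model; the two raw ids below are placeholders from the earlier sketches, so substitute ids that actually exist in your data:

# predict how well a playlist (user) matches a song (item)
print(algo.predict('78874301', '185811', r_ui=1))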

Evaluating different recommendation algorithms

import os
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
# path to the data file
file_path = os.path.expanduser(path + 'popular_music_suprise_format.txt')
# specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# read the data from file
music_data = Dataset.load_from_file(file_path, reader=reader)

Using BaselineOnly

from surprise import BaselineOnly
algo = BaselineOnly()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Using basic collaborative filtering (KNNBasic)

from surprise import KNNBasic
algo = KNNBasic()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Using mean-centered collaborative filtering (KNNWithMeans)

from surprise import KNNWithMeans
algo = KNNWithMeans()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Using collaborative filtering with baseline estimates (KNNBaseline)

from surprise import KNNBaseline
algo = KNNBaseline()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Using SVD

from surprise import SVD
algo = SVD()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Using SVD++

from surprise import SVDpp
algo = SVDpp()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Using NMF

from surprise import NMF
algo = NMF()
result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
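The seven blocks above differ only in the algorithm, so they can be collapsed into a single loop; a compact sketch of the same comparison:

from surprise import BaselineOnly, KNNBasic, KNNWithMeans, KNNBaseline, SVD, SVDpp, NMF
from surprise.model_selection import cross_validate

# run the same 5-fold evaluation for every algorithm and report mean RMSE
for algo in [BaselineOnly(), KNNBasic(), KNNWithMeans(), KNNBaseline(),
             SVD(), SVDpp(), NMF()]:
    result = cross_validate(algo, music_data, measures=['RMSE', 'MAE'],
                            cv=5, verbose=False)
    print(algo.__class__.__name__, result['test_rmse'].mean())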