构建一个音乐推荐系统
- 基于商品相似性的推荐
- 基于SVD矩阵分解的推荐
一. 数据读取
import pandas as pd
import numpy as np
import time
import sqlite3
# Directory that holds the raw data files used throughout this script.
data_home = 'F:/study/'

# Read the tab-separated triplet file. It has no header row, so the three
# column names are supplied explicitly.
triplet_dataset = pd.read_csv(
    filepath_or_buffer=data_home + 'train_triplets.txt',
    sep='\t',
    header=None,
    names=['user', 'song', 'play_count'],
)

# Quick look at what was loaded: dimensions, dtypes/memory, first ten rows.
print(triplet_dataset.shape)
triplet_dataset.info()
triplet_dataset.head(10)
(48373586, 3)
memory usage: 1.1+ GB
对每一个用户,分别统计他的播放总量
# Total play count per user, accumulated in one streaming pass over the raw
# triplet file. Each line is split on tabs; field 0 is used as the user key
# and field 2 is parsed as an integer play count.
output_dict = {}
with open(data_home + 'train_triplets.txt') as f:
    for line in f:
        # A blank line (e.g. a trailing newline at end of file) carries no
        # record; skip it instead of failing on the missing fields.
        if not line.strip():
            continue
        fields = line.split('\t')  # split once, reuse for both fields
        user = fields[0]
        # dict.get supplies 0 the first time a user is seen, so a single
        # assignment covers both the "new user" and "existing user" cases.
        output_dict[user] = output_dict.get(user, 0) + int(fields[2])

# One row per user.
output_list = [{'user': k, 'play_count': v} for k, v in output_dict.items()]

# Naming the columns explicitly keeps the frame well-formed even when no
# records were read; otherwise sort_values would raise KeyError on the
# missing 'play_count' column.
play_count_df = pd.DataFrame(output_list, columns=['user', 'play_count'])
play_count_df = play_count_df.sort_values(by='play_count', ascending=False)

# Persist the result, highest play count first, without the index column.
play_count_df.to_csv(path_or_buf='user_playcount_df.csv', index=False)
对于每一首歌,分别统计它的播放总量
output_dict = {
}
with open(data_home + 'train_triplets.txt') as f:
for line_number, line in enumerate(f):
song = line.split('\t')[1]
play_count = int(line.split('\t')[2])
if song in output_dict:
play_count += output_dict