前言
在推荐系统实践中,为了保证测试集中的用户都在训练集中出现过,并且有对应的item可供评价,先将原始数据集构建为user->item表,再清洗数据,只保留有过行为的item数量不少于5的user,最后将每个用户的item按4:1划分为训练集和测试集。代码如下:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the ratings table. header=0 discards the file's own header row and
# names= supplies the column names used throughout this script.
df_sample = pd.read_csv("../dataset/goodbook-10k/ratings.csv",names=['userid','itemid','ratings'],header=0)
df = df_sample
# Distinct user ids and item ids present in the data.
userID = list(set(df['userid'].values))
itemID = list(set(df['itemid'].values))
# Keep only users with at least 5 rating rows (the test is >= 5, so a user
# with exactly 5 rows is kept). Rows per user are counted once up front
# instead of re-scanning the whole frame for every user id.
rows_per_user = df['userid'].value_counts().to_dict()
userID = [userid for userid in userID if rows_per_user.get(userid, 0) >= 5]
# Split each retained user's rows into train/test and save both as CSV.
test = []
train = []
for user in userID:
    df1 = df[df['userid'] == user]
    # Randomly sample 80% of this user's rows as the training part.
    trainset = df1.sample(frac=0.8)
    # The rest is the test part: isin() marks rows whose itemid was sampled
    # into trainset, and ~ inverts that mask to keep the remaining rows.
    testset = df1[~df1['itemid'].isin(trainset['itemid'])]
    train.extend(trainset.values.tolist())
    test.extend(testset.values.tolist())
# Column names written to the output files.
name = ['userid','itemid','rating']
train = pd.DataFrame(columns=name, data=train)
test = pd.DataFrame(columns=name, data=test)
# Write with a header row and without the DataFrame index column.
train.to_csv('./rating_train.csv', header=True, index=False)
test.to_csv('./rating_test.csv', header=True, index=False)