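# Note: rating.txt is opened by its bare file name, so it must be in the
# same folder as this script, and the script must be run from that folder.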
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 1
# Load the raw ratings. The file is read as tab-separated text and the
# column names are supplied here, so its first row is treated as data.
data = pd.read_csv(
    'rating.txt',
    sep='\t',
    names=['userld', 'itemld', 'rating', 'timestamp'],
)
def sort_data(index, data=None):
    """Replace the values of one column with dense integer codes, in place.

    Each distinct value in ``data[index]`` is assigned a code 0, 1, 2, ...
    in order of first appearance (the values are not sorted), and the
    column is overwritten with those codes. Missing values, if any, are
    coded as -1 (the ``pandas.factorize`` default).

    Args:
        index: Label of the column to re-encode.
        data: DataFrame to modify in place.

    Returns:
        None; ``data`` is modified in place.

    Raises:
        TypeError: If ``data`` is not supplied.
    """
    if data is None:
        raise TypeError("sort_data() requires the DataFrame to modify as 'data'")
    # factorize numbers the distinct values in order of first appearance.
    # Assigning the whole column at once avoids per-row chained assignment
    # (data[index][x] = ...), which depends on a default 0..n-1 row index
    # and may write to a temporary copy instead of `data`.
    codes, _ = pd.factorize(data[index])
    data[index] = codes
# Re-encode both id columns in place, then save the result as
# tab-separated text without a header row or an index column.
for id_column in ('userld', 'itemld'):
    sort_data(id_column, data)
data.to_csv('ratingNew.txt', sep='\t', header=None, index=False)
# 2
# Reload the re-encoded ratings and split them 80/20 into train/test files.
df = pd.read_csv(
    'ratingNew.txt',
    sep='\t',
    names=['userld', 'itemld', 'rating', 'timestamp'],
)
# Draw 80% of the rows at random, without replacement, as the training set.
df1 = df.sample(frac=0.8, replace=False)
# Rows whose index label was not drawn form the test set.
in_train = df.index.isin(df1.index)
df2 = df[~in_train]
for subset, path in ((df1, 'train.txt'), (df2, 'test.txt')):
    subset.to_csv(path, sep='\t', header=None, index=False)
# 3
# Plot how many times one randomly chosen user gave each rating from 1 to 5.
dat = pd.read_csv(
    'rating.txt',
    sep='\t',
    names=['userld', 'itemld', 'rating', 'timestamp'],
)
if dat.empty:
    raise ValueError('rating.txt contains no ratings to plot')
# Pick the user from the ids that actually occur in the file. A hard-coded
# np.random.randint(0, 942) can draw an id that has no rows (leaving every
# bar at zero) and can never draw an id of 942 or above.
user = np.random.choice(dat['userld'].unique())
# All rows belonging to the chosen user.
user_data = dat[dat['userld'] == user]
score = list(range(1, 6))
# Number of the user's ratings equal to each score; unused scores count as 0.
user_score = [int((user_data['rating'] == x).sum()) for x in score]
# score is the x axis, user_score is the y axis.
picture = plt.bar(score, user_score)
# Write each count just above its bar.
for x, y in zip(score, user_score):
    plt.text(x, y, '%d' % y, ha='center', va='bottom')
plt.show()
# The output of running the program is shown below.