#encoding:utf8
import pandas as pd
'''
数据来源
https://grouplens.org/datasets/movielens/
README是对数据文件的说明
数据量100W
3个文件
users.dat 评分用户
ID 性别 年龄 职业 邮编
1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
6::F::50::9::55117
7::M::35::1::06810
8::M::25::12::11413
9::M::25::17::61614
10::F::35::1::95370
movies.dat 电影数据
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller
ratings.dat 电影评分
用户id 电影id 评分 时间戳
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291
1::1197::3::978302268
1::1287::5::978302039
1::2804::5::978300719
'''
# Load the three MovieLens data files into DataFrames.
# Column names for users.dat.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# Column names for ratings.dat.
rating_names = ['user_id', 'movie_id', 'rating', 'timestamp']
# Column names for movies.dat.
movie_names = ['movie_id', 'title', 'genres']
# Directory that holds users.dat, ratings.dat and movies.dat.
DATA_DIR = 'E:\\myproject\\data_analy\\ml-1m\\ml-1m\\'
# Parser options shared by all three files:
# - the '::' field separator is longer than one character, which only the
#   python parsing engine supports, so request it explicitly instead of
#   relying on pandas' warn-and-fall-back behaviour;
# - the files have no header row;
# - NOTE(review): the ml-1m files are distributed as Latin-1 text (some movie
#   titles contain accented characters), so decoding them as UTF-8 can fail.
#   Confirm the encoding if the files were re-saved locally.
read_opts = dict(sep='::', header=None, engine='python', encoding='latin-1')
users = pd.read_table(DATA_DIR + 'users.dat', names=unames, **read_opts)
ratings = pd.read_table(DATA_DIR + 'ratings.dat', names=rating_names, **read_opts)
movies = pd.read_table(DATA_DIR + 'movies.dat', names=movie_names, **read_opts)
# Row counts of each table.
print(len(users))
print(len(ratings))
print(len(movies))
# Peek at the first rows; the coded values are explained in the data set README.
print(users.head(5))
print(ratings.head(5))
print(movies.head(5))
# Join the three tables into one frame; pandas merges on the shared column names.
data = users.merge(ratings).merge(movies)
print(len(data))
print(data.head(5))
# Show every row belonging to the user whose user_id is 1.
is_first_user = data['user_id'] == 1
print(data[is_first_user])
# Compare how women and men rate each movie.
# Mean rating per title, with one column per gender.
ratings_by_gender = pd.pivot_table(
    data,
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
print(ratings_by_gender.head(5))
# Add a column holding the female mean minus the male mean.
female_mean = ratings_by_gender['F']
male_mean = ratings_by_gender['M']
ratings_by_gender['diff'] = female_mean - male_mean
print(ratings_by_gender.head(5))
# Ascending order puts the titles men rate higher than women first;
# descending order would show the titles women rate higher.
by_diff_ascending = ratings_by_gender.sort_values(by='diff', ascending=True)
print(by_diff_ascending.head(10))
# Popularity: the number of ratings each title received.
ratings_by_title = data.groupby('title').size()
print(ratings_by_title.head(10))
# The ten most-rated (most popular) titles.
top_10_hot = ratings_by_title.sort_values(ascending=False).head(10)
print(top_10_hot)
# Mean rating per title.
# This is computed with groupby so the result is a Series indexed by title.
# pivot_table returns a DataFrame on current pandas, which breaks the
# by-less sort_values calls and the label lookups below.
mean_ratings = data.groupby('title')['rating'].mean()
# The ten highest-rated titles. A title with very few ratings can still rank
# at the top here, so a high mean alone does not make a movie popular.
top_10_score = mean_ratings.sort_values(ascending=False).head(10)
print(top_10_score)
# Mean rating of the ten most-rated titles.
print(mean_ratings.loc[top_10_hot.index])
# Number of ratings behind each of the ten highest-rated titles.
print(ratings_by_title.loc[top_10_score.index])
# Keep only titles with enough ratings to be meaningful (more than 1000).
hot_movies = ratings_by_title[ratings_by_title > 1000]
# Mean rating of those well-rated-enough titles.
hot_movies_rating = mean_ratings.loc[hot_movies.index]
# The ten best-rated titles among the popular ones.
top_10_good_movies = hot_movies_rating.sort_values(ascending=False).head(10)
print(top_10_good_movies)
# Refined gender comparison: the earlier female-minus-male ranking did not
# exclude titles with very few ratings, so restrict it to popular titles.
# The merged frame `data` built earlier in the script is reused as-is.
# Number of ratings per title.
total_score = data.groupby('title').size()
# Mean rating per title, with one column per gender.
hot_ratings_by_gender = data.pivot_table(values='rating', index='title', columns='gender', aggfunc='mean')
# Attach the rating count; assignment aligns on the shared title index.
hot_ratings_by_gender['user_no'] = total_score
# Female mean minus male mean, computed from this frame's own columns.
hot_ratings_by_gender['diff'] = hot_ratings_by_gender['F'] - hot_ratings_by_gender['M']
# Keep only titles with more than 1000 ratings (the count column is 'user_no').
hot_diff = hot_ratings_by_gender[hot_ratings_by_gender['user_no'] > 1000]
# Descending order lists the popular titles that women rate highest relative to men.
top_10_hot_diff = hot_diff.sort_values(by='diff', ascending=False).head(10)
print(top_10_hot_diff)