import pandas as pd
import threading
from pandas import Series
import time
# Exploratory analysis of a users / ratings / movies dataset stored as
# '::'-delimited text files in the current working directory.
#
# The script loads the three tables, joins them into one frame, and derives
# per-title statistics: mean rating split by gender, the male/female rating
# gap, and the standard deviation of ratings. The total wall-clock time of
# the run is printed at the end.

start = time.perf_counter()

# --- Load the raw tables -----------------------------------------------------
# The files carry no header row, so column names are supplied explicitly.
# A multi-character separator such as '::' requires the pure-Python parser
# engine.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(
    'users.dat', sep='::', header=None, names=unames, engine='python'
)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(
    'ratings.dat', sep='::', header=None, names=rnames, engine='python'
)

# NOTE(review): 'geners' looks like a misspelling of 'genres'. The label is
# kept unchanged because code outside this section may refer to the column
# by this exact name.
mnames = ['movie_id', 'title', 'geners']
movies = pd.read_table(
    'movies.dat', sep='::', header=None, names=mnames, engine='python'
)

# Preview the first rows of each table to check that loading worked.
print(users.head(5))
print(ratings.head(5))
print(movies.head(5))

# --- Merge everything into a single frame ------------------------------------
# ratings is joined to users on user_id, then to movies on movie_id.
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')
print(data.head(2))

# --- Mean rating per movie, split by gender ----------------------------------
mean_ratings = data.pivot_table(
    values='rating', index='title', columns='gender', aggfunc='mean'
)
print(mean_ratings.head(5))

# --- Number of ratings per title ---------------------------------------------
ratings_by_tittle = data.groupby('title').size()
print(ratings_by_tittle.head(3))

# Keep only titles that received at least 250 ratings.
active_titles = ratings_by_tittle.index[ratings_by_tittle >= 250]
mean_ratings = mean_ratings.loc[active_titles]

# Active titles ordered by mean rating among 'F' users, highest first.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

# --- Rating disagreement between genders -------------------------------------
# 'diff' is the 'M' mean minus the 'F' mean; sorting ascending puts the
# titles with the most negative gap ('F' mean higher than 'M' mean) first.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values(by='diff')

# --- Rating spread per title -------------------------------------------------
# Standard deviation of the ratings grouped by title. This rebinds
# ratings_by_tittle from the per-title counts above to the per-title spread.
ratings_by_tittle = data.groupby('title')['rating'].std()

# Restrict to the active titles selected above.
ratings_by_tittle = ratings_by_tittle.loc[active_titles]

# Ten active titles with the largest rating spread, in descending order.
# The result is bound to a name so that it is not computed and then thrown
# away when the file runs as a script.
top_rating_std = ratings_by_tittle.sort_values(ascending=False).head(10)

elapsed = time.perf_counter() - start
print(elapsed)