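# Notes from watching Nanjing University's video course 《用Python玩转数据》 ("Data Processing Using Python"). Python is powerful. I added some comments to the code and will go deeper step by step.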
import pandas as pd
import numpy as np
# Dataset: MovieLens 100k
# Download url: https://files.grouplens.org/datasets/movielens/ml-100k.zip
# Column names are supplied explicitly via `names`, so pandas treats the
# first line of each file as data rather than as a header.
unames = ['user id', 'age', 'gender', 'occupation', 'zip code']
rnames = ['user id', 'item id', 'rating', 'timestamp']
# u.user is '|'-separated; u.data is tab-separated.
users = pd.read_csv('ml-100k/u.user', names=unames, sep='|')
ratings = pd.read_csv('ml-100k/u.data', names=rnames, sep='\t')
# Keep only the columns used by the analysis below.
users_df = users[['user id', 'gender']]
ratings_df = ratings[['user id', 'rating']]
# With no explicit key, merge joins on the columns both frames share
# (here only 'user id'), attaching each user's gender to every rating.
rating_df = users_df.merge(ratings_df)
# Way 1 - groupby(): standard deviation of all individual ratings per gender.
# The built-in GroupBy.std() aggregation is used instead of
# apply(pd.Series.std): it yields the same per-gender sample standard
# deviation without routing every group through a generic Python-level apply.
result = rating_df.groupby('gender')['rating'].std()
print(result)
# Way 1 - pivot_table(): the same per-gender standard deviation expressed as
# a pivot table (index = grouping key, values = column to aggregate).
# The aggregation is named by its string alias; passing the callable
# pd.Series.std triggers a deprecation warning on recent pandas releases.
result = pd.pivot_table(rating_df, index=['gender'], values='rating', aggfunc='std')
print(result)
# Way 2 - groupby(): first average each user's ratings, then take the
# standard deviation of those per-user means within each gender.
# Only the 'rating' column is averaged. Applying np.mean to whole group
# frames (which also carry the string 'gender' column) depends on
# version-specific pandas behaviour and breaks on recent releases.
# Double brackets keep df_temp a DataFrame with a 'rating' column,
# indexed by ('user id', 'gender').
df_temp = rating_df.groupby(['user id', 'gender'])[['rating']].mean()
# 'gender' is now an index level, so group on that level.
result = df_temp.groupby(level='gender')['rating'].std()
print(result)
# Way 2 - pivot_table(): per-user mean rating, indexed by (gender, user id).
# 'mean' is pivot_table's default aggregation; it is spelled out for clarity.
gender_table = pd.pivot_table(
    rating_df, index=['gender', 'user id'], values='rating', aggfunc='mean'
)
# query() can filter on the named index level 'gender'.
Female_df = gender_table.query("gender == ['F']")  # rows for female users
Male_df = gender_table.query("gender == ['M']")  # rows for male users
# Select the 'rating' column so std() returns a plain scalar. Calling
# pd.Series.std on the whole DataFrame produced a one-element Series, and
# formatting that with '%f' relied on a deprecated implicit float() conversion.
Female_std = Female_df['rating'].std()
Male_std = Male_df['rating'].std()
print('Gender', '\nF\t%.6f' % Female_std, '\nM\t%.6f' % Male_std)