数据集介绍
待补充
每行记录的字段依次为:user id | item id | rating | timestamp。
每个用户的平均评分
map阶段
#coding=utf-8
import sys
def map_ratings(lines):
    """Yield one "userid<TAB>rating" record per well-formed input line.

    Each input line is split on whitespace; the first field is taken as the
    user id and the third as the rating. Lines with fewer than three fields
    (blank lines, truncated records) are skipped so that a single bad record
    does not raise IndexError and abort the whole streaming task.

    Args:
        lines: an iterable of text lines (for example ``sys.stdin``).

    Yields:
        Strings of the form "userid\trating", without a trailing newline.
    """
    for line in lines:
        # split() with no argument already discards leading/trailing
        # whitespace, so no separate strip() is needed.
        fields = line.split()
        if len(fields) < 3:
            continue
        # userid rating
        yield "%s\t%s" % (fields[0], fields[2])


if __name__ == "__main__":
    # sys.stdout.write behaves the same on Python 2 and Python 3, unlike the
    # Python 2-only print statement.
    for record in map_ratings(sys.stdin):
        sys.stdout.write(record + "\n")
[root@DW1 code]# cat u.data|python rating_mapper.py
...
880 3
716 5
276 1
13 2
12 3
reduce阶段
#coding=utf-8
import sys
curr_userid = None
curr_rating = 0
count = 0
for line in sys.stdin:
userid, rating = line.strip().split('\t')
if not curr_userid:
curr_userid = userid
if userid != curr_userid:
print curr_userid + "\t" + str(curr_rating/count)
curr_rating = 0
count = 0
curr_userid = userid
curr_rating += int(rating)
count +=