First, install Hadoop: http://www.jianshu.com/p/3064cc63d507. Note that the JDK must be installed first, and the environment variables for both Java and Hadoop need to be set up properly.
Once Hadoop is installed, you can program with Hadoop Streaming. First write the mapper and reducer scripts, then point the job at the appropriate input and output paths. Because Hadoop Streaming is language-agnostic, the scripts have to read raw strings from standard input and convert them into the data structures they need themselves, which is somewhat inconvenient.
Input and output: the input is a 550 MB ratings file, and the output is a list of movie titles with their average scores, sorted from high to low. The sorting can be done with the Linux sort command (| sort -k… -n -r, with the field arguments adjusted to your own output; a concrete invocation is shown after the hadoop command below): http://roclinux.cn/?p=1350
Running the following command generates the output file we need:
./hadoop jar /usr/local/tools/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar -input /home/peng/Projects/Hadoop/ratings.csv -mapper /home/peng/Projects/Hadoop/mapper.py -reducer /home/peng/Projects/Hadoop/reducer.py -output /home/peng/Projects/Hadoop/output
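After the job completes, the reducer output (a part-00000 file under the output directory when running this way; the exact file name may differ in your setup) can be ranked with the Linux sort command mentioned above. A minimal sketch, assuming the reducer emits tab-separated "title<TAB>average score" lines as in the code below, writing the ranking to an arbitrarily named file:
sort -t$'\t' -k2,2 -nr /home/peng/Projects/Hadoop/output/part-00000 > ranked_movies.txt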
#!/usr/bin/env python3
import sys

rate_count = {}  # movieId -> number of ratings seen by this mapper
rate_score = {}  # movieId -> sum of ratings seen by this mapper

# input comes from STDIN (standard input)
for line in sys.stdin:
    try:
        fields = line.strip().split(',')
        ID, rating = int(fields[1]), float(fields[2])
        if ID in rate_count:
            rate_count[ID] += 1
            rate_score[ID] += rating
        else:
            rate_count[ID] = 1
            rate_score[ID] = rating
    except (IndexError, ValueError):
        # skip the CSV header and malformed lines
        pass

# emit one tab-separated line per movie: movieId, rating sum, rating count
for ID in rate_count:
    print(ID, rate_score[ID], rate_count[ID], sep='\t')
#!/usr/bin/env python3
from itertools import groupby
from operator import itemgetter
from functools import reduce
import csv
import sys

def read_mapper_output(file, separator='\t'):
    # parse each mapper line back into (movieId, rating count, rating sum)
    for line in file:
        fields = line.split(separator)
        ID = int(fields[0])
        rating, num = float(fields[1]), int(fields[2])
        yield (ID, num, rating)

def read_movies():
    # build a movieId -> title lookup from the local movies.csv file
    movie = {}
    with open(r'/home/peng/Projects/Hadoop/movies.csv') as csv_file:
        reader = csv.reader(csv_file)
        for fields in reader:
            try:
                ID, title = int(fields[0]), fields[1]
                movie[ID] = title
            except (IndexError, ValueError):
                # skip the CSV header and malformed lines
                pass
    return movie

def main(separator='\t'):
    data = read_mapper_output(sys.stdin, separator=separator)
    movie = read_movies()
    # Hadoop's shuffle sorts mapper output by key, so all records for the
    # same movie arrive together and can be grouped with groupby
    for ID, group in groupby(data, itemgetter(0)):
        total_count, total_rating = reduce(
            lambda acc, item: (acc[0] + item[1], acc[1] + item[2]), group, (0, 0))
        # tab-separated so the result can be ranked with the Linux sort command
        print('"' + movie[ID] + '"', total_rating / total_count, sep='\t')

if __name__ == '__main__':
    main()
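Before submitting the job, the two scripts can also be tested locally by simulating Hadoop's shuffle with a plain sort on the key. A minimal sanity check, assuming both scripts have been made executable with chmod +x:
cat /home/peng/Projects/Hadoop/ratings.csv | ./mapper.py | sort -t$'\t' -k1,1 | ./reducer.py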
======================================================================================================================
Optimization: JSON can be used to pass the data between the mapper and the reducer, so the reducer no longer has to parse the value fields by hand. In addition, the heapq module can be used to maintain a top-K heap: since we only need the top 100 movies, any record that falls outside a reducer's local top 100 cannot be in the overall top 100 either. With this approach part of the work is done in memory, and the amount of data finally written to the output file may be only a few hundred lines, which greatly reduces the (relatively time-consuming) IO. (The standard library's heapq.nlargest offers the same functionality in a single call; the TOPK class below makes the mechanism explicit.)
#!/usr/bin/env python3
import sys
import json

rate_count = {}  # movieId -> number of ratings seen by this mapper
rate_score = {}  # movieId -> sum of ratings seen by this mapper

# input comes from STDIN (standard input)
for line in sys.stdin:
    try:
        fields = line.strip().split(',')
        ID, rating = int(fields[1]), float(fields[2])
        if ID in rate_count:
            rate_count[ID] += 1
            rate_score[ID] += rating
        else:
            rate_count[ID] = 1
            rate_score[ID] = rating
    except (IndexError, ValueError):
        # skip the CSV header and malformed lines
        pass

# emit movieId and a JSON-encoded (rating sum, rating count) pair,
# so the reducer can json.loads the value instead of splitting fields
for ID in rate_count:
    tmp = json.dumps((rate_score[ID], rate_count[ID]))
    print(ID, tmp, sep='\t')
#!/usr/bin/env python3
from itertools import groupby
from operator import itemgetter
from functools import reduce
import heapq
import csv
import sys
import json

def read_mapper_output(file, separator='\t'):
    # parse each mapper line into (movieId, [rating sum, rating count])
    for line in file:
        fields = line.split(separator)
        ID, item = int(fields[0]), json.loads(fields[1])
        yield (ID, item)

def read_movies():
    # build a movieId -> title lookup from the local movies.csv file
    movie = {}
    with open(r'/home/peng/Projects/Hadoop/movies.csv') as csv_file:
        reader = csv.reader(csv_file)
        for fields in reader:
            try:
                ID, title = int(fields[0]), fields[1]
                movie[ID] = title
            except (IndexError, ValueError):
                # skip the CSV header and malformed lines
                pass
    return movie

class TOPK():
    """A fixed-size min-heap that keeps only the K largest items."""

    def __init__(self, qsize):
        super(TOPK, self).__init__()
        self.size = qsize
        self.data = []

    def put(self, item):
        if len(self.data) < self.size:
            # heap not full yet, just add the item
            heapq.heappush(self.data, item)
        else:
            # the smallest of the current top K sits at the heap root;
            # replace it only if the new item is larger
            smallest = self.data[0]
            if smallest < item:
                heapq.heapreplace(self.data, item)

    def get_data(self):
        return self.data

def main(separator='\t'):
    data = read_mapper_output(sys.stdin, separator=separator)
    movie = read_movies()
    comparator = TOPK(100)
    # records arrive grouped by movieId thanks to Hadoop's shuffle sort
    for ID, group in groupby(data, itemgetter(0)):
        total_rating, total_count = reduce(
            lambda acc, item: (acc[0] + item[1][0], acc[1] + item[1][1]), group, (0, 0))
        comparator.put((total_rating / total_count, ID))
    # only the local top 100 averages are written out
    for rating_, ID in comparator.get_data():
        print('"' + movie[ID] + '"', '%.2f' % rating_, sep='\t')

if __name__ == '__main__':
    main()