mapreduce python编程实例
1 - mapreduce使用python WordCount实例
1.1 - mapper函数使用
vi mapper.py
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper.py
"""Word-count mapper for Hadoop Streaming.

Reads text lines from standard input and writes one ``word<TAB>1`` record
per word to standard output.
"""
import sys


def map_words(lines):
    """Yield a ``(word, 1)`` pair for every word found in *lines*.

    *lines* is any iterable of strings. Words are the whitespace-separated
    tokens of each line; blank lines produce nothing.
    """
    for line in lines:
        # split() with no argument splits on runs of whitespace and ignores
        # leading/trailing whitespace, so no separate strip() is needed.
        for word in line.split():
            yield word, 1


def main():
    """Emit a tab-separated ``word 1`` record for each word on stdin."""
    for word, count in map_words(sys.stdin):
        # Parenthesised single-argument print works on Python 2 and 3.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    main()
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux" |python /hadoop/hadoop-2.6.0/python/mapper.py
foo 1
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
1.2 - reduce函数使用
vim reduce.py
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: reduce.py
"""Word-count reducer for Hadoop Streaming.

Reads ``word<TAB>count`` records from standard input, sums the counts per
word, and writes ``word<TAB>total`` records sorted by word.
"""
from operator import itemgetter  # key function used to sort by word
import sys


def count_words(lines):
    """Return a list of ``(word, total)`` pairs sorted by word.

    *lines* is any iterable of ``word<TAB>count`` strings. Records that have
    no tab separator or whose count is not an integer are skipped.
    """
    word2count = {}
    for line in lines:
        line = line.strip()
        try:
            # Both the unpacking (no tab in the line) and int() (count is
            # not a number) raise ValueError; either way the record is
            # malformed and is ignored.
            word, count = line.split('\t', 1)
            count = int(count)
        except ValueError:
            continue
        # get(word, 0) returns the running total, or 0 for a new word.
        word2count[word] = word2count.get(word, 0) + count
    # Sort by the first item of each (word, total) pair, i.e. the word.
    return sorted(word2count.items(), key=itemgetter(0))


def main():
    """Write the per-word totals for the records on stdin."""
    for word, count in count_words(sys.stdin):
        # Parenthesised single-argument print works on Python 2 and 3.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    main()
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux"|python mapper.py|python reduce.py
bar 1
foo 3
labs 1
quux 2
1.3 - 在mapreduce执行
拷贝./share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar到hadoop目录
1 - mapreduce使用python WordCount实例
1.1 - mapper函数使用
vi mapper.py
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: mapper.py
"""Word-count mapper for Hadoop Streaming.

Reads text lines from standard input and writes one ``word<TAB>1`` record
per word to standard output.
"""
import sys


def map_words(lines):
    """Yield a ``(word, 1)`` pair for every word found in *lines*.

    *lines* is any iterable of strings. Words are the whitespace-separated
    tokens of each line; blank lines produce nothing.
    """
    for line in lines:
        # split() with no argument splits on runs of whitespace and ignores
        # leading/trailing whitespace, so no separate strip() is needed.
        for word in line.split():
            yield word, 1


def main():
    """Emit a tab-separated ``word 1`` record for each word on stdin."""
    for word, count in map_words(sys.stdin):
        # Parenthesised single-argument print works on Python 2 and 3.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    main()
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux" |python /hadoop/hadoop-2.6.0/python/mapper.py
foo 1
foo 1
quux 1
labs 1
foo 1
bar 1
quux 1
1.2 - reduce函数使用
vim reduce.py
#!/usr/bin/python
# _*_ coding:utf-8 _*_
# Filename: reduce.py
"""Word-count reducer for Hadoop Streaming.

Reads ``word<TAB>count`` records from standard input, sums the counts per
word, and writes ``word<TAB>total`` records sorted by word.
"""
from operator import itemgetter  # key function used to sort by word
import sys


def count_words(lines):
    """Return a list of ``(word, total)`` pairs sorted by word.

    *lines* is any iterable of ``word<TAB>count`` strings. Records that have
    no tab separator or whose count is not an integer are skipped.
    """
    word2count = {}
    for line in lines:
        line = line.strip()
        try:
            # Both the unpacking (no tab in the line) and int() (count is
            # not a number) raise ValueError; either way the record is
            # malformed and is ignored.
            word, count = line.split('\t', 1)
            count = int(count)
        except ValueError:
            continue
        # get(word, 0) returns the running total, or 0 for a new word.
        word2count[word] = word2count.get(word, 0) + count
    # Sort by the first item of each (word, total) pair, i.e. the word.
    return sorted(word2count.items(), key=itemgetter(0))


def main():
    """Write the per-word totals for the records on stdin."""
    for word, count in count_words(sys.stdin):
        # Parenthesised single-argument print works on Python 2 and 3.
        print('%s\t%s' % (word, count))


if __name__ == '__main__':
    main()
[root@lsn-linux python]# echo "foo foo quux labs foo bar quux"|python mapper.py|python reduce.py
bar 1
foo 3
labs 1
quux 2
1.3 - 在mapreduce执行
拷贝./share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar到hadoop目录