1. Streaming
Map task:
#!/bin/bash
# Mapper: split each line on spaces, commas, and periods, count words
# locally, and emit "word<TAB>count" pairs.
awk 'BEGIN {
    FS = "[ ,.]+"
    OFS = "\t"
}
{
    for (i = 1; i <= NF; ++i)
        if ($i != "")    # a leading separator can still yield an empty field
            dict[$i] += 1
}
END {
    for (key in dict)
        print key, dict[key]
}'
Reducer task:
#!/bin/bash
# Reducer: sum the partial counts for each word and emit the totals.
awk 'BEGIN {
    OFS = "\t"
}
{
    dict[$1] += $2
}
END {
    for (key in dict)
        print key, dict[key]
}'
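Before submitting anything to the cluster, the pair can be sanity-checked locally by simulating the framework's shuffle with sort (input.txt is a stand-in for any sample text file):

cat input.txt | sh mapper.sh | sort | sh reducer.sh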
Launch script:
#!/bin/bash
# Clear the previous output directory; the job fails if it already exists.
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output

hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/streaming/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output \
    -mapper "sh -x mapper.sh" \
    -reducer "sh -x reducer.sh" \
    -file mapper.sh \
    -file reducer.sh \
    -jobconf mapred.job.name=wordcount \
    -jobconf mapred.map.tasks=5 \
    -jobconf mapred.reduce.tasks=3
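After the job finishes, the result can be read directly from HDFS; with the three reducers configured above, the output is split across three part files:

hadoop fs -cat /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output/part-*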
2. Python
Map task:
#!/usr/bin/python
import sys
import re

# Mapper: split each line on punctuation, then on whitespace,
# and emit "word<TAB>1" for every token.
for line in sys.stdin:
    wordlist = re.split('[;,.?]', line)
    for words in wordlist:
        words = words.strip()
        for item in words.split():
            print "%s\t%s" % (item, 1)
Reducer task:
#!/usr/bin/env python
import sys

# Reducer: streaming delivers input sorted by key, so counts for the
# same word arrive on adjacent lines and a single running total suffices.
current_word = None
current_count = 0
word = None

for line in sys.stdin:
    line = line.strip()
    word, count = line.split('\t', 1)
    try:
        count = int(count)
    except ValueError:
        continue
    if current_word == word:
        current_count += count
    else:
        if current_word:
            print '%s\t%s' % (current_word, current_count)
        current_count = count
        current_word = word

# Flush the final word.
if word == current_word:
    print "%s\t%s" % (current_word, current_count)
Launch script:
#!/bin/bash
# Clear the previous output directory; the job fails if it already exists.
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/python/wordcount/output

hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/python/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/python/wordcount/output \
    -mapper "python mapper.py" \
    -reducer "python reducer.py" \
    -file mapper.py \
    -file reducer.py \
    -jobconf mapred.job.name=wordcount \
    -jobconf mapred.map.tasks=5 \
    -jobconf mapred.reduce.tasks=3
3. Java
Map:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
Reduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
Main:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws Exception {
        String input = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/input";
        String output = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/output";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf); // new Job(conf) is deprecated in Hadoop 2.x
        job.setJobName("test4");
        job.setJarByClass(Main.class);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Word counting is associative and commutative, so the reducer
        // can double as a combiner.
        job.setCombinerClass(MyReduce.class);
        job.setNumReduceTasks(3);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
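Assuming the three classes live in the default package, a minimal build-and-submit sequence might look like the following (the jar name wordcount.jar is illustrative):

javac -classpath "$(hadoop classpath)" MyMap.java MyReduce.java Main.java
jar cvf wordcount.jar *.class
hadoop jar wordcount.jar Main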