单词统计的例子:
hello,world
hello,bigdata
bigdata,is,very,good
代码实现:
package shujia;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Classic MapReduce word count: splits comma-separated lines into words
 * and counts occurrences of each word across the input files.
 *
 * Usage: hadoop jar <jar> shujia.MR_WC_Demo <hdfs-input-path> <hdfs-output-path>
 */
public class MR_WC_Demo {

    /** Phase 1 (map): split one input line into words and emit (word, 1). */
    public static class MyMap extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            // k1 is the byte offset of the line (unused); v1 is the line itself.
            String line = v1.toString();
            // Input lines are comma-separated, e.g. "hello,world".
            String[] words = line.split(",");
            for (String word : words) {
                // BUG FIX: emit each individual word as the key. The original
                // emitted the whole line ("String kstring = string"), which
                // counted lines rather than words.
                context.write(new Text(word), new LongWritable(1L));
            }
            // Shuffle: the framework sorts/groups by key so all counts for
            // the same word arrive together at one reducer.
        }
    }

    /** Phase 2 (reduce): sum the counts for one word; called once per key group. */
    public static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s, Context context)
                throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable count : v2s) {
                sum += count.get();
            }
            // Write the final (word, total) pair to HDFS.
            context.write(k2, new LongWritable(sum));
        }
    }

    /**
     * Job driver: wires mapper, reducer, I/O paths and serialization types.
     * NOTE(review): moved out of MyReduce — the original nested main inside the
     * reducer class, which was almost certainly unintentional (the job name is
     * taken from MR_WC_Demo itself).
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, MR_WC_Demo.class.getSimpleName());
        // Tells Hadoop which jar to ship to the cluster.
        job.setJarByClass(MR_WC_Demo.class);
        // HDFS input path.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setMapperClass(MyMap.class);
        // BUG FIX: the map output key is Text (the word) and the value is
        // LongWritable (the count). The original had these two swapped, which
        // fails at runtime with a key/value type mismatch during the shuffle.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // HDFS output path (must not already exist).
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and block until completion; exit non-zero on failure so
        // calling scripts can detect a failed job.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}