MapReduce: programming model
Writing an MR job:
Mapper class:
package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * WCMapper: splits each input line into words and emits a (word, 1) pair per word.
 * The input key is the byte offset of the line, the input value is the line text.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);   // emit (word, 1)
        }
    }
}
Reducer class:
package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WCReducer: sums the counts for each word and emits (word, total).
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count += iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
App (driver) class:
package com.mao.hdfs.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class WCApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");       // use the local file system

        // optionally delete the output directory first; the job fails if it already exists
        // if (args.length > 1) {
        //     FileSystem.get(conf).delete(new Path(args[1]), true);
        // }

        Job job = Job.getInstance(conf);

        // set job properties
        job.setJobName("WCApp");                                    // job name
        job.setJarByClass(WCApp.class);                             // class used to locate the job jar
        job.setInputFormatClass(TextInputFormat.class);             // input format
        job.setOutputFormatClass(SequenceFileOutputFormat.class);   // output format

        FileInputFormat.addInputPath(job, new Path(args[0]));       // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));     // output path

        // maximum split size in bytes (a smaller value produces more splits)
        //FileInputFormat.setMaxInputSplitSize(job, 13);
        // minimum split size in bytes
        //FileInputFormat.setMinInputSplitSize(job, 1L);

        job.setPartitionerClass(MyPartitioner.class);   // custom partitioner
        job.setCombinerClass(WCReducer.class);          // combiner class
        job.setMapperClass(WCMapper.class);             // mapper class
        job.setReducerClass(WCReducer.class);           // reducer class
        job.setNumReduceTasks(1);                       // number of reduce tasks

        job.setMapOutputKeyClass(Text.class);           // map output key/value types
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);              // final (reducer) output key/value types
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);
    }
}
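Because the driver uses SequenceFileOutputFormat, the output files are binary SequenceFiles rather than plain text. A minimal sketch for inspecting one locally; the SeqDump class name and the part-r-00000 path are hypothetical and depend on where the job wrote its output:
package com.mao.hdfs.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SeqDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        // hypothetical path: first output file written by WCApp
        Path path = new Path("d:/mr/out/part-r-00000");
        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            IntWritable value = new IntWritable();
            while (reader.next(key, value)) {      // iterate over (word, count) records
                System.out.println(key + "\t" + value);
            }
        }
    }
}
On a cluster, hadoop fs -text <file> also decodes a SequenceFile to readable text.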
Custom partitioner class:
package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // send every key to partition 0 (acceptable here because numReduceTasks is 1)
        return 0;
    }
}
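With more than one reduce task, the partitioner above would still route every key to partition 0 and leave the other reducers idle. A minimal sketch of a hash-based partitioner that spreads keys across all reducers; it mirrors the behavior of Hadoop's default HashPartitioner, and the class name WordCountPartitioner is hypothetical:
package com.mao.hdfs.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordCountPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // mask off the sign bit, then distribute keys evenly by hash
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}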
/*
Combiner: extends Reducer, so any Reducer class can be plugged in as a combiner.
-----------------
A map-side "reducer" that pre-aggregates (pre-reduces) the map output.
1. It reduces network bandwidth by aggregating the data emitted on the map side before the shuffle.
   Not every job can use a combiner: the operation must be commutative and associative (a sum works, an average does not).
*/
Flow of running MR in local mode (LocalJobRunner simulates MR with multiple threads)
-------------------------
1. Create the external Job (mapreduce.Job) and set its configuration.
2. The JobSubmitter writes job.xml, the split metadata, and related files to a temporary (staging) directory.
3. The JobSubmitter submits the job to the LocalJobRunner.
4. The LocalJobRunner converts the external Job into an internal Job.
5. The internal job thread spawns a separate worker thread to execute the job.
6. The job-execution thread computes the map and reduce task information and uses a thread pool to spawn new threads that run the map and reduce tasks.
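A minimal sketch of the driver configuration that forces this local path, assuming Hadoop 2.x property names; these are the relevant lines at the top of a driver such as WCApp:
Configuration conf = new Configuration();
conf.set("mapreduce.framework.name", "local");  // submit through LocalJobRunner instead of YARN
conf.set("fs.defaultFS", "file:///");           // read and write the local file system
Job job = Job.getInstance(conf);
// configure the mapper, reducer and paths as in WCApp, then call job.waitForCompletion(true)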
Running an MR job on the Hadoop cluster
-------------------------
1. Build the jar (generate the jar file on Windows);
   install Maven first (see the build/upload sketch at the end of this section)
2. Copy the jar to the Hadoop cluster
3. Run the hadoop jar command
$>hadoop jar HdfsDemo-1.0-SNAPSHOT.jar com.mm.hdfs.mr.WCApp hdfs://s201/user/centos/wc/data hdfs://s201/user/centos/wc/out
Note:
HdfsDemo-1.0-SNAPSHOT.jar //jar file to run
com.mm.hdfs.mr.WCApp //fully qualified name of the driver (main) class
hdfs://s201/user/centos/wc/data //HDFS input directory
hdfs://s201/user/centos/wc/out //HDFS output directory
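A minimal sketch of the build-and-upload steps behind items 1 and 2, assuming a Maven project on the Windows side; the scp target directory and the sample input file words.txt are hypothetical:
$>mvn clean package                                       # builds target/HdfsDemo-1.0-SNAPSHOT.jar
$>scp target/HdfsDemo-1.0-SNAPSHOT.jar centos@s201:~/     # copy the jar to a cluster node
$>hadoop fs -mkdir -p /user/centos/wc/data                # create the HDFS input directory
$>hadoop fs -put words.txt /user/centos/wc/data           # upload a sample input file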