MapReduce
- Create a map class that extends Mapper and override the map() method
- Create a reduce class that extends Reducer and override the reduce() method
- Create a main class that extends Configured and implements Tool; override the run() method and add a main() method
WordCount Example
- Map class

```java
package com.bigdata.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
Four generic types:
KEYIN:    type of k1
VALUEIN:  type of v1
KEYOUT:   type of k2
VALUEOUT: type of v2
*/
public class wordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // map converts k1,v1 into k2,v2
    /*
    key:     byte offset of the line
    value:   the text of one line
    context: the context object
    */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
        // split the line of text into words
        String[] split = value.toString().split(",");
        // iterate over the words, assembling k2 and v2
        for (String word : split) {
            text.set(word);
            longWritable.set(1);
            // write k2 and v2 to the context
            // context.write(new Text(word), new LongWritable(1));
            context.write(text, longWritable);
        }
    }
}
```
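A quick way to sanity-check the map logic without a Hadoop cluster is to replay the split-and-emit loop on a made-up line (the sample input below is hypothetical):

```java
// Minimal local sketch of the mapper's tokenize-and-emit logic; no Hadoop required.
public class MapperLogicDemo {
    public static void main(String[] args) {
        String line = "hello,world,hello"; // hypothetical input line (k1 would be its byte offset)
        for (String word : line.split(",")) {
            // each iteration corresponds to one context.write(text, longWritable) call
            System.out.println("(" + word + ", 1)");
        }
    }
}
```

Output: (hello, 1), (world, 1), (hello, 1); these are the (k2, v2) pairs handed to the shuffle.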
- Reduce class

```java
package com.bigdata.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
Four generic types:
KEYIN:    type of k2
VALUEIN:  type of v2
KEYOUT:   type of k3
VALUEOUT: type of v3
*/
public class wordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    // converts k2,v2 into k3,v3 and writes k3,v3 to the context
    /*
    Parameters:
    key:     k2
    values:  the grouped v2 values
    context: the context object
    */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // iterate over the grouped values and add them up
        for (LongWritable value : values) {
            count += value.get();
        }
        context.write(key, new LongWritable(count));
    }
}
```
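Between map and reduce, the shuffle groups the v2 values by key, so the reducer for "hello" sees an iterable like [1, 1]. A minimal local sketch of the summation (the sample values are hypothetical):

```java
import java.util.Arrays;
import java.util.List;

// Local sketch of the reduce step: sum the grouped counts for one key.
public class ReduceLogicDemo {
    public static void main(String[] args) {
        List<Long> values = Arrays.asList(1L, 1L); // hypothetical grouped v2 values for key "hello"
        long count = 0;
        for (long value : values) {
            count += value; // same accumulation as wordCountReduce
        }
        System.out.println("(hello, " + count + ")"); // -> (hello, 2), i.e. (k3, v3)
    }
}
```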
- Main class

```java
package com.bigdata.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class jobmain extends Configured implements Tool {
    // this method describes one job
    @Override
    public int run(String[] strings) throws Exception {
        // create a job object
        Job job = Job.getInstance(super.getConf(), "wordcount");
        // ensures Hadoop can locate the jar containing this class at runtime
        job.setJarByClass(jobmain.class);

        // configure the job
        // 1. specify the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\input"));
        // file:/// is local mode
        // hdfs://  is cluster mode

        // 2. specify the map phase
        job.setMapperClass(wordCountMapper.class);
        // set the type of k2
        job.setMapOutputKeyClass(Text.class);
        // set the type of v2
        job.setMapOutputValueClass(LongWritable.class);

        // 3, 4, 5, 6: shuffle, using the defaults

        // 7. reduce phase: specify the reduce class and its data types
        job.setReducerClass(wordCountReduce.class);
        // set the type of k3
        job.setOutputKeyClass(Text.class);
        // set the type of v3
        job.setOutputValueClass(LongWritable.class);

        // 8. set the output format
        job.setOutputFormatClass(TextOutputFormat.class);
        // set the output path
        Path path = new Path("file:///D:\\output");
        TextOutputFormat.setOutputPath(job, path);

        // get a FileSystem
        // FileSystem fileSystem = FileSystem.get(new URI("file:///D:\\output"), new Configuration());
        // // check whether the output directory exists
        // boolean bol = fileSystem.exists(path);
        // if (bol) {
        //     // delete the directory
        //     fileSystem.delete(path, true);
        // }

        // wait for the job to finish
        boolean b1 = job.waitForCompletion(true);
        return b1 ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration entries = new Configuration();
        // launch the job
        int run = ToolRunner.run(entries, new jobmain(), args);
        System.exit(run);
    }
}
```
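The commented-out FileSystem block matters in practice: the job fails if the output directory already exists. A standalone sketch of that cleanup step, assuming the same local-mode output path as above (written with forward slashes to keep the URI valid):

```java
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Deletes the job's output directory if present, so repeated runs don't fail.
public class OutputCleanup {
    public static void main(String[] args) throws Exception {
        Path path = new Path("file:///D:/output"); // mirrors the job's output path
        FileSystem fileSystem = FileSystem.get(new URI("file:///D:/output"), new Configuration());
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true); // true = delete recursively
        }
    }
}
```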
Partitioning
- Create a partition class that extends Partitioner, with two generic types (the K2 and V2 types), and override the getPartition() method; a generic skeleton follows this list
- The getPartition() method defines the partitioning rule and returns the corresponding partition number (int)
- In the main class, specify the partitioner class in step 3 of the job configuration:
  `job.setPartitionerClass(partition.class);`
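As referenced in the list above, the generic shape of a custom partitioner might look like this (class name and key/value types are illustrative; the concrete example follows below):

```java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative skeleton: the two generics are the K2 and V2 types.
public class MyPartitioner extends Partitioner<Text, NullWritable> {
    @Override
    public int getPartition(Text key, NullWritable value, int numReduceTasks) {
        // inspect the key and return a partition number in [0, numReduceTasks)
        return 0;
    }
}
```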
Partitioning Example
Data format:
(The original screenshot of the data format could not be recovered. Judging from the partitioner below, each record is a tab-separated line whose field at index 5 holds a number.)
- Mapper class
  The entire line is used as K2. There is no meaningful value, but something must be emitted, so NullWritable.get() serves as a placeholder for V2. Whichever field you partition on must be contained in K2.

```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class p_mapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // the whole line becomes k2; NullWritable.get() is the v2 placeholder
        context.write(value, NullWritable.get());
    }
}
```
- Partitioner class
  Override the getPartition() method; its parameters are K2 and V2.
  - Define the partitioning rule
  - Return the corresponding partition number

```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/*
Two generic types:
KEY:   type of K2
VALUE: type of V2
Method to override: getPartition
*/
public class p_partitioner extends Partitioner<Text, NullWritable> {
    /*
    1. Define the partitioning rule
    2. Return the corresponding partition number
    Rows whose numeric field is greater than 15 go to one partition; the rest go to the other.
    From the map phase, text is one full line of input.
    */
    @Override
    public int getPartition(Text text, NullWritable nullWritable, int numReduceTasks) {
        // 1. split the line on "\t" and take the field at index 5
        String num = text.toString().split("\t")[5];
        // 2. compare it with 15 and return the partition number
        if (Integer.parseInt(num) > 15) {
            return 1;
        } else {
            return 0;
        }
    }
}
```
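To see the rule in action, here is a local sketch that replays the getPartition logic on two hypothetical tab-separated rows:

```java
// Replays the partitioning rule on made-up rows; the field at index 5 is the 6th column.
public class PartitionRuleDemo {
    public static void main(String[] args) {
        String[] rows = {
            "a\tb\tc\td\te\t22\tf", // index 5 is "22" -> greater than 15 -> partition 1
            "a\tb\tc\td\te\t7\tf"   // index 5 is "7"  -> not greater     -> partition 0
        };
        for (String row : rows) {
            String num = row.split("\t")[5];
            int partition = Integer.parseInt(num) > 15 ? 1 : 0;
            System.out.println(num + " -> partition " + partition);
        }
    }
}
```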
- Reduce class
  No processing is needed; the data is simply passed through.

```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class p_reducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // pass-through: write each key straight to the output
        context.write(key, NullWritable.get());
    }
}
```
- Main class
  Extends Configured and implements Tool; override the run() method and add a main() method. In main(), launch the job with ToolRunner.run(). The run() method has 3 steps:
  - Create the job object
  - Configure the job (8 steps):
    1. Set the input class and input path
    2. Set the Mapper class and its data types (K2, V2)
    3. Partitioning (set the number of ReduceTasks)
    4. Sorting
    5. Combining
    6. Grouping
    7. Set the Reduce class and its data types (K3, V3)
    8. Set the output class and output path
  - Wait for the job to finish
Code
```java
package com.bigdata.mapreduce.partition;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class p_job extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        // 1. create a job object
        /*
        Two parameters:
        1. a Configuration object, obtained from the parent class (the same job must use the same Configuration)
        2. the job name, chosen freely
        */
        Job job = Job.getInstance(super.getConf(), "partitions");

        // 2. configure the job (eight steps)
        // 1. set the input class and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://<host>:<port>/<input dir>"));
        // 2. set the Mapper class and its data types (K2, V2)
        job.setMapperClass(p_mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 3. partitioning
        job.setPartitionerClass(p_partitioner.class);
        // 4. sorting  5. combining  6. grouping (defaults)
        // 7. set the Reduce class and its data types (K3, V3)
        job.setReducerClass(p_reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // set the number of ReduceTasks (defaults to 1 if not set);
        // choose the count to match the number of partitions required
        job.setNumReduceTasks(2);
        // 8. set the output class and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://<host>:<port>/<output dir>"));

        // 3. wait for the job to finish
        boolean flag = job.waitForCompletion(true);
        return flag ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // this Configuration is stored in the Configured parent class
        Configuration entries = new Configuration();
        // launch the job
        /*
        Three parameters:
        1. the Configuration object
        2. a Tool implementation; a new instance of this main class
        3. the arguments; just pass args
        */
        int run = ToolRunner.run(entries, new p_job(), args);
        // the return value is the job's exit status
        System.exit(run);
    }
}
```
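With setNumReduceTasks(2), each partition is handled by its own ReduceTask, so the output directory contains one file per partition: part-r-00000 for partition 0 (field values of 15 or less) and part-r-00001 for partition 1 (field values greater than 15).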