OutputFormat
自定义OutputFormat
-
继承FileOutputFormat,重写getRecordWriter
-
自定义RecordWriter,重写write,close方法
-
MyOutputFormat类
package com.bigdata.mapreduce.outputformat; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Progressable; import java.io.IOException; public class MyOutputFormat extends FileOutputFormat<Text, NullWritable> { @Override public org.apache.hadoop.mapreduce.RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { //1. 获取目标文件输出流(两个) FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration()); FSDataOutputStream goodoutput = fileSystem.create(new Path("file:///D:/mapreduce_demo/good_comments/goods.txt")); FSDataOutputStream badoutput = fileSystem.create(new Path("file:///D:/mapreduce_demo/bad_comments/bads.txt")); //2. 将输出流传给MyRecordWriter MyRecordWriter myRecordWriter = new MyRecordWriter(goodoutput, badoutput); return myRecordWriter; } }
-
MyRecordWriter类
package com.bigdata.mapreduce.outputformat; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.jute.Record; import java.io.IOException; public class MyRecordWriter extends RecordWriter<Text, NullWritable> { private FSDataOutputStream goodoutput; private FSDataOutputStream badoutput; public MyRecordWriter(FSDataOutputStream goodoutput, FSDataOutputStream badoutput) { this.goodoutput = goodoutput; this.badoutput = badoutput; } /** * * @param text 行文本内容 * @param nullWritable * @throws IOException * @throws InterruptedException */ @Override public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException { //1. 获取原数据的第9个字段,为评论值 String[] split = text.toString().split("\t"); String numStr = split[9]; //2. 根据字段值,判断评论类型(好评、中评、差评),然后将对应的数据写入不同的文件夹中 if (Integer.parseInt(numStr) <= 1){ //好评或中评 goodoutput.write(text.toString().getBytes()); goodoutput.write("\r\n".getBytes()); }else{ //差评 badoutput.write(text.toString().getBytes()); badoutput.write("\r\b".getBytes()); } } @Override public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { goodoutput.close(); badoutput.close(); } }
-
mapper
package com.bigdata.mapreduce.outputformat; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class outputmapper extends Mapper<LongWritable, Text,Text, NullWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { context.write(value,NullWritable.get()); } }
-
jobmain
package com.bigdata.mapreduce.outputformat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; public class outputmain extends Configured implements Tool { @Override public int run(String[] strings) throws Exception { Job job = Job.getInstance(super.getConf(), "outputmain"); job.setInputFormatClass(TextInputFormat.class); TextInputFormat.addInputPath(job,new Path("file:///")); job.setMapperClass(outputmapper.class); job.setMapOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); //使用自定义outputformat类 job.setOutputFormatClass(MyOutputFormat.class); MyOutputFormat.setOutputPath(job,new Path("file:///D:/mapreduce_demo/out")); boolean flag = job.waitForCompletion(true); return flag?0:1; } public static void main(String[] args) throws Exception { Configuration entries = new Configuration(); int run = ToolRunner.run(entries, new outputmain(), args); System.exit(run); } }
分组
需求:求每个订单消费额的Top1
order001 pat01 222.8
order001 pat05 25.8
order002 pat03 522.8
order002 pat04 112.4
order002 pat05 722.4
order003 pat01 222.8
-
定义一个orderBean,两个成员变量:订单ID和消费额,实现WritableComparable接口,重写compareTo方法,订单ID相同时按金额进行降序排序
package com.bigdata.mapreduce.group; import org.apache.hadoop.io.WritableComparable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; public class orderBean implements WritableComparable<orderBean> { private String pid; private Double price; @Override public int compareTo(orderBean o) { //比较订单ID,ID一致则排序订单金额(降序) int i = this.pid.compareTo(o.pid); if(i == 0){ i = o.price.compareTo(this.price); } return i; } @Override public void write(DataOutput dataOutput) throws IOException { dataOutput.writeUTF(pid); dataOutput.writeDouble(price); } @Override public void readFields(DataInput dataInput) throws IOException { this.pid = dataInput.readUTF(); this.price = dataInput.readDouble(); } public String getPid() { return pid; } public void setPid(String pid) { this.pid = pid; } public Double getPrice() { return price; } public void setPrice(Double price) { this.price = price; } @Override public String toString() { return pid + '\t' + price ; } }
-
定义mapper类,将订单ID和消费额封装为orderBean对象作为K2,V1作为V2
package com.bigdata.mapreduce.group; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; public class orderMapper extends Mapper<LongWritable, Text,orderBean,Text> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //将行文本数据拆分 String[] split = value.toString().split("\t"); //创建orderBean对象,设置订单ID和金额 orderBean orderBean = new orderBean(); orderBean.setPid(split[0]); orderBean.setPrice(Double.valueOf(split[2])); context.write(orderBean,value); } }
-
创建分区类继承Partitioner重写getPartition方法,将相同订单ID的数据分到同一个分区
package com.bigdata.mapreduce.group; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Partitioner; import java.util.HashMap; public class orderPartitions extends Partitioner<orderBean, Text> { // private HashMap<String,Integer> hmap = new HashMap<>(); // private int num = 0; @Override public int getPartition(orderBean orderBean, Text text, int i) { //方法1 // String pid = orderBean.getPid(); // if(!hmap.containsKey(pid)){ // hmap.put(pid,num); // num++; // } // return hmap.get(pid); //方法2 return (orderBean.getPid().hashCode() & 2147483647) % i; } }
-
创建分组类继承WritableComparator类调用父类有参构造,重写compare方法
package com.bigdata.mapreduce.group; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; /** * 1. 继承WriteableComparator * 2. 调用父类的有参构造 * 3. 指定分组规则(重写方法) */ public class MyOrder extends WritableComparator { //创建无参构造调用父类有参构造 public MyOrder() { //传入orderBean的class,第二个参数表示是否创建orderBean的实例 super(orderBean.class,true); } //指定分组规则 @Override public int compare(WritableComparable a, WritableComparable b) { //对形参做强制类型转换 orderBean first = (orderBean)a; orderBean second = (orderBean)b; //指定分组规则,两对象商品ID进行比较相同的分在同一组 return first.getPid().compareTo(second.getPid()); } }
-
Reducer类遍历每组数据,取出排在最前面的一条(即该订单的Top1)数据
package com.bigdata.mapreduce.group; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; import java.io.IOException; public class orderReducer extends Reducer<orderBean, Text,Text, NullWritable> { @Override protected void reduce(orderBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException { int i = 0; for (Text value : values){ context.write(value,NullWritable.get()); i++; if(i >= 1){ break; } } } }
-
jobmain
package com.bigdata.mapreduce.group; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; public class orderjobmain extends Configured implements Tool { @Override public int run(String[] strings) throws Exception { //创建Job对象 Job job = Job.getInstance(super.getConf(), "orderjob"); //配置job对象 //配置输入类和输入路径 job.setInputFormatClass(TextInputFormat.class); TextInputFormat.addInputPath(job,new Path("file:///D:\\mapreduce_demo\\input\\groupdata.txt")); //mapper类、K2,V2类型 job.setMapperClass(orderMapper.class); job.setMapOutputKeyClass(orderBean.class); job.setMapOutputValueClass(Text.class); //分区类 job.setPartitionerClass(orderPartitions.class); // job.setNumReduceTasks(3); 不设置默认将数据放在一个文件中 //分组类 job.setGroupingComparatorClass(MyOrder.class); //reudcer类、K3,V3类型 job.setReducerClass(orderReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); //输出类、输出路径 job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job,new Path("file:///D:\\mapreduce_demo/orderoutput")); //等待任务结束 boolean flag = job.waitForCompletion(true); return flag?0:1; } public static void main(String[] args) throws Exception { Configuration entries = new Configuration(); int run = ToolRunner.run(entries, new orderjobmain(), args); System.exit(run); } }
MapReduce知识点(1)
MapReduce知识点(2)
MapReduce知识点(3)
MapReduce知识点(4)