join操作
左外连接(map)-JoinMapSideMR(注:下面的示例代码在CompositeInputFormat.compose()中使用的是"inner",实际执行的是内连接)
问题描述:
将两个文件中每行的内容拼接到一个文件中
思路分析:
准备好两个map,firstMapper和joinMapper,firstMapper负责获取文件内容,joinMapper负责拼接文件内容。利用Job开启两个firstMapper任务,获取到两个文件的内容,然后再开启一个joinMapper任务负责拼接获取到的两个文件。
注:map端的连接操作不常用,推荐使用reduce端的连接操作。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | public class JoinMapSideMR extends Configured implements Tool { public static class FirstStepMapper extends Mapper<LongWritable, Text, Text, NullWritable>{ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { if(!value.toString().equals("")) { context.write(value, NullWritable.get()); } } } //读取连接好的数据的mapper public static class JoinMapper extends Mapper<Text, TupleWritable, Text, Text>{ @Override protected void map(Text key, TupleWritable value, Context context) throws IOException, InterruptedException { String v = StreamSupport.stream(value.spliterator(), false).map(s -> ((Text) s).toString()) .collect(Collectors.joining("|")); context.write(key,new Text(v)); } } @Override public int run(String[] strings) throws Exception { Configuration conf = getConf(); Path inpath1 = new Path(conf.get("inpath1")); Path inpath2 = new Path(conf.get("inpath2")); Path mr1 = new Path(conf.get("mr1")); Path mr2 = new Path(conf.get("mr2")); Path outpath = new Path(conf.get("outpath")); //------------------------ Job job1 = Job.getInstance(conf,"first_step1_xj"); job1.setJarByClass(this.getClass()); job1.setMapperClass(FirstStepMapper.class); job1.setMapOutputKeyClass(Text.class); job1.setMapOutputValueClass(NullWritable.class); job1.setReducerClass(Reducer.class); job1.setOutputKeyClass(Text.class); job1.setOutputValueClass(NullWritable.class); TextInputFormat.addInputPath(job1,inpath1); TextOutputFormat.setOutputPath(job1,mr1); FileOutputFormat.setOutputCompressorClass(job1,new GzipCodec().getClass()); //------------------------ Job job2 = Job.getInstance(conf,"first_step2_xj"); job2.setJarByClass(this.getClass()); job2.setMapperClass(FirstStepMapper.class); 
job2.setMapOutputKeyClass(Text.class); job2.setMapOutputValueClass(NullWritable.class); job2.setReducerClass(Reducer.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(NullWritable.class); TextInputFormat.addInputPath(job2,inpath2); TextOutputFormat.setOutputPath(job2,mr2); FileOutputFormat.setOutputCompressorClass(job2,new GzipCodec().getClass()); //------------------------ Job job3 = Job.getInstance(conf,"map_join_xj"); job3.setJarByClass(this.getClass()); job3.setMapperClass(JoinMapper.class); job3.setMapOutputKeyClass(Text.class); job3.setMapOutputValueClass(Text.class); job3.setNumReduceTasks(0); job3.getConfiguration().set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ","); String expr = CompositeInputFormat.compose("inner", KeyValueTextInputFormat.class, mr1, mr2); job3.getConfiguration().set("mapreduce.join.expr",expr); job3.setInputFormatClass(CompositeInputFormat.class); TextOutputFormat.setOutputPath(job3,outpath); List<Job> list = new ArrayList(); list.add(job1); list.add(job2); list.add(job3); for (Job job : list) { boolean succ = job.waitForCompletion(true); if(!succ){ System.out.println(job.getJobName()+":"+ job.getJobState().getValue()); break; } } return 0; } public static void main(String[] args)throws Exception { ToolRunner.run(new JoinMapSideMR(),args); } } |
左外连接(reduce)-JoinReduceSideMR(注:示例中的JoinReducer只有在两个文件中都存在同一id的记录时才会输出,实际效果为内连接)
问题描述:
将两个文件中每行的内容拼接到一个文件中
思路分析:
准备好两个map,FirstMapper和SecondMapper,两个map的key的输出类型都为复合类型,包含id和tag,另外准备两个类自定义分组和分区规则,只根据id来分组和分区。因此,这两个map输出的id相同的记录就会进入到同一个reduce的同一次reduce()调用中(并按tag排序),最后在reduce中完成拼接操作。
复合类型-ArtistIDTag
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | public class ArtistIDTag implements WritableComparable<ArtistIDTag> { private Text ArtistID = new Text(); // id private IntWritable Tag = new IntWritable(); // 标记 public ArtistIDTag() { } public ArtistIDTag(Text artistID, IntWritable tag) { this.ArtistID = new Text(artistID.toString()); this.Tag = new IntWritable(tag.get()); } public Text getArtistID() { return ArtistID; } public void setArtistID(Text artistID) { this.ArtistID = new Text(artistID.toString()); } public IntWritable getTag() { return Tag; } public void setTag(IntWritable tag) { this.Tag = new IntWritable(tag.get()); } @Override public int compareTo(ArtistIDTag o) { return this.ArtistID.compareTo(o.ArtistID)==0 ? this.Tag.compareTo(o.Tag) : this.ArtistID.compareTo(o.ArtistID); } @Override public void write(DataOutput dataOutput) throws IOException { ArtistID.write(dataOutput); Tag.write(dataOutput); } @Override public void readFields(DataInput dataInput) throws IOException { ArtistID.readFields(dataInput); Tag.readFields(dataInput); } } |
自定义分区规则-ArtistPartitioner
1 2 3 4 5 6 | public class ArtistPartitioner extends Partitioner<ArtistIDTag, Text> { @Override public int getPartition(ArtistIDTag artistIDTag, Text text, int i) { return Math.abs(artistIDTag.getArtistID().hashCode()*127)%i; } } |
自定义分组规则-ArtistGroupComparator
1 2 3 4 5 6 7 8 9 10 11 12 | public class ArtistGroupComparator extends WritableComparator{ public ArtistGroupComparator() { super(ArtistIDTag.class,true); } @Override public int compare(WritableComparable a, WritableComparable b) { ArtistIDTag at1 = (ArtistIDTag) a; ArtistIDTag at2 = (ArtistIDTag) b; return at1.getArtistID().compareTo(at2.getArtistID()); } } |
连接-JoinReduceSideMR
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | public class JoinReduceSideMR extends Configured implements Tool { public static void main(String[] args) throws Exception { ToolRunner.run(new JoinReduceSideMR(),args); } public static class FirstMapper extends Mapper<LongWritable,Text,ArtistIDTag,Text>{ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { Stream.of(value.toString()).filter(s -> s.length()>0).forEach(ExceptionConsumer.of(s -> { String id = s.substring(0,s.indexOf(",")); String info = s.substring(s.indexOf(",")+1,s.length()); context.write(new ArtistIDTag(new Text(id),new IntWritable(0)),new Text(info)); })); } } public static class SecondMapper extends Mapper<LongWritable,Text,ArtistIDTag,Text>{ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { Stream.of(value.toString()).filter(s -> s.length()>0).forEach(ExceptionConsumer.of(s -> { String id = s.substring(0,s.indexOf(",")); String info = s.substring(s.indexOf(",")+1,s.length()); context.write(new ArtistIDTag(new Text(id),new IntWritable(1)),new Text(info)); })); } } public static class JoinReducer extends Reducer<ArtistIDTag,Text,Text,Text>{ @Override protected void reduce(ArtistIDTag key, Iterable<Text> values, Context context) throws IOException, InterruptedException { Iterator<Text> ite = values.iterator(); String name = ite.next().toString(); while (ite.hasNext()){ Text count = ite.next(); String info = name.toString() + "|" + count.toString(); context.write(key.getArtistID(),new Text(info)); } } } @Override public int run(String[] strings) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf, "join_reduce_xj"); job.setJarByClass(this.getClass()); // 多任务输入 
MultipleInputs.addInputPath(job,new Path(conf.get("inpath1")),TextInputFormat.class,FirstMapper.class); MultipleInputs.addInputPath(job,new Path(conf.get("inpath2")),TextInputFormat.class,SecondMapper.class); job.setMapOutputKeyClass(ArtistIDTag.class); job.setMapOutputValueClass(Text.class); job.setReducerClass(JoinReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath"))); // 设置分区规则 job.setPartitionerClass(ArtistPartitioner.class); // 设置分组规则 job.setGroupingComparatorClass(ArtistGroupComparator.class); return job.waitForCompletion(true)? 0 : 1; } } |
DB交互操作
读取-DBtoHdfsMR
问题描述:
从mysql数据库中读取数据,并输出到hdfs
思路分析:
- 准备好一个实现了DBWritable接口的复合类型,在该类型中定义的属性分别对应数据库中的列名。
- 将该复合类型作为map阶段输入的value的类型即可。
- 让集群加载jdbc驱动类。
- 设置配置信息,连接到数据库。
- 将输入类型设置为DBInputFormat。
复合类型-YearStationTempDB
注:输入操作要实现WritableComparable接口,这里是读操作可以删除。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | public class YearStationTempDB implements DBWritable,WritableComparable<YearStationTempDB> { private int year; // 年份 private String station; // 气象站编号 private int temperature; // 气温 public YearStationTempDB() { } public YearStationTempDB(int year, String station, int temperature) { this.year = year; this.station = station; this.temperature = temperature; } @Override public void write(PreparedStatement prep) throws SQLException { prep.setInt(1,year); prep.setString(2,station); prep.setInt(3,temperature); } @Override public void readFields(ResultSet rs) throws SQLException { this.year = rs.getInt("year"); this.station = rs.getString("station"); this.temperature = rs.getInt("temperature"); } public int getYear() { return year; } public void setYear(int year) { this.year = year; } public String getStation() { return station; } public void setStation(String station) { this.station = station; } public int getTemperature() { return temperature; } public void setTemperature(int temperature) { this.temperature = temperature; } @Override public String toString() { return year + "," + station +"," + temperature; } @Override public void write(DataOutput dataOutput) throws IOException { dataOutput.writeInt(year); dataOutput.writeUTF(station); dataOutput.writeInt(temperature); } @Override public void readFields(DataInput dataInput) throws IOException { year = dataInput.readInt(); station = dataInput.readUTF(); temperature = dataInput.readInt(); } @Override public int compareTo(YearStationTempDB o) { return this.year - o.year == 0 ? this.station.compareTo(o.station) : this.year - o.year; } } |
读取-DBtoHdfsMR
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | public class DBtoHdfsMR extends Configured implements Tool { public static void main(String[] args) throws Exception { ToolRunner.run(new DBtoHdfsMR(),args); } public static class DBMapper extends Mapper<LongWritable,YearStationTempDB,LongWritable,Text>{ @Override protected void map(LongWritable key, YearStationTempDB value, Context context) throws IOException, InterruptedException { context.write(key,new Text(value.toString())); } } @Override public int run(String[] strings) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf,"bdtohdfs_xj"); job.setJarByClass(this.getClass()); job.setMapperClass(DBMapper.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); // 如何让集群加载jdbc驱动类 // 1 将jar放入share/hadoop/yarn/下会自动上传jar到集群 // 2 把jar放入集群中lib目录下,重启集群 // 3 job.addFileToClassPath(),需要把jar包上传到hdfs //job.addFileToClassPath(new Path("hdfs://172.16.0.4:9000/data/mysql-connector-java-5.1.38.jar")); // 连接数据库 DBConfiguration.configureDB(job.getConfiguration(),"com.mysql.jdbc.Driver", "jdbc:mysql://172.16.0.100:3306/hadoop","hadoop","hadoop"); job.setInputFormatClass(DBInputFormat.class); // 设置为DB输入类型 job.setOutputFormatClass(TextOutputFormat.class); // year = 2000是条件,表示输入year=2000的数据 DBInputFormat.setInput(job,YearStationTempDB.class,"station_tbl","year = 2000","","year","station","temperature"); TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath"))); job.setNumReduceTasks(1); return job.waitForCompletion(true)? 0 : 1; } } |
输入-HdfstoDBMR
问题描述:
从hdfs读取数据,并输出到mysql数据库
思路分析:
- 准备好一个实现了WritableComparable接口的复合类型,在该类型中定义的属性分别对应数据库中的列名,并重写compareTo()方法。
- 将该复合类型作为reduce阶段输出的key的类型即可。
- 让集群加载jdbc驱动类。
- 设置配置信息,连接到数据库。
- 将输出类型设置为DBOutputFormat。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | public class HdfstoDBMR extends Configured implements Tool { public static void main(String[] args) throws Exception { ToolRunner.run(new HdfstoDBMR(),args); } public static class HTDMapper extends Mapper<LongWritable,Text,YearStation,IntWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { NcdcRecordParser parser = new NcdcRecordParser(); parser.parse(()->value.toString()).ifPresent(ExceptionConsumer.of( p->{ int year = p.getYear(); String stationId = p.getStationId(); int temp = p.getAirTemperature(); YearStation ys = new YearStation(year+"",stationId); context.write(ys,new IntWritable(temp)); } )); } } public static class HTDReducer extends Reducer<YearStation,IntWritable,YearStationTempDB,NullWritable>{ @Override protected void reduce(YearStation key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { Stream<IntWritable> stream = StreamSupport.stream(values.spliterator(), false); Integer max = stream.map(s -> s.get()).reduce(0, (x, y) -> Math.max(x, y)); int y = Integer.parseInt(key.getYear().toString()); YearStationTempDB yst = new YearStationTempDB(y,key.getStationid().toString(),max); context.write(yst,NullWritable.get()); } } @Override public int run(String[] strings) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf,"hdfstodb_xj"); job.setJarByClass(this.getClass()); job.setMapperClass(HTDMapper.class); job.setMapOutputKeyClass(YearStation.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(HTDReducer.class); job.setOutputKeyClass(YearStationTempDB.class); job.setOutputValueClass(NullWritable.class); // 如何让集群加载jdbc驱动类 // 1 将jar放入share/hadoop/yarn/下会自动上传jar到集群 // 2 把jar放入集群中lib目录下,重启集群 // 3 
job.addFileToClassPath(),需要把jar包上传到hdfs //job.addFileToClassPath(new Path("hdfs://172.16.0.4:9000/data/mysql-connector-java-5.1.38.jar")); // 连接数据库 DBConfiguration.configureDB(job.getConfiguration(),"com.mysql.jdbc.Driver", "jdbc:mysql://172.16.0.100:3306/hadoop","hadoop","hadoop"); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(DBOutputFormat.class); TextInputFormat.addInputPath(job,new Path(conf.get("inpath"))); DBOutputFormat.setOutput(job,"max_tmp_xj","year","station","temperature"); return job.waitForCompletion(true)? 0 : 1; } } |
输入类型-InputFormat
常见输入类型
-
TextInputFormat:按行获取字符串数据
1
2
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path(conf.get("inpath")));
-
CombineTextInputFormat:将多个小的输入文件合并到同一个输入分片(split)中,避免为每个小文件都开启一个map
1
2
job.setInputFormatClass(CombineTextInputFormat.class);
CombineFileInputFormat.addInputPath(job,new Path(conf.get("inpath")));
-
KeyValueTextInputFormat:按key-value形式获取数据
1
2
3
conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ","); // 设置key-value的分割符,只识别第一个分隔符
job1.setInputFormatClass(KeyValueTextInputFormat.class);
KeyValueTextInputFormat.addInputPath(job1, new Path(conf.get("input")));
-
DBInputFormat:从数据库中获取数据
1
2
3
job.setInputFormatClass(DBInputFormat.class); // 设置为DB输入类型
// year = 2000是条件,表示输入year=2000的数据
DBInputFormat.setInput(job,YearStationTempDB.class,"station_tbl","year = 2000","","year","station","temperature");
自定义输入类型
思路分析:
- 创建一个解析类继承RecordReader
- 在解析类中完成获取数据的逻辑
- 创建一个自定义输入类型的类继承FileInputFormat
- 在自定义输入类型的类中重写方法createRecordReader()
- 在该方法中创建解析类的对象并返回该对象。(注:框架在使用分片之前会自行调用该对象的initialize()方法进行初始化,因此无需在此手动调用。)
- 完成,可在其它类中调用该自定义输入类型
解析类-YearStationRecordReader
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | public class YearStationRecordReader extends RecordReader<YearStation, IntWritable> { private LineRecordReader reader = new LineRecordReader(); private NcdcRecordParser parser = new NcdcRecordParser(); private YearStation ys = new YearStation(); private IntWritable tmp = new IntWritable(); @Override public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException { reader.initialize(inputSplit,taskAttemptContext); } @Override public boolean nextKeyValue() throws IOException { do { // 判断是否有下一个值 if (!reader.nextKeyValue()) { return false; } // 获取并解析当前值 Text line = reader.getCurrentValue(); parser.parse(line.toString()); }while (!parser.isValidTemperature()); // 如果气温返回值为false则继续循环下一个 int year = parser.getYear(); int tmp = parser.getAirTemperature(); String station = parser.getStationId(); ys.setYear(new Text(year+"")); ys.setStationid(new Text(station)); this.tmp.set(tmp); return true; } @Override public YearStation getCurrentKey() throws IOException, InterruptedException { return this.ys; } @Override public IntWritable getCurrentValue() throws IOException, InterruptedException { return this.tmp; } @Override public float getProgress() throws IOException, InterruptedException { return reader.getProgress(); } @Override public void close() throws IOException { reader.close(); } } |
自定义输入类型类-YearStationInputFormat
1 2 3 4 5 6 7 8 9 10 | // 利用FileInputFormat数据分片功能,实现自定义输入类型 public class YearStationInputFormat extends FileInputFormat<YearStation, IntWritable> { @Override public RecordReader createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException { YearStationRecordReader reader = new YearStationRecordReader(); reader.initialize(inputSplit,taskAttemptContext); return reader; } } |
复合类型-YearStation
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | public class YearStation implements WritableComparable<YearStation> { private Text year = new Text(); // 年份 private Text stationid = new Text(); //气象站id public YearStation() { } public YearStation(Text year, Text stationid) { this.year = new Text(year.toString()); this.stationid = new Text(stationid.toString()); } public YearStation(String year, String stationid) { this.year = new Text(year); this.stationid = new Text(stationid); } @Override public int compareTo(YearStation o) { return this.year.compareTo(o.year)==0 ? this.stationid.compareTo(o.stationid) : this.year.compareTo(o.year); } @Override public void write(DataOutput dataOutput) throws IOException { year.write(dataOutput); stationid.write(dataOutput); } @Override public void readFields(DataInput dataInput) throws IOException { year.readFields(dataInput); stationid.readFields(dataInput); } public Text getYear() { return year; } public void setYear(Text year) { this.year = new Text(year.toString()); } public Text getStationid() { return stationid; } public void setStationid(Text stationid) { this.stationid = new Text(stationid.toString()); } @Override public String toString() { return year.toString()+"\t"+stationid.toString(); } } |
测试自定义类型-MaxTmpByYearStationMR
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | public class MaxTmpByYearStationMR extends Configured implements Tool { public static class MTBYSMapper extends Mapper<YearStation, IntWritable, YearStation,IntWritable> { @Override protected void map(YearStation key, IntWritable value, Context context) throws IOException, InterruptedException { context.write(key,value); } } public static class MTBYSReducer extends Reducer<YearStation, IntWritable, Text, IntWritable> { @Override protected void reduce(YearStation key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { Optional<Integer> max = StreamSupport.stream(values.spliterator(), false) .map(e -> e.get()).reduce((x, y) -> Math.max(x, y)); context.write(new Text(key.toString()), new IntWritable(max.get())); } } @Override public int run(String[] strings) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf, "MaxTmpByYS_xj"); job.setJarByClass(this.getClass()); job.setMapperClass(MTBYSMapper.class); job.setMapOutputKeyClass(YearStation.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(MTBYSReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setInputFormatClass(YearStationInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); YearStationInputFormat.addInputPath(job,new Path(conf.get("inpath"))); TextOutputFormat.setOutputPath(job,new Path(conf.get("outpath"))); return job.waitForCompletion(true)? 0 : 1; } public static void main(String[] args) throws Exception{ ToolRunner.run(new MaxTmpByYearStationMR(),args); } } |
JobControl
简述:
如果MapReduce中需要用到多个job,而且多个job之间需要设置一些依赖关系,比如Job3需要依赖于Job2,Job2依赖于Job1,这就要用到JobControl。
代码实例:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | Job getSDJob = Job.getInstance(conf, "get_sd_job_xj"); getSDJob.setJarByClass(GetSimilarityDegree.class); // 3 为任务装配mapper类 getSDJob.setMapperClass(GetSimilarityDegree.GSDMapper.class); getSDJob.setMapOutputKeyClass(Text.class); getSDJob.setMapOutputValueClass(DoubleWritable.class); // 5 配置数据输入路径 TextInputFormat.addInputPath(getSDJob, new Path("src/train_bin")); // 6 配置结果输出路径 TextOutputFormat.setOutputPath(getSDJob, new Path("src/name_sd")); Job sortBySDJob = Job.getInstance(conf, "sortBySDJob"); sortBySDJob.setJarByClass(SortedByDegree.class); // 3 为任务装配mapper类 sortBySDJob.setMapperClass(SortedByDegree.SBDMapper.class); sortBySDJob.setMapOutputKeyClass(TagDegree.class); sortBySDJob.setMapOutputValueClass(NullWritable.class); // 4 为任务装配reducer类 sortBySDJob.setReducerClass(SortedByDegree.SBDReducer.class); sortBySDJob.setOutputKeyClass(Text.class); sortBySDJob.setOutputValueClass(DoubleWritable.class); // 5 配置数据输入路径 TextInputFormat.addInputPath(sortBySDJob, new Path("src/name_sd")); // 6 配置结果输出路径 TextOutputFormat.setOutputPath(sortBySDJob, new Path("src/name_sd_sorted")); Job getFKJob = Job.getInstance(conf, "getFKJob"); getFKJob.setJarByClass(GetFirstK.class); // 3 为任务装配mapper类 getFKJob.setMapperClass(GetFirstK.GFKMapper.class); getFKJob.setMapOutputKeyClass(TagDegree.class); getFKJob.setMapOutputValueClass(IntWritable.class); // 4 为任务装配reducer类 getFKJob.setReducerClass(GetFirstK.GFKReducer.class); getFKJob.setOutputKeyClass(Text.class); getFKJob.setOutputValueClass(Text.class); // 5 配置数据输入路径 TextInputFormat.addInputPath(getFKJob, new Path("src/name_sd_sorted")); // 6 配置结果输出路径 TextOutputFormat.setOutputPath(getFKJob, new Path("src/gfk_res")); getFKJob.setGroupingComparatorClass(GFKGroupComparator.class); Job getLRJob = 
Job.getInstance(conf, "getLRJob"); getLRJob.setJarByClass(GetLastResult.class); // 3 为任务装配mapper类 getLRJob.setMapperClass(GetLastResult.GLRMapper.class); getLRJob.setMapOutputKeyClass(Text.class); getLRJob.setMapOutputValueClass(TagAvgNum.class); // 4 为任务装配reducer类 getLRJob.setReducerClass(GetLastResult.GLRReducer.class); getLRJob.setOutputKeyClass(Text.class); getLRJob.setOutputValueClass(NullWritable.class); // 5 配置数据输入路径 TextInputFormat.addInputPath(getLRJob, new Path("src/gfk_res")); // 6 配置结果输出路径 TextOutputFormat.setOutputPath(getLRJob, new Path("src/last_res")); ControlledJob getSD = new ControlledJob(getSDJob.getConfiguration()); ControlledJob sortBySD = new ControlledJob(sortBySDJob.getConfiguration()); ControlledJob getFK = new ControlledJob(getFKJob.getConfiguration()); ControlledJob getLR = new ControlledJob(getLRJob.getConfiguration()); // 添加依赖 getLR.addDependingJob(getFK); getFK.addDependingJob(sortBySD); sortBySD.addDependingJob(getSD); JobControl con = new JobControl("test"); con.addJob(getSD); con.addJob(sortBySD); con.addJob(getFK); con.addJob(getLR); Thread t = new Thread(con); t.start(); while (true) { if (con.allFinished()) { System.out.println("图片识别完毕,请查看结果"); System.exit(0); } } |
最后更新: 2018年10月08日 18:25
原始链接: https://www.lousenjay.top/2018/09/03/MapReduce入门详解(三)/