Case 1: Inverted Index (multi-job chaining)
-
Requirements
Given a large amount of text, build a search index over it.
-
Input data
-
File 1
E:\work\test\input\II\a.txt
Remilya Scarlet jiejie Frandre Scarlet meimei Scarlet
-
File 2
E:\work\test\input\II\b.txt
Remilya Scarlet weiyan weiyan weiyan weiyan weiyan weiyan weiyan weiyan weiyan weiyan weiyan weiyan jiejie jiejie jiejie
-
File 3
E:\work\test\input\II\c.txt
Frandre Scarlet meimei meimei meimei meimei Scarlet Scarlet
-
Expected output
-
Each word is the output key; the value is, for every file the word appears in, the file name joined to the word's count in that file, all concatenated together.
Frandre	a.txt---1,c.txt---1,
Remilya	a.txt---1,b.txt---1,
Scarlet	c.txt---3,b.txt---1,a.txt---3,
jiejie	b.txt---3,a.txt---1,
meimei	a.txt---1,c.txt---4,
weiyan	b.txt---12,
-
Implementation
mapper1:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * A slightly modified word count is all that is needed here.
 */
public class Mapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {

    private String fileName;

    private Text k_out = new Text();
    private IntWritable v_out = new IntWritable(1);

    /**
     * Grab the name of the file this split belongs to.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        fileName = split.getPath().getName();
    }

    /**
     * Split the input line and append the file name to each word to form the output key.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] words = value.toString().split(" ");
        for (String word : words) {
            k_out.set(word + "-" + fileName);
            context.write(k_out, v_out);
        }
    }
}
reducer1
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Simply sums the counts for each word-file key.
 */
public class Reducer1 extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable v_out = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        v_out.set(sum);
        context.write(key, v_out);
    }
}
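With the sample files, job1's output (which becomes job2's input) looks like the lines below: each line is word-fileName, then a tab, then the count. This is why the driver later sets "-" as job2's KeyValueTextInputFormat separator, so the word becomes the key and "fileName\tcount" the value:
Frandre-a.txt	1
Frandre-c.txt	1
Remilya-a.txt	1
Remilya-b.txt	1
Scarlet-a.txt	3
Scarlet-b.txt	1
Scarlet-c.txt	3
jiejie-a.txt	1
jiejie-b.txt	3
meimei-a.txt	1
meimei-c.txt	4
weiyan-b.txt	12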
mapper2
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Uses KeyValueTextInputFormat; each input line is split into key and value
 * at the first "-" (configured in the driver).
 */
public class Mapper2 extends Mapper<Text, Text, Text, Text> {

    private Text v_out = new Text();

    /**
     * The inherited identity map would almost do here; we only rewrite the
     * value's separator ("fileName\tcount" -> "fileName---count"), which
     * does not affect the logic.
     */
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // swap the value's separator
        String[] v = value.toString().split("\t");
        v_out.set(v[0] + "---" + v[1]);
        context.write(key, v_out);
    }
}
reducer2
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Concatenates all values of one key, producing the final inverted-index line.
 */
public class Reducer2 extends Reducer<Text, Text, Text, Text> {

    private Text v_out = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text value : values) {
            sb.append(value.toString()).append(",");
        }
        v_out.set(sb.toString());
        context.write(key, v_out);
    }
}
driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Wires the two jobs together in the driver.
 */
public class MyDriver {

    public static void main(String[] args) throws IOException, InterruptedException {
        // input and output paths
        Path input = new Path("e:/work/test/input/II");
        Path output1 = new Path("e:/work/test/output1");
        Path output2 = new Path("e:/work/test/output2");

        // one configuration per job
        Configuration conf1 = new Configuration();
        Configuration conf2 = new Configuration();
        // job2's KeyValueTextInputFormat splits each line at the first "-"
        conf2.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "-");

        // delete output paths left over from earlier runs
        FileSystem fs = FileSystem.get(conf1);
        if (fs.exists(output1)) {
            fs.delete(output1, true);
        }
        if (fs.exists(output2)) {
            fs.delete(output2, true);
        }

        // create both jobs
        Job job1 = Job.getInstance(conf1);
        Job job2 = Job.getInstance(conf2);

        // configure job1
        job1.setMapperClass(Mapper1.class);
        job1.setReducerClass(Reducer1.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job1, input);
        FileOutputFormat.setOutputPath(job1, output1);

        // configure job2
        job2.setMapperClass(Mapper2.class);
        job2.setReducerClass(Reducer2.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        job2.setInputFormatClass(KeyValueTextInputFormat.class);
        FileInputFormat.setInputPaths(job2, output1);
        FileOutputFormat.setOutputPath(job2, output2);

        job1.setJobName("job1");
        job2.setJobName("job2");

        // chain the two jobs together
        JobControl jobControl = new JobControl("jobs");
        // wrap each job in a ControlledJob so their ordering can be declared
        ControlledJob controlledJob1 = new ControlledJob(job1.getConfiguration());
        ControlledJob controlledJob2 = new ControlledJob(job2.getConfiguration());
        // controlledJob2 must run after controlledJob1
        controlledJob2.addDependingJob(controlledJob1);
        jobControl.addJob(controlledJob1);
        jobControl.addJob(controlledJob2);

        // run the JobControl on a daemon thread
        Thread jobControlThread = new Thread(jobControl);
        jobControlThread.setDaemon(true);
        jobControlThread.start();
        // poll until both jobs finish (sleep to avoid a busy-wait)
        while (!jobControl.allFinished()) {
            Thread.sleep(500);
        }
    }
}
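A note on the design: JobControl implements Runnable rather than extending Thread, which is why it is handed to a new Thread here; marking that thread as a daemon means it cannot keep the JVM alive once main falls out of the polling loop.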
Case 2: TopN
-
Requirements
-
The input data:
E:\work\test\input\top10\top10inputfile.txt
13470253144	180	180	360
13509468723	7335	110349	117684
13560439638	918	4938	5856
13568436656	3597	25635	29232
13590439668	1116	954	2070
13630577991	6960	690	7650
13682846555	1938	2910	4848
13729199489	240	0	240
13736230513	2481	24681	27162
13768778790	120	120	240
13846544121	264	0	264
13956435636	132	1512	1644
13966251146	240	0	240
13975057813	11058	48243	59301
13992314666	3008	3720	6728
15043685818	3659	3538	7197
15910133277	3156	2936	6092
15959002129	1938	180	2118
18271575951	1527	2106	3633
18390173782	9531	2412	11943
84188413	4116	1432	5548
The columns are, in order: phone number, upstream traffic, downstream traffic, and total traffic.
Find the top 10 users by total traffic.
-
Sort in descending order (via the FlowBean key), then use a count variable in the reducer so that only the first ten records are written. Note this relies on the job's default single reduce task, so that one reducer sees all records in sorted order.
-
Code
FlowBean (generated setters and getters omitted; see the sketch after the class):
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * FlowBean implements WritableComparable so that it can be sorted during the shuffle.
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private long upFlow;
    private long downFlow;
    private long sumFlow;

    /**
     * compareTo defines a descending order on total traffic.
     */
    public int compareTo(FlowBean bean) {
        if (sumFlow > bean.getSumFlow()) {
            return -1;
        } else if (sumFlow == bean.getSumFlow()) {
            return 0;
        } else {
            return 1;
        }
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    public void readFields(DataInput in) throws IOException {
        upFlow = in.readLong();
        downFlow = in.readLong();
        sumFlow = in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }
}
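The omitted accessors are the standard generated ones; for completeness (TopMapper below calls the setters, and compareTo uses getSumFlow), they would be:

public long getUpFlow() { return upFlow; }
public void setUpFlow(long upFlow) { this.upFlow = upFlow; }
public long getDownFlow() { return downFlow; }
public void setDownFlow(long downFlow) { this.downFlow = downFlow; }
public long getSumFlow() { return sumFlow; }
public void setSumFlow(long sumFlow) { this.sumFlow = sumFlow; }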
mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Emits the sortable FlowBean as the key and the phone number as the value.
 */
public class TopMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

    private FlowBean k_out = new FlowBean();
    private Text v_out = new Text();

    /**
     * Packs the traffic fields into a FlowBean and writes the k-v pair.
     * Because FlowBean implements compareTo, the shuffle sorts the keys
     * by total traffic, descending.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] line = value.toString().split("\t");
        v_out.set(line[0]);
        k_out.setUpFlow(Long.parseLong(line[1]));
        k_out.setDownFlow(Long.parseLong(line[2]));
        k_out.setSumFlow(Long.parseLong(line[3]));
        context.write(k_out, v_out);
    }
}
reducer
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Keeps a count field in the reducer and stops writing once it reaches 10.
 * To handle ties at the cutoff, adjust where count is incremented relative
 * to the loop and the if-check.
 */
public class TopReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

    private int count;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        count = 0;
    }

    @Override
    protected void reduce(FlowBean key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // iterate over all values, since phone numbers with the same
        // total traffic are grouped under one key
        for (Text value : values) {
            if (count < 10) {
                context.write(value, key);
                count++;
            }
        }
    }
}
Driver setup omitted in the original (a sketch follows).
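A minimal driver sketch, assuming a local run like the other cases (the class name TopDriver and the output path are placeholders, not from the original):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TopDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setMapperClass(TopMapper.class);
        job.setReducerClass(TopReducer.class);
        // the map output types (FlowBean, Text) differ from the final (Text, FlowBean)
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        FileInputFormat.setInputPaths(job, new Path("e:/work/test/input/top10"));
        FileOutputFormat.setOutputPath(job, new Path("e:/work/test/output_top10"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}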
-
Run result:
13509468723	7335	110349	117684
13975057813	11058	48243	59301
13568436656	3597	25635	29232
13736230513	2481	24681	27162
18390173782	9531	2412	11943
13630577991	6960	690	7650
15043685818	3659	3538	7197
13992314666	3008	3720	6728
15910133277	3156	2936	6092
13560439638	918	4938	5856
Matches expectations; case complete.
Case 3: Computing Common Friends
-
Requirements
-
Given the following friendship data, where the part before the colon is a user and the part after is that user's friends. Friendship here is one-directional.
-
Input data
E:\work\test\input\friends\friends.txt
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
Find the common friends of every pair of users. The expected output:
A-B E C
A-C D F
A-D E F
A-E D B C
A-F O B C D E
A-G F E C D
A-H E C D O
A-I O
A-J O B
A-K D C
A-L F E D
A-M E F
B-C A
B-D A E
B-E C
B-F E A C
B-G C E A
B-H A E C
B-I A
B-K C A
B-L E
B-M E
B-O A
C-D A F
C-E D
C-F D A
C-G D F A
C-H D A
C-I A
C-K A D
C-L D F
C-M F
C-O I A
D-E L
D-F A E
D-G E A F
D-H A E
D-I A
D-K A
D-L E F
D-M F E
D-O A
E-F D M C B
E-G C D
E-H C D
E-J B
E-K C D
E-L D
F-G D C A E
F-H A D O E C
F-I O A
F-J B O
F-K D C A
F-L E D
F-M E
F-O A
G-H D C E A
G-I A
G-K D A C
G-L D F E
G-M E F
G-O A
H-I O A
H-J O
H-K A C D
H-L D E
H-M E
H-O A
I-J O
I-K A
I-O A
K-L D
K-O A
L-M E F
-
Approach
- First split each record and emit friend-user k-v pairs; the first reduce then merges them into friend -> "all users who list that friend" and writes that out.
- Next, in the second map, pair up the users in each value list two by two and emit each pair as the key with the shared friend as the value (see the worked trace after this list).
- Finally, the second reduce merges the data per pair.
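A concrete trace on the sample data: job1's map turns the line A:B,C,D,F,E,O into the pairs (B,A), (C,A), (D,A), (F,A), (E,A), (O,A). After job1's reduce, the line for friend B reads "B	A,E,F,J," because A, E, F and J all list B. Job2's map sorts those users and emits (A-E,B), (A-F,B), (A-J,B), (E-F,B), (E-J,B), (F-J,B); job2's reduce then collects, for every pair, all such shared friends. For example, A-E also receives C and D (via the lines for friends C and D), yielding the common friends B, C, D.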
-
Code
mapper1
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper1 splits each "user:friends" record and
 * emits k-v pairs of the form friend -> user.
 */
public class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {

    private Text k_out = new Text();
    private Text v_out = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // split the line into user and friend list
        String[] line = value.toString().split(":");
        String[] friends = line[1].split(",");
        // the user is the value; each of their friends becomes a key
        v_out.set(line[0]);
        for (String friend : friends) {
            k_out.set(friend);
            context.write(k_out, v_out);
        }
    }
}
reducer1
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer1 merges the pairs produced by Mapper1: for each friend,
 * it concatenates all users who list that friend.
 */
public class Reducer1 extends Reducer<Text, Text, Text, Text> {

    private Text v_out = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // join all users that share this "friend" key
        StringBuilder temp = new StringBuilder();
        for (Text value : values) {
            temp.append(value.toString()).append(",");
        }
        v_out.set(temp.toString());
        context.write(key, v_out);
    }
}
mapper2
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Arrays;

/**
 * Reads job1's output with the default TextInputFormat (note the
 * LongWritable input key): each line is "friend\tusers", so it is
 * split on "\t" here. This step pairs up, two by two, all users
 * that share the friend.
 */
public class Mapper2 extends Mapper<LongWritable, Text, Text, Text> {

    private Text k_out = new Text();
    private Text v_out = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // split the line into friend and user list
        String[] line = value.toString().split("\t");
        // the shared friend is the value
        v_out.set(line[0]);
        String[] users = line[1].split(",");
        // sort the users so a given pair is always emitted in the same order,
        // no matter which record it comes from
        Arrays.sort(users);
        for (int i = 0; i < users.length; i++) {
            for (int j = i + 1; j < users.length; j++) {
                // pair the users two by two
                k_out.set(users[i] + "-" + users[j]);
                context.write(k_out, v_out);
            }
        }
    }
}
reducer2
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Writes the final result: each user pair followed by all of its
 * common friends, emitted as a single key with a null value.
 */
public class Reducer2 extends Reducer<Text, Text, Text, NullWritable> {

    private Text k_out = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder temp = new StringBuilder(key.toString()).append(" ");
        for (Text value : values) {
            temp.append(value.toString()).append(" ");
        }
        k_out.set(temp.toString());
        context.write(k_out, NullWritable.get());
    }
}
driver
Omitted in the original (a sketch follows).
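The driver could reuse the JobControl pattern from Case 1, since this is the same two-job chain. A shorter sketch (the class name FriendsDriver and the output paths are placeholders) that simply runs the jobs back to back:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FriendsDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // job1: friend -> all users who list that friend
        Job job1 = Job.getInstance(conf);
        job1.setMapperClass(Mapper1.class);
        job1.setReducerClass(Reducer1.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job1, new Path("e:/work/test/input/friends"));
        FileOutputFormat.setOutputPath(job1, new Path("e:/work/test/friends_out1"));
        // (pre-existing output dirs would need deleting first, as in Case 1)

        // run job2 only after job1 has finished successfully
        if (!job1.waitForCompletion(true)) System.exit(1);

        // job2: user pair -> common friends
        Job job2 = Job.getInstance(conf);
        job2.setMapperClass(Mapper2.class);
        job2.setReducerClass(Reducer2.class);
        // map output types differ from the final output types
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job2, new Path("e:/work/test/friends_out1"));
        FileOutputFormat.setOutputPath(job2, new Path("e:/work/test/friends_out2"));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}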
-
Run result: matches the expected output listed above. Case complete.