The core idea of MapReduce is this: the map function reads one record and re-keys it, emitting a new key/value pair; the reduce function takes all the values that share one key, aggregates them, and emits a new key/value pair.
While working on a MapReduce job these past couple of days, a question occurred to me: for a single input record, can the map emit multiple records with different keys and values? And is that kind of output actually useful?
Suppose we have the following data, where the first column is a user ID and the second column is a book ID:
1,A
1,B
1,C
2,A
2,B
3,B
3,C
4,A
4,E
5,C
5,E
What we want to produce is, for every (user, book) pair, the other users who also have that book, followed by the other books those users have:
1,A --> 2,4;E,B
1,B --> 3,2;A,C
...
Now, how could we go about this? Two approaches come to mind immediately. First, take the user ID as the key and the book ID as the value; after a MapReduce pass we get:
1 --> A,B,C
2 --> A,B
...
Alternatively, take the book ID as the key and the user ID as the value, which gives:
A --> 1,2,4
B --> 1,2,3
...
Neither of these gets us anywhere near the output we want on its own. So how do we do it? With the method mentioned at the start: have one input record produce multiple key/value pairs (a bare-bones sketch of the pattern follows).
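To see the pattern in isolation first, here is a minimal, self-contained sketch of a mapper whose single map call fans one input line of the form "1 \t A,B,C" out into one pair per book. The class name FanOutMapper is illustrative only and is not part of the jobs below:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative sketch only: one input record fans out into one pair per book.
public class FanOutMapper extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text line, Context context)
            throws IOException, InterruptedException {
        String[] parts = line.toString().split("\t");   // e.g. "1 \t A,B,C"
        if (parts.length != 2) {
            return;
        }
        for (String book : parts[1].split(",")) {
            // the same input line produces as many output records as there are books
            context.write(new Text(book), new Text(parts[0]));
        }
    }
}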
With that in mind, let's walk through the actual code.
The first Mapper:
package org.fansy.data907;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * first map input:   1,A
 *                    1,B
 * first reduce output:  1 \t A,B,C,
 */
public class BookM extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
        String[] values = line.toString().split(",");
        if (values.length != 2) {
            return;
        }
        String userid = values[0];
        String bookid = values[1];
        // emit the user ID as key and the book ID as value
        context.write(new Text(userid), new Text(bookid));
    }
}
The first Reducer:
package org.fansy.data907;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class BookR extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // concatenate all book IDs of one user: 1 -> "A,B,C,"
        StringBuffer sb = new StringBuffer();
        for (Text val : values) {
            sb.append(val.toString());
            sb.append(",");
        }
        context.write(key, new Text(sb.toString()));
    }
}
Running this first job over the sample data gives:
1 A,B,C,
2 A,B,
3 B,C,
4 A,E,
5 C,E,
Next comes the second M/R pass.
The second Mapper:
package org.fansy.data907;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class BookMM extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
        String[] values = line.toString().split("\t");
        if (values.length != 2) {
            return;
        }
        String userid = values[0];
        String booklist = values[1];                 // e.g. "A,B,C,"
        String[] books = booklist.split(",");
        String fulluser = userid + ":" + booklist;   // e.g. "1:A,B,C,"
        for (int i = 0; i < books.length; i++) {
            // emit every book together with the full user info
            context.write(new Text(books[i]), new Text(fulluser));
        }
    }
}
The second Reducer:
package org.fansy.data907;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class BookRR extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // concatenate the full user info of every user who has this book,
        // separated by "|":  A -> "1:A,B,C,|2:A,B,|4:A,E,|"
        StringBuffer sb = new StringBuffer();
        for (Text val : values) {
            sb.append(val.toString());
            sb.append("|");
        }
        context.write(key, new Text(sb.toString()));
    }
}
The second mapper's output looks like this:
A --> 1:A,B,C,
B --> 1:A,B,C,
C --> 1:A,B,C,
A --> 2:A,B,
B --> 2:A,B,
...
Note in particular how one record has produced multiple key/value pairs here. The single record 1 --> A,B,C, generated the three records
A --> 1:A,B,C,
B --> 1:A,B,C,
C --> 1:A,B,C,
This fan-out is what sets up the reduce step, which can now output, for each book ID, the info of every user who has that book:
A 1:A,B,C,|2:A,B,|4:A,E,|
B 1:A,B,C,|2:A,B,|3:B,C,|
C 1:A,B,C,|3:B,C,|5:C,E,|
E 4:A,E,|5:C,E,|
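One detail worth checking: both jobs leave a trailing delimiter behind ("," after each book list, "|" after each user info), yet the next stage splits these strings without guarding against empty elements. That is safe because Java's String.split with no limit argument discards trailing empty strings. A quick stand-alone check (the class name is mine, not part of the jobs):

public class SplitCheck {
    public static void main(String[] args) {
        // the trailing "," produces no empty trailing element
        String[] books = "A,B,C,".split(",");
        System.out.println(books.length);   // 3
        // likewise for the trailing "|"; "|" must be escaped since split takes a regex
        String[] users = "1:A,B,C,|2:A,B,|4:A,E,|".split("\\|");
        System.out.println(users.length);   // 3
        System.out.println(users[1]);       // 2:A,B,
    }
}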
Next comes the third M/R pass.
The third Mapper:
package org.fansy.data907;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class BookMMM extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
        String[] values = line.toString().split("\t");
        if (values.length != 2) {
            return;
        }
        String bookid = values[0];
        String fulluserinfo = values[1];
        String[] fullusersinfo = fulluserinfo.split("\\|");
        // input:  A    1:A,B,C,|2:A,B,|4:A,E,|
        // output: 1,A  A \t 1:A,B,C,|2:A,B,|4:A,E,|
        //         2,A  A \t 1:A,B,C,|2:A,B,|4:A,E,|   (the whole input line is the value)
        for (int i = 0; i < fullusersinfo.length; i++) {
            // one output record per user who has this book
            String[] singleuser = fullusersinfo[i].split(":");
            String newkey = singleuser[0] + "," + bookid;   // e.g. "1,A"
            context.write(new Text(newkey), line);
        }
    }
}
The third Reducer:
package org.fansy.data907;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class BookRRR extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // input:  1,A   A \t 1:A,B,C,|2:A,B,|4:A,E,|
        //         2,A   A \t 1:A,B,C,|2:A,B,|4:A,E,|
        Set<String> allotherusers = new HashSet<String>();
        Set<String> otheruserbooks = new HashSet<String>();
        String[] keystr = key.toString().split(",");
        String useridkey = keystr[0];   // the user ID in the key
        // each key occurs exactly once in the map output, so there is exactly one value
        Iterator<Text> iter = values.iterator();
        String[] tempval = iter.next().toString().split("\t");
        if (tempval.length != 2) {
            return;
        }
        String val = tempval[1];                // val = "1:A,B,C,|2:A,B,|4:A,E,|"
        String[] usersinfo = val.split("\\|");  // usersinfo = {"1:A,B,C,", "2:A,B,", "4:A,E,"}
        for (int i = 0; i < usersinfo.length; i++) {
            String useridvalue = usersinfo[i].split(":")[0];
            String[] books = usersinfo[i].split(":")[1].split(","); // e.g. {"A","B","C"}
            if (!useridkey.equals(useridvalue)) {
                allotherusers.add(useridvalue); // collect every other user of this book
                for (int j = 0; j < books.length; j++) {
                    otheruserbooks.add(books[j]);
                }
            }
        }
        // drop the book named in the key itself from the other users' book set
        otheruserbooks.remove(keystr[1]);
        // assemble the output: the other users, then ":", then their other books
        Iterator<String> useriter = allotherusers.iterator();
        Iterator<String> bookiter = otheruserbooks.iterator();
        StringBuffer sb = new StringBuffer();
        while (useriter.hasNext()) {
            sb.append(useriter.next());
            sb.append(",");
        }
        sb.append(":");
        while (bookiter.hasNext()) {
            sb.append(bookiter.next());
            sb.append(",");
        }
        context.write(key, new Text(sb.toString()));
    }
}
For book A, for example, the third mapper emits (note that the value is the entire input line, book ID, tab and all):
1,A  A  1:A,B,C,|2:A,B,|4:A,E,|
2,A  A  1:A,B,C,|2:A,B,|4:A,E,|
4,A  A  1:A,B,C,|2:A,B,|4:A,E,|
The output of this stage is as follows:
1,A 2,4,:E,B,
1,B 3,2,:A,C,
1,C 3,5,:E,B,
2,A 1,4,:E,B,C,
2,B 3,1,:A,C,
3,B 2,1,:A,C,
3,C 1,5,:E,A,B,
4,A 2,1,:B,C,
4,E 5,:C,
5,C 3,1,:A,B,
5,E 4,:A,
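As a sanity check on this final output, the same result can be computed in a single process without Hadoop. The following stand-alone sketch (class and variable names are mine, not part of the jobs) replays the three stages over the sample data; the element order may differ from the job output, since the jobs use unordered HashSets:

import java.util.*;

// Stand-alone sanity check: replays the three stages in one process.
public class BookLocalCheck {
    public static void main(String[] args) {
        String[] records = {"1,A","1,B","1,C","2,A","2,B","3,B","3,C","4,A","4,E","5,C","5,E"};
        Map<String, Set<String>> userBooks = new TreeMap<>();  // stage 1: user -> books
        Map<String, Set<String>> bookUsers = new TreeMap<>();  // stage 2: book -> users
        for (String r : records) {
            String[] p = r.split(",");
            userBooks.computeIfAbsent(p[0], k -> new TreeSet<>()).add(p[1]);
            bookUsers.computeIfAbsent(p[1], k -> new TreeSet<>()).add(p[0]);
        }
        // stage 3: for each (user, book) pair, the other users of that book
        // and the union of their books, minus the book itself
        for (String r : records) {
            String[] p = r.split(",");
            Set<String> others = new TreeSet<>(bookUsers.get(p[1]));
            others.remove(p[0]);
            Set<String> otherBooks = new TreeSet<>();
            for (String o : others) {
                otherBooks.addAll(userBooks.get(o));
            }
            otherBooks.remove(p[1]);
            System.out.println(p[0] + "," + p[1] + "\t"
                    + String.join(",", others) + ":" + String.join(",", otherBooks));
        }
    }
}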
Note that the reduce phase of the third job does not really need to gather identical keys into one group, because the third map emits every key exactly once; the reducer's only remaining work is to pull its single value apart and assemble the result we want.
The driver code that chains the three jobs together is as follows:
package org.fansy.data907;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Book {
    /**
     * Driver that chains the three jobs for the book problem.
     * first map input:   1,A
     *                    1,B
     * first reduce output:  1 \t A,B,C,
     * @throws IOException
     * @throws ClassNotFoundException
     * @throws InterruptedException
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // job one configuration
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: Book <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "book job one");
        job.setJarByClass(Book.class);
        job.setMapperClass(BookM.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(BookR.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        // intermediate output, consumed by job two below
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/user/fansy/out/out05"));
        if (!job.waitForCompletion(true)) {
            System.exit(1);   // exit if the job fails
        }
        /*
         * job two
         * map output:
         *   A  1:A,B,C,
         *   B  1:A,B,C,
         *   ...
         *   B  2:A,B,
         * reduce output:
         *   A  1:A,B,C,|2:A,B,|4:A,E,|
         *   B  1:A,B,C,|2:A,B,|3:B,C,|
         *   C  1:A,B,C,|3:B,C,|5:C,E,|
         *   E  4:A,E,|5:C,E,|
         */
        Configuration conf2 = new Configuration();
        Job job2 = new Job(conf2, "book job two");
        job2.setJarByClass(Book.class);
        job2.setMapperClass(BookMM.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setReducerClass(BookRR.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job2, new Path("hdfs://localhost:9000/user/fansy/out/out05/part*"));
        FileOutputFormat.setOutputPath(job2, new Path("hdfs://localhost:9000/user/fansy/out/out06"));
        if (!job2.waitForCompletion(true)) {
            System.exit(1);   // exit if the job fails
        }
        /*
         * job three
         * map output:
         *   1,A  A \t 1:A,B,C,|2:A,B,|4:A,E,|
         *   2,A  A \t 1:A,B,C,|2:A,B,|4:A,E,|
         *   4,A  A \t 1:A,B,C,|2:A,B,|4:A,E,|
         *   1,B  B \t 1:A,B,C,|2:A,B,|3:B,C,|
         *   ...
         * reduce: extract, per (user, book) key, the other users and their books
         */
        Configuration conf3 = new Configuration();
        Job job3 = new Job(conf3, "book job three");
        job3.setJarByClass(Book.class);
        job3.setMapperClass(BookMMM.class);
        job3.setMapOutputKeyClass(Text.class);
        job3.setMapOutputValueClass(Text.class);
        job3.setReducerClass(BookRRR.class);
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job3, new Path("hdfs://localhost:9000/user/fansy/out/out06/part*"));
        FileOutputFormat.setOutputPath(job3, new Path(otherArgs[1]));
        System.exit(job3.waitForCompletion(true) ? 0 : 1);
    }
}
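As an aside, instead of calling waitForCompletion on each job in turn, the dependencies between the three jobs could also be expressed with Hadoop's JobControl and ControlledJob classes. A rough, untested fragment, assuming job, job2 and job3 are the Job objects configured in Book.main above:

import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

// sketch only: wrap the configured jobs and declare the chain
ControlledJob cjob1 = new ControlledJob(job, null);
ControlledJob cjob2 = new ControlledJob(job2, null);
ControlledJob cjob3 = new ControlledJob(job3, null);
cjob2.addDependingJob(cjob1);   // job2 starts only after job1 succeeds
cjob3.addDependingJob(cjob2);
JobControl control = new JobControl("book-chain");
control.addJob(cjob1);
control.addJob(cjob2);
control.addJob(cjob3);
new Thread(control).start();    // JobControl implements Runnable
while (!control.allFinished()) {
    Thread.sleep(1000);         // poll until the whole chain is done
}
control.stop();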