所有的实例都在本地进行,启动使用上篇文章的第三种方式
1.电影评分的平均值(所用文件rating.json)
原始数据:{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
结果显示:1000 3
1002 4
1003 2
1004 2
1005 2
思路:利用JSON转换工具将数据封装为对象,方便去使用,在Map阶段,将movie和rate作为key、value值,在Reduce阶段将movie和平均评分作为key、value值
(1)封装代码块
/**
 * Bean for one movie rating record parsed from rating.json, e.g.
 * {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}.
 * Jackson populates the fields by matching JSON keys to property names.
 */
public class MovieBean {

    private String movie;     // movie id
    private int rate;         // rating value; Jackson coerces the quoted "5" to an int
    private String timeStamp; // epoch-seconds timestamp, kept as a string
    private String uid;       // user id

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public int getRate() {
        return rate;
    }

    public void setRate(int rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return "MovieBean [movie=" + movie + ", rate=" + rate + ", timeStamp=" + timeStamp + ", uid=" + uid + "]";
    }
} // NOTE: this closing brace was missing from the original listing
(2)具体的map、reduce实现
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.map.ObjectMapper;
/**
 * MapReduce job: average rating per movie.
 * Input : one JSON rating object per line (rating.json).
 * Output: movieId TAB integer average rating, as in the sample output.
 */
public class Avg {

    /** Map phase: parse one JSON line into a MovieBean and emit (movieId, rate). */
    public static class MapTask extends Mapper<LongWritable, Text, Text, IntWritable> {
        // ObjectMapper is expensive to construct and reusable; the original
        // created a new one for every input line. One per mapper is enough.
        private final ObjectMapper objectMapper = new ObjectMapper();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            MovieBean bean = objectMapper.readValue(value.toString(), MovieBean.class);
            context.write(new Text(bean.getMovie()), new IntWritable(bean.getRate()));
        }
    }

    /** Reduce phase: for each movie, emit the integer average of its ratings. */
    public static class ReduceTask extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum = 0;   // running total of ratings for this movie
            int count = 0; // number of ratings seen
            for (IntWritable rating : values) {
                sum += rating.get();
                count++;
            }
            // Integer division is intentional: the expected output shows whole numbers.
            // `key` is already a Text; no need to wrap it in a new Text as the original did.
            context.write(key, new IntWritable(sum / count));
        }
    }

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Wire up mapper, reducer and the jar to submit.
            job.setMapperClass(MapTask.class);
            job.setReducerClass(ReduceTask.class);
            job.setJarByClass(Avg.class);
            // Declare key/value types for both phases.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Hadoop refuses to run if the output directory already exists.
            File file = new File("d:\\data\\out\\movie");
            if (file.exists()) {
                FileUtils.deleteDirectory(file);
            }
            // Input file and output directory (local-mode paths).
            FileInputFormat.addInputPath(job, new Path("D:\\data\\in\\movie\\rating.json"));
            FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\movie"));
            // Submit and wait.
            boolean completion = job.waitForCompletion(true);
            System.out.println(completion ? "你很优秀!!!" : "滚去调bug!!");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
2.求两个用户之间的共同好友
原始数据:A:B,C,D,F,E,O
B:A,C,E,K
A和B的共同好友是C和E
结果: 第一个mapreduce:B-C A B-D A
第二个mapreduce:A-B E C A-C D F
思路:1.因为每个用户的共同好友利用mapreduce不好实现,所以反过来求好友的用户
这样就能求出来好友有哪些用户,两两组合起来用户就是用户的共同好友
2.根据第一个结果将结果的Value值两两组合起来就OK
(1)
public class Compile {
public static class MapTask extends Mapper<LongWritable, Text,Text, Text>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] user = value.toString().split(":");//将用户和好友分开
String[] friend = user[1].split(","); //将好友分开
for (String string1 : friend) {
context.write(new Text(string1),new Text(user[0]) );
//key是好友,value是用户
}
}
}
public static class ReduceTask extends Reducer<Text, Text, Text, Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
//利用集合将两个用户进行组合,求出共同好友
List<String> userList = new ArrayList<>();
for (Text string : values) {
userList.add(string.toString());
}
Collections.sort(userList); //需要排序是因为避免A-B/B-Akey值不相同的情况,排完序从前往后遍历就OK
for(int i=0;i<userList.size()-1;i++) {
for(int j=i+1;j<userList.size();j++) {
context.write(new Text(userList.get(i)+"-"+userList.get(j)), key);
}
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(Compile.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(new Path("d:\\data\\out\\friend"))) {
fs.delete(new Path("d:\\data\\out\\friend"),true);
}
FileInputFormat.addInputPath(job, new Path("e:\\data\\friend.txt"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\friend"));
boolean completion = job.waitForCompletion(true);
System.out.println(completion?"成功":"失败");
}
catch(Exception e) {
}
}
(2)将每两个用户的好友求出来,资料是friend
class Compile2 {
public static class MapTask extends Mapper<LongWritable, Text,Text, Text>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String[] split = value.toString().split("\t");//以tab分隔,将两个用户的共同好友传给reduce
context.write(new Text(split[0]), new Text(split[1]));
}
public static class ReduceTask extends Reducer<Text, Text, Text, Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
String friends="";
for (Text text : values) {
friends += text+" ";
}
context.write(new Text(key),new Text(friends));//将共同好友求出
}
}
}
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(Compile2.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(new Path("d:\\data\\out\\friend1"))) {
fs.delete(new Path("d:\\data\\out\\friend1"),true);
}
FileInputFormat.addInputPath(job, new Path("d:\\data\\out\\friend\\part-r-00000"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\friend1"));
boolean completion = job.waitForCompletion(true);
System.out.println(completion?"成功":"失败");
}
catch(Exception e) {
}
}
3.求出每个网站的上行流量、下行流量以及流量总和(自己定义Hadoop的序列化类)资料是DATA
原始数据
15639120688 http://v.baidu.com/movie 3936 12058
13905256439 http://movie.youku.com 10132 538
结果
blog.csdn.net FlowBean [up=239908231, down=238717280, sum=478625511]
image.baidu.com FlowBean [up=118511778, down=117759776, sum=236271554]
思路:将同一个网址作为key,流量作为value,当计算他们的流量总和,因为value涉及三个数据,不好管理,所以封装起来。
(1)
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class FlowBean implements Writable { //实现这个接口去增加hadoop的序列化类型
private long up;
private long down;
private long sum;
public void set(long up, long down) {
this.up = up;
this.down = down;
this.sum = up+down;
}
public long getUp() {
return up;
}
public void setUp(long up) {
this.up = up;
}
public long getDown() {
return down;
}
public void setDown(long down) {
this.down = down;
}
public long getSum() {
return sum;
}
public void setSum(long sum) {
this.sum = sum;
}
@Override
public String toString() {
return “FlowBean [up=” + up + “, down=” + down + “, sum=” + sum + “]”;
}
**@Override //序列化与反序列化
public void readFields(DataInput in) throws IOException {
up = in.readLong();
down = in.readLong();
sum = in.readLong();
}
@Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
out.writeLong(up);
out.writeLong(down);
out.writeLong(sum);
}`**
(2)MapReduce的实现
public class FlowMR {
public static class MapTask extends Mapper<LongWritable, Text, Text, FlowBean> {
public String reg(String url) {//为了实现截取中间的网址
Pattern pattern = Pattern.compile("(\\w+\\.)?(\\w+\\.){1}\\w+");//传入正则表达式
Matcher matcher = pattern.matcher(url); //匹配url
while(matcher.find()){//找到匹配的话生成新的url
String newUrl = matcher.group();
return newUrl;
}
return null;
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, FlowBean>.Context context)
throws IOException, InterruptedException {
try {
String[] split = value.toString().split("\t")[1].split(" ");
long up = Long.parseLong(split[1]);
long down = Long.parseLong(split[2]);
String url = reg(split[0]);
FlowBean fb = new FlowBean();
fb.set(up, down);
context.write(new Text(url), fb);//将每一行的url和计算好的流量传给reduce
} catch (Exception e) {
}
// TODO: handle exception
}
}
public static class ReduceTask extends Reducer<Text, FlowBean, Text, FlowBean> {
@Override
protected void reduce(Text key, Iterable<FlowBean> values,
Reducer<Text, FlowBean, Text, FlowBean>.Context context) throws IOException, InterruptedException {
long up = 0;
long down = 0;
FlowBean fb = new FlowBean();
for (FlowBean flowBean : values) {
up += flowBean.getUp();
down += flowBean.getDown();//计算总的上行和下行流量
}
fb.set(up, down);
context.write(key, fb);
}
}
public static void main(String[] args) {
try {
//System.setProperty("HADOOP_USER_NAME", "SIMPLE");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 设置map和reduce,以及提交的jar
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(FlowMR.class);
// 设置输入输出类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
jo b.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// 判断文件是否存在
File file = new File("d:\\data\\out\\http");
if (file.exists()) {
FileUtils.deleteDirectory(file);
}
// 输入和输出目录
FileInputFormat.addInputPath(job, new Path("E:/data/DATA.txt"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\http"));
// 提交任务
boolean completion = job.waitForCompletion(true);
System.out.println(completion ? "你很优秀!!!" : "滚去调bug!!");
} catch (Exception e) {
e.printStackTrace();
// TODO: handle exception
}
}
}
4.实现,每个关键词后面,显示所有包含关键词的文件的集合(获取每个词的文件名称)资料是index里面
初始数据:hello hello java c vb c#
hi xiaoming
hello honghong
结果数据:第一个MapReduce: am-b.txt 1
c#-a.txt 1
第二个MapReduce:c a.txt 1, b.txt 1
c# a.txt 1, b.txt 1
思路:首先将每一个文件中的词统计出来,以及在每个文件中的个数
第二个MapReduce将相同词的文件合并起来
(1)
public class CreateUndexOne {
// hello hello hadoop --------> hello-a.txt 1
public static class MapTask extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
**FileSplit fileSplit = (FileSplit)context.getInputSplit();
String name = fileSplit.getPath().getName();**//可以获取关键词所在文件的名称
String[] split = value.toString().split(" ");
for (String string : split) {
context.write(new Text(string + "-" +name), new IntWritable(1));//将每个文件的名称和其个数传给Reduce
}
}
}
public static class ReduceTask extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable intWritable : values) { //统计每个关键词的个数
count++;
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) {
try {
//System.setProperty("HADOOP_USER_NAME", "SIMPLE");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 设置map和reduce,以及提交的jar
job.setMapperClass(MapTask.class);
job.setReducerClass(ReduceTask.class);
job.setJarByClass(CreateUndexOne.class);
// 设置输入输出类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 判断文件是否存在
File file = new File("d:\\data\\out\\indexOne");
if (file.exists()) {
FileUtils.deleteDirectory(file);
}
// 输入和输出目录
FileInputFormat.addInputPath(job, new Path("D:\\data\\in\\index"));
FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\indexOne"));
// 提交任务
boolean completion = job.waitForCompletion(true);
System.out.println(completion ? "你很优秀!!!" : "滚去调bug!!");
} catch (Exception e) {
e.printStackTrace();
// TODO: handle exception
}
}
}
(2)
/**
 * Second job of the inverted-index computation.
 * Input: "word-file.txt TAB count" lines produced by the first job.
 * Output: one line per word listing every file (with its count) that
 * contains it, comma-separated.
 */
public class CreateUndexTwo {

    public static class MapTask extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Split "word-file.txt\t1" at the dash: key = word, value = "file.txt\t1".
            String[] parts = value.toString().split("-");
            context.write(new Text(parts[0]), new Text(parts[1]));
        }
    }

    public static class ReduceTask extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Join all file entries with commas; the separator is empty for the
            // first entry so no comma leads or trails the list.
            StringBuilder joined = new StringBuilder();
            String separator = "";
            for (Text entry : values) {
                joined.append(separator).append(entry.toString());
                separator = ",";
            }
            context.write(key, new Text(joined.toString()));
        }
    }

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            // Mapper, reducer and the jar to submit.
            job.setMapperClass(MapTask.class);
            job.setReducerClass(ReduceTask.class);
            job.setJarByClass(CreateUndexTwo.class);
            // Key/value types for both phases.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            // Clear any previous run's output directory first.
            File outDir = new File("d:\\data\\out\\indexTwo\\");
            if (outDir.exists()) {
                FileUtils.deleteDirectory(outDir);
            }
            // Read the first job's output, write the final index.
            FileInputFormat.addInputPath(job, new Path("d:\\data\\out\\indexOne"));
            FileOutputFormat.setOutputPath(job, new Path("d:\\data\\out\\indexTwo\\"));
            boolean ok = job.waitForCompletion(true);
            System.out.println(ok ? "你很优秀!!!" : "滚去调bug!!");
        } catch (Exception e) {
            e.printStackTrace();
            // TODO: handle exception
        }
    }
}
`