hadoop入门之mapreduce终结篇-Mapreduce案例(六)

16 篇文章 0 订阅
1 篇文章 0 订阅

标签(空格分隔): hadoop


简介

本节主要通过几个mapreduce在日常业务中的应用案例进行讲解,用于训练我们编写mr程序的思路。(ps:mr程序的本质是根据规则做数据拆分,之后根据key做好reduce的分组操作)

1 案例

数据存放: 链接:https://pan.baidu.com/s/1nsAcNdWE_glFqyx4AJ-GVg
提取码:lkdr

1.1 使用mr实现 join

1.数据准备:join主要包含两方面的数据,班级和学员信息
班级信息

班级id 班级名称(文件中每行以逗号分隔,如下)
1,1班
2,2班

学员信息
10,yifang,15,1 学员id 学员名称 年龄 所属班级id

2.案例分析
需要将此两种信息汇总到一起,我们如何实现join呢,我们需要控制好key的流转即可,因为reduce本身会根据key做好分组,如果我们控制班级id的key作为1组的话,那么就可以实现学员信息和班级信息的join。
3.code
可去 github mr/join下查看代码

package com.lcy.hadoop.mr.join;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.lcy.hadoop.mr.flowsum.FlowBean;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side join of class records and student records.
 *
 * <p>Created by luo on 2019/6/2.
 *
 * <p>Both inputs are comma-separated text files read from the same input path. The mapper tells
 * them apart by file name (a name containing {@code "class"} is a class file, anything else is
 * treated as a student file) and keys every record by class id, so the reducer receives the class
 * record together with all students of that class.
 */
public class JoinDriver {

    /** Tags each input line as a class or a student record and emits it keyed by class id. */
    static class JoinMapper extends Mapper<LongWritable, Text, IntWritable, JoinBean> {

        /** Reused output key, to avoid allocating one object per record. */
        private final IntWritable outKey = new IntWritable();

        /**
         * Parses one comma-separated line into a {@link JoinBean}.
         *
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line of a class file ({@code classId,className}) or of a student
         *                file ({@code studentId,studentName,age,classId})
         * @param context used to look up the source file name and to emit the record
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if (line.trim().isEmpty()) {
                // A blank line has no fields to parse; skip it instead of failing the task.
                return;
            }
            String[] fields = line.split(",");
            // The input split knows which file it was cut from; the file name decides whether
            // this line is a class record or a student record.
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            JoinBean joinBean;
            int classId;
            if (fileName.contains("class")) {
                // Class record: the student fields get placeholder values.
                classId = Integer.parseInt(fields[0]);
                joinBean = new JoinBean(-1, "", -1, classId, fields[1], true);
            } else {
                // Student record: the class name is filled in later by the reducer.
                classId = Integer.parseInt(fields[3]);
                joinBean = new JoinBean(
                        Integer.parseInt(fields[0]), fields[1], Integer.parseInt(fields[2]),
                        classId, "", false);
            }
            outKey.set(classId);
            context.write(outKey, joinBean);
        }
    }

    /** Joins the records of one class id and writes one output line per student. */
    static class JoinReducer extends Reducer<IntWritable, JoinBean, JoinBean, NullWritable> {

        /**
         * Attaches the class name to every student of the class and emits the students.
         *
         * @param key       the class id shared by all values
         * @param joinBeans the class record (if any) and the student records of that class
         * @param context   used to emit the joined student records
         * @throws IOException if a student record cannot be copied or the output cannot be written
         */
        @Override
        protected void reduce(IntWritable key, Iterable<JoinBean> joinBeans, Context context)
                throws IOException, InterruptedException {
            // A String is immutable, so the class name can be kept without copying the bean.
            String className = null;
            List<JoinBean> students = new ArrayList<>();
            for (JoinBean bean : joinBeans) {
                if (bean.isClassFlas()) {
                    className = bean.getCName();
                } else {
                    // The values are deserialized one by one into the bean handed out by the
                    // iterator, so later records would overwrite earlier ones; keep a copy.
                    JoinBean copy = new JoinBean();
                    try {
                        BeanUtils.copyProperties(copy, bean);
                    } catch (ReflectiveOperationException e) {
                        // Fail loudly: swallowing this would silently drop or corrupt output rows.
                        throw new IOException(
                                "Failed to copy student record for class id " + key.get(), e);
                    }
                    students.add(copy);
                }
            }
            for (JoinBean student : students) {
                student.setCName(className);
                context.write(student, NullWritable.get());
            }
        }
    }

    /**
     * Configures and runs the join job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path (deleted
     *             first if it already exists on the local file system)
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: JoinDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // Nothing has to be set for a purely local run; to submit the job to YARN, set the
        // framework to yarn and the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinDriver.class);

        job.setMapperClass(JoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(JoinBean.class);

        job.setReducerClass(JoinReducer.class);
        // The reducer emits JoinBean keys, so that is the job's output key type.
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        deleteFIle(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }

    /**
     * Deletes a local file or directory (including everything below it) if it exists.
     *
     * @param arg local path to delete
     */
    private static void deleteFIle(String arg) {
        deleteRecursively(new File(arg));
    }

    /** Deletes {@code target}, descending into it first when it is a directory. */
    private static void deleteRecursively(File target) {
        if (!target.exists()) {
            return;
        }
        // listFiles() returns null for a plain file (or on an I/O error): nothing to descend into.
        File[] children = target.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        if (!target.delete()) {
            System.err.println("Could not delete " + target);
        }
    }

}

JoinBean
package com.lcy.hadoop.mr.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import lombok.Data;
import org.apache.hadoop.io.Writable;

/**

  • Created by luo on 2019/6/2.
    */
    @Data
    public class JoinBean implements Writable {
    //学员id 学员名称 年龄 所属班级id
    private int sId;
    private String sName;
    private int sAge;
    private int cId; //班级id
    private String cName;//班级名称
    private boolean isClassFlas;

    public JoinBean(int sId, String sName, int sAge, int cId,String cName,boolean isClassFlas) {
    this.sId = sId;
    this.sName = sName;
    this.sAge = sAge;
    this.cId = cId;
    this.isClassFlas = isClassFlas;
    this.cName = cName;

    }

    public JoinBean() {
    }

    @Override
    public void write(DataOutput output) throws IOException {
    output.writeInt(sId);
    output.writeUTF(sName);
    output.writeInt(sAge);
    output.writeInt(cId);
    output.writeUTF(cName);
    output.writeBoolean(isClassFlas);
    }

    @Override
    public void readFields(DataInput input) throws IOException {
    this.sId = input.readInt();
    this.sName = input.readUTF();
    this.sAge = input.readInt();
    this.cId = input.readInt();
    this.cName = input.readUTF();
    this.isClassFlas = input.readBoolean();
    }

    @Override
    public String toString() {
    return “” + sId + ‘\t’ +
    sName + ‘\t’ +
    sAge + ‘\t’ +
    cId + ‘\t’ +
    cName;
    }
    }

1.2 数据倾斜处理思路之去除reduce(map端join)

1.数据准备:数据还是刚刚那个数据
2.案例分析:仍以上面的join程序为例,假设学生信息非常多,我们会适当调大reduce的数量;但按key做hash分区之后,可能出现某些reducetask要处理的数据特别多、而另一些reducetask处理的数据特别少的情况(即数据倾斜),这种情况该如何解决呢?(ps:这里假设班级信息有限,而学生信息可以无限增长。)思路是:如果能让map端直接完成join,就不再需要reduce,自然也就不用担心reduce端的数据倾斜。maptask启动后会先调用一次setup,之后对每条数据调用map,最后调用cleanup;我们在setup中把班级信息加载到内存,在map中直接关联,就能完成这个针对数据倾斜的方案。

package com.lcy.hadoop.mr.mapjoin;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import com.lcy.hadoop.mr.join.JoinBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-side join of student records with class records, run without any reduce task.
 *
 * <p>Created by luo on 2019/6/2.
 *
 * <p>The class file is registered as a cache file and loaded into memory once per map task, so
 * every student line can be joined inside {@code map} and no shuffle by class id is needed.
 */
public class MapJoinDriver {

    /** Joins each student line with the class name looked up from an in-memory map. */
    static class MapJoinMapper extends Mapper<LongWritable, Text, JoinBean, NullWritable> {

        /** Class id to class name, filled in {@link #setup}. */
        Map<Integer, String> classMap = new HashMap<>();

        /**
         * Loads {@code join_class.txt} ({@code classId,className} per line) into {@link #classMap}.
         *
         * <p>NOTE(review): the file is opened by its bare name, which relies on the cache file
         * registered in {@code main} being available in the task's working directory — confirm
         * for the target runtime.
         *
         * @throws IOException if the file cannot be opened or read
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // try-with-resources closes the reader on every path and, unlike an unconditional
            // close() in finally, cannot hit a null reader when opening the file fails.
            // UTF-8 is given explicitly so the class names are decoded the same way as the Text
            // values of the student file, independent of the platform default charset.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream("join_class.txt"),
                    java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.trim().isEmpty()) {
                        continue; // tolerate blank lines, e.g. at the end of the file
                    }
                    String[] fields = line.split(",");
                    classMap.put(Integer.valueOf(fields[0]), fields[1]);
                }
            }
        }

        /**
         * Emits one joined record per student line; lines of any other file are ignored.
         *
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line, {@code studentId,studentName,age,classId} for student files
         * @param context used to look up the source file name and to emit the record
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input split knows which file it was cut from; only files whose name contains
            // "student" carry student records.
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            if (!fileName.contains("student")) {
                return;
            }
            String line = value.toString();
            if (line.trim().isEmpty()) {
                return; // a blank line has no fields to parse
            }
            String[] fields = line.split(",");
            JoinBean joinBean = new JoinBean(
                    Integer.parseInt(fields[0]), fields[1], Integer.parseInt(fields[2]),
                    Integer.parseInt(fields[3]), "", false);
            // The class name stays null when the class id is not present in the class file.
            joinBean.setCName(classMap.get(joinBean.getCId()));
            context.write(joinBean, NullWritable.get());
        }
    }

    /**
     * Configures and runs the map-only join job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path (deleted
     *             first if it already exists on the local file system)
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        if (args.length < 2) {
            System.err.println("Usage: MapJoinDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // Nothing has to be set for a purely local run; to submit the job to YARN, set the
        // framework to yarn and the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(MapJoinDriver.class);
        // The small class file is registered as a cache file; the mapper loads it in setup().
        job.addCacheFile(new URI("file:/E:/mr/join/input/join_class.txt"));
        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(JoinBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);
        // The join is finished in the map phase, so no reduce task is configured.
        job.setNumReduceTasks(0);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        deleteFIle(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }

    /**
     * Deletes a local file or directory (including everything below it) if it exists.
     *
     * @param arg local path to delete
     */
    private static void deleteFIle(String arg) {
        deleteRecursively(new File(arg));
    }

    /** Deletes {@code target}, descending into it first when it is a directory. */
    private static void deleteRecursively(File target) {
        if (!target.exists()) {
            return;
        }
        // listFiles() returns null for a plain file (or on an I/O error): nothing to descend into.
        File[] children = target.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        if (!target.delete()) {
            System.err.println("Could not delete " + target);
        }
    }

}

1.3 求两两之间的共同好友

1.数据准备:
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

求出哪些人两两之间有共同好友,及他俩的共同好友都是谁
比如:
a-b : c ,e
2.案例分析
这道题的思路有很多,我讲一下我这边的解题思路:1)第一个mr先求出每个人是哪些人的好友,即以某个人为key,输出所有把他当作好友的人;2)再起第二个mr,对这组人先排序(因为a-b:c和b-a:c表示的是同一对人,需要统一成一种写法),然后两两组合,以这两个人为key、对应的共同好友为value输出;最后在reduce端把同一对人的所有共同好友汇总后输出。

3.code (这个案例的代码放在mr/friend里面)
第一步:获取以key为共同好友的所有人

package com.lcy.hadoop.mr.friend;

import java.io.IOException;

import com.lcy.hadoop.mr.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step one of the common-friends computation: inverts the friend lists.
 *
 * <p>Created by luo on 2019/6/2.
 *
 * <p>Input lines look like {@code A:B,C,D} (A has the friends B, C and D). The output has one
 * line per person X listing everybody who has X as a friend, as {@code X<TAB>P1,P2,...}.
 */
public class FriendOneDriver {

    /**
     * For a line {@code A:B,C,D} emits (B, A), (C, A) and (D, A): what we are after is, for every
     * friend, the people who list that friend.
     */
    static class FriendOneMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text fValue = new Text();
        /** Reused output key, to avoid allocating one object per record. */
        private final Text outKey = new Text();

        /**
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line of the form {@code person:friend1,friend2,...}
         * @param context used to emit (friend, person) pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] parts = value.toString().split(":");
            if (parts.length < 2) {
                // Blank line or a person without any friends: nothing to emit.
                return;
            }
            fValue.set(parts[0]);
            // Every friend on the right-hand side is listed by the person on the left-hand side.
            for (String friend : parts[1].split(",")) {
                if (friend.isEmpty()) {
                    continue; // ignore empty entries such as in "A:B,,C"
                }
                outKey.set(friend);
                context.write(outKey, fValue);
            }
        }
    }

    /** Collects, for one person, everybody who lists that person as a friend. */
    static class FriendOneReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Joins the values into one comma-separated list, without a trailing comma.
         *
         * @param key     the person who is the friend
         * @param values  the people who list {@code key} as a friend
         * @param context used to emit (key, comma-separated people)
         * @throws IOException          if the output cannot be written
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            boolean first = true;
            for (Text v : values) {
                if (!first) {
                    joined.append(',');
                }
                joined.append(v.toString());
                first = false;
            }
            context.write(key, new Text(joined.toString()));
        }
    }

    /**
     * Configures and runs step one.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path (removed
     *             beforehand via {@code FileUtils.deleteFile})
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: FriendOneDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // Nothing has to be set for a purely local run; to submit the job to YARN, set the
        // framework to yarn and the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(FriendOneDriver.class);

        job.setMapperClass(FriendOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FriendOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileUtils.deleteFile(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }
}

2.第二步 获取所有的两两拥有的共同好友

package com.lcy.hadoop.mr.friend;

import java.io.IOException;
import java.util.Arrays;

import com.lcy.hadoop.mr.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step two of the common-friends computation: turns "who lists X as a friend" into
 * "which friends do two people have in common".
 *
 * <p>Created by luo on 2019/6/2.
 *
 * <p>Input lines look like {@code X<TAB>P1,P2,...}: X is a friend of P1, P2, .... Every pair of
 * those people therefore has X as a common friend.
 */
public class FriendTwoDriver {

    /**
     * For each common friend, pairs up the people who share that friend and emits
     * (pair, common friend).
     */
    static class FriendTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
        Text fValue = new Text();
        /** Reused output key, to avoid allocating one object per pair. */
        private final Text outKey = new Text();

        /**
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line of the form {@code friend<TAB>person1,person2,...}
         * @param context used to emit ({@code personA--personB}, friend) pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            if (parts.length < 2) {
                // Blank or malformed line: no people to pair up.
                return;
            }
            fValue.set(parts[0]);
            String[] people = parts[1].split(",");
            // Sorting makes the pair key canonical: A--B and B--A become the same key.
            Arrays.sort(people);
            for (int i = 0; i < people.length - 1; i++) {
                for (int j = i + 1; j < people.length; j++) {
                    outKey.set(people[i] + "--" + people[j]);
                    context.write(outKey, fValue);
                }
            }
        }
    }

    /**
     * Collects all common friends of one pair of people. Also used as the combiner: joining
     * already comma-joined partial lists with commas yields the same final list.
     */
    static class FriendTwoReducer extends Reducer<Text, Text, Text, Text> {
        /**
         * Joins the values into one comma-separated list, without a trailing comma.
         *
         * @param key     the pair of people, formatted as {@code personA--personB}
         * @param values  common friends of the pair (or partial comma-separated lists of them)
         * @param context used to emit (pair, comma-separated common friends)
         * @throws IOException          if the output cannot be written
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            boolean first = true;
            for (Text v : values) {
                if (!first) {
                    joined.append(',');
                }
                joined.append(v.toString());
                first = false;
            }
            context.write(key, new Text(joined.toString()));
        }
    }

    /**
     * Configures and runs step two.
     *
     * @param args {@code args[0]} is the input path (the output of step one), {@code args[1]}
     *             the output path (removed beforehand via {@code FileUtils.deleteFile})
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: FriendTwoDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // Nothing has to be set for a purely local run; to submit the job to YARN, set the
        // framework to yarn and the default file system to HDFS instead.
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        Job job = Job.getInstance(conf);
        job.setJarByClass(FriendTwoDriver.class);

        job.setMapperClass(FriendTwoMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FriendTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // The reducer can double as the combiner because it does not change the final result;
        // this reduces the amount of data the reducers have to process.
        job.setCombinerClass(FriendTwoReducer.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileUtils.deleteFile(args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }
}

2 总结

我觉得mr程序主要的难点就在于对key的定位,如果对key的定位好了,统计分析程序用一个到2个的mr就能实现。所以我们在做mr程序的时候一定要考虑好用什么作为key。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值