A Summary of Classic MapReduce Examples
First you need a working Hadoop environment; a single-node setup on Windows is enough for running these examples.
1. Per-phone traffic statistics: for each phone number, compute the upstream traffic, downstream traffic, and total traffic.
The sample data below is stored in a file named flow.log and uploaded to the HDFS directory /flow/input. One mapper plus one reducer completes the statistics. The job uses the default TextInputFormat, so the mapper is called once per input line; it takes the phone number as the key and packs the upstream and downstream traffic into a bean, which it emits as the value. The shuffle then groups and merge-sorts all records with the same phone number, so the reducer only has to add up the traffic of each phone number to obtain that phone's upstream, downstream, and total flow (a small standalone parsing sketch follows the sample data).
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
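To make the field indexing concrete, here is a small standalone sketch (plain Java, no Hadoop required) that applies the same split-and-index logic as the mapper to the two sample records of phone 13560439658 and adds them up the way the reducer will. It assumes the raw log is tab-separated, which is what the mapper's split("\t") expects (the sample lines above appear space-separated here only because of how they are rendered); the expected result is up = 1116 + 918 = 2034, down = 954 + 4938 = 5892, total = 7926.
//A standalone sanity check (not part of the MapReduce job): same parsing logic as FlowMapper below.
public class FlowParseDryRun {
    public static void main(String[] args) {
        //the two sample records of 13560439658, joined with tabs as the mapper expects
        String[] lines = {
                "1363157993055\t13560439658\tC4-17-FE-BA-DE-D9:CMCC\t120.196.100.99\t18\t15\t1116\t954\t200",
                "1363157992093\t13560439658\tC4-17-FE-BA-DE-D9:CMCC\t120.196.100.99\t15\t9\t918\t4938\t200"
        };
        long sumUp = 0, sumDown = 0;
        for (String line : lines) {
            String[] str = line.split("\t");
            //same indexing as FlowMapper: the 3rd- and 2nd-to-last columns are upFlow and downFlow
            sumUp += Long.parseLong(str[str.length - 3]);
            sumDown += Long.parseLong(str[str.length - 2]);
        }
        //prints: 13560439658 up=2034 down=5892 sum=7926
        System.out.println("13560439658 up=" + sumUp + " down=" + sumDown + " sum=" + (sumUp + sumDown));
    }
}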
The full MapReduce code is as follows:
package com.skymesh.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Iterator;
/**
 * Classic exercise: per-phone traffic statistics.
 */
public class FlowMapReduce {
static class FlowMapper extends Mapper<LongWritable, Text,Text,FlowBean> {
Text t = new Text();
FlowBean bean=new FlowBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] str = value.toString().split("\t");
if (str.length>3){
String phone = str[1];
long upFlow = Long.parseLong(str[str.length - 3]);
long downFlow = Long.parseLong(str[str.length - 2]);
t.set(phone);
bean.setUpFlow(upFlow);
bean.setDownFlow(downFlow);
bean.setSumFlow(upFlow + downFlow);
//The Text and FlowBean objects are reused for every input line; this is safe because
//context.write() serializes the current field values immediately, so we avoid allocating
//a new pair of objects per record.
//context.write(new Text(phone),new FlowBean(upFlow, downFlow));
context.write(t,bean);
}
}
}
static class FlowReducer extends Reducer<Text,FlowBean,Text,FlowBean> {
FlowBean bean=new FlowBean();
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
long sumUpFlow=0;
long sumDFlow=0;
Iterator<FlowBean> iterator = values.iterator();
while (iterator.hasNext()){
FlowBean flowBean = iterator.next();
sumUpFlow = sumUpFlow +flowBean.getUpFlow();
sumDFlow = sumDFlow + flowBean.getDownFlow();
}
bean.setUpFlow(sumUpFlow);
bean.setDownFlow(sumDFlow);
//without this, the reused bean would carry a stale (or zero) total into the output
bean.setSumFlow(sumUpFlow + sumDFlow);
context.write(key,bean);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//point the job at the target HDFS / YARN environment
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.136.128:9000");
conf.set("yarn.resourcemanager.hostname", "hadoop1");
Job job = Job.getInstance(conf);
job.setJarByClass(FlowMapReduce.class);
//set the mapper and reducer implementations
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
//declare the map output and final output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//input and output paths on HDFS
FileInputFormat.setInputPaths(job,new Path("/flow/input"));
FileOutputFormat.setOutputPath(job,new Path("/flow/output"));
//submit the job configuration and the jar containing these classes to YARN, then wait for it to finish
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
//---------------------------bean-----------------------//
package com.skymesh.mapreduce;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Writable bean that carries the per-phone upstream, downstream and total flow.
 */
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;
private long downFlow;
private long sumFlow;
//deserialization instantiates the bean via reflection, so an explicit no-arg constructor is required
public FlowBean(){}
public FlowBean(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.downFlow = dFlow;
this.sumFlow = upFlow + dFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getUpFlow() {
return upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
public long getSumFlow() {
return sumFlow;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
//fields must be written in exactly the same order in which readFields() reads them back
dataOutput.writeLong(upFlow);
dataOutput.writeLong(downFlow);
dataOutput.writeLong(sumFlow);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
upFlow = dataInput.readLong();
downFlow = dataInput.readLong();
sumFlow = dataInput.readLong();
}
@Override
public String toString() {
return "FlowBean{" +
"upFlow=" + upFlow +
", downFlow=" + downFlow +
", sumFlow=" + sumFlow +
'}';
}
@Override
public int compareTo(FlowBean o) {
//sort by total flow in descending order (and return 0 for equal totals, as the compareTo contract requires)
return Long.compare(o.getSumFlow(), this.sumFlow);
}
}
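One optional optimization for this job: because FlowReducer is a Reducer&lt;Text,FlowBean,Text,FlowBean&gt; whose input and output types match the map output types, and summing flows is associative, it can also be registered as a combiner so each map task pre-aggregates its per-phone flows locally and the shuffle carries far less data. A sketch of the single extra line in the driver (assuming the main() shown above) would be:
//optional: reuse the reducer as a combiner to pre-aggregate flows on the map side
job.setCombinerClass(FlowReducer.class);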
2. Joining large tables with MapReduce, for data volumes a traditional relational database cannot hold.
Raw data model: name the user table and department table files user*.txt and dept*.txt and upload them to the HDFS directory /table/input.
sys_user
userId userName age deptId
10 zhangsan 18 1000
11 lisi 19 1001
12 wangwu 24 1000
13 lier 23 1001
14 jd 32 1001

sys_dept
deptId deptName layerCode
1000 综合部 001
1001 网络安全 002
The Java code:
package com.skymesh.mapreduce;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * MapReduce reduce-side join of two tables.
 * sys_user
 * userId userName age deptId
 * 10 zhangsan 18 1000
 * 11 lisi 19 1001
 * 12 wangwu 24 1000
 * 13 lier 23 1001
 * 14 jd 32 1001
 *
 * sys_dept
 * deptId deptName layerCode
 * 1000 综合部 001
 * 1001 网络安全 002
 */
public class TableJoinMapReduce {
static class TableJoinMapper extends Mapper<LongWritable, Text,Text,TbaleJoinBean>{
TbaleJoinBean bean = new TbaleJoinBean();
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
//the input directory holds both tables, so use the split's file name to tell them apart
FileSplit fileSplit = (FileSplit) context.getInputSplit();
String fileName = fileSplit.getPath().getName();
//split on whitespace so either tab- or space-separated files work
String[] fields = line.split("\\s+");
//user table record: userId userName age deptId
if (fileName.contains("user")){
text.set(fields[3]);
bean.setUserId(fields[0]);
bean.setUserName(fields[1]);
bean.setAge(Integer.parseInt(fields[2]));
bean.setDeptId(fields[3]);
bean.setDeptName("");
bean.setLayerCode("");
bean.setFlag("user");
context.write(text,bean);
}else {//department table record: deptId deptName layerCode
text.set(fields[0]);
bean.setUserId("");
bean.setUserName("");
bean.setAge(0);
bean.setDeptId(fields[0]);
bean.setDeptName(fields[1]);
bean.setLayerCode(fields[2]);
bean.setFlag("dept");
context.write(text,bean);
}
}
}
}
static class TableJoinReducer extends Reducer<Text, TbaleJoinBean,Text,TbaleJoinBean>{
@Override
protected void reduce(Text key, Iterable<TbaleJoinBean> values, Context context) throws IOException, InterruptedException {
Iterator<TbaleJoinBean> iterator = values.iterator();
TbaleJoinBean deptBean = new TbaleJoinBean();
List<TbaleJoinBean> userList = new ArrayList<TbaleJoinBean>();
while (iterator.hasNext()){
TbaleJoinBean bean = iterator.next();
if ("user".equals(bean.getFlag())){ //用户
TbaleJoinBean userBean = new TbaleJoinBean();
try {
BeanUtils.copyProperties(userBean,bean);
userList.add(userBean);
} catch (Exception e) {
e.printStackTrace();
}
}else { //department record
try {
BeanUtils.copyProperties(deptBean,bean);
} catch (Exception e) {
e.printStackTrace();
}
}
}
for (TbaleJoinBean bean: userList){
bean.setDeptName(deptBean.getDeptName());
bean.setLayerCode(deptBean.getLayerCode());
context.write(key,bean);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.set("fs.default.name", "hdfs://192.168.136.128:9000");
conf.set("yarn.resoucemanager.hostname", "hadoop1");
Job job = Job.getInstance(conf);
job.setMapOutputValueClass(TbaleJoinBean.class);
job.setMapOutputKeyClass(Text.class);
job.setMapperClass(TableJoinMapper.class);
job.setReducerClass(TableJoinReducer.class);
job.setJarByClass(TableJoinMapReduce.class);
job.setUser("root");
FileInputFormat.setInputPaths(job,new Path("/table/input"));
FileOutputFormat.setOutputPath(job,new Path("/table/output"));
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
//----------------------------bean------------------------------//
package com.skymesh.mapreduce;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class TbaleJoinBean implements Writable {
private String userId;
private String userName;
private int age;
private String deptName;
private String deptId;
private String layerCode;
//marks whether the record came from the user table or the dept table
private String flag;
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getUserName() {
return userName;
}
public void setUserName(String userName) {
this.userName = userName;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
public String getDeptName() {
return deptName;
}
public void setDeptName(String deptName) {
this.deptName = deptName;
}
public String getDeptId() {
return deptId;
}
public void setDeptId(String deptId) {
this.deptId = deptId;
}
public String getLayerCode() {
return layerCode;
}
public void setLayerCode(String layerCode) {
this.layerCode = layerCode;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override
public String toString() {
return
"userId='" + userId + '\'' +
", userName='" + userName + '\'' +
", age=" + age +
", deptName='" + deptName + '\'' +
", deptId='" + deptId + '\'' +
", layerCode='" + layerCode + '\''+
", flag='"+flag+'\'';
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(userId);
dataOutput.writeUTF(userName);
dataOutput.writeUTF(deptName);
dataOutput.writeUTF(deptId);
dataOutput.writeUTF(layerCode);
dataOutput.writeInt(age);
dataOutput.writeUTF(flag);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
userId=dataInput.readUTF();
userName=dataInput.readUTF();
deptName=dataInput.readUTF();
deptId= dataInput.readUTF();
layerCode =dataInput.readUTF();
age = dataInput.readInt();
flag=dataInput.readUTF();
}
}
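The job above is a reduce-side join, so both tables travel through the shuffle. Because the department table here is tiny, a common alternative is a map-side join with no reducer at all: load the dept table into memory in setup() and emit already-joined user records straight from the mapper. The following is only a sketch, written as if it lived inside TableJoinMapReduce so it can reuse that file's imports; the dept file location /table/input/dept.txt and the output value format are illustrative assumptions, not part of the original code. User lines have four fields, so the length check also skips any dept lines the mapper happens to read; in the driver you would then call job.setNumReduceTasks(0) so the map output is written directly.
static class MapSideJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    //deptId -> "deptName layerCode", loaded once per map task in setup()
    private final java.util.Map<String, String> deptById = new java.util.HashMap<String, String>();

    @Override
    protected void setup(Context context) throws IOException {
        //read the small department table straight from HDFS (assumed path)
        org.apache.hadoop.fs.FileSystem fs =
                org.apache.hadoop.fs.FileSystem.get(context.getConfiguration());
        java.io.BufferedReader reader = new java.io.BufferedReader(
                new java.io.InputStreamReader(fs.open(new Path("/table/input/dept.txt"))));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] fields = line.split("\\s+");
            if (fields.length >= 3) {
                deptById.put(fields[0], fields[1] + " " + fields[2]);
            }
        }
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //user record: userId userName age deptId
        String[] fields = value.toString().split("\\s+");
        if (fields.length >= 4) {
            String dept = deptById.get(fields[3]);
            if (dept != null) {
                //emit the joined record directly; no reducer is needed
                context.write(new Text(fields[0]), new Text(fields[1] + " " + fields[2] + " " + dept));
            }
        }
    }
}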
3. Computing common friends
Raw data:
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

For every pair of users that share common friends, output the pair and the friends they have in common, e.g.:
a-b : c,e
The Java code:
package com.skymesh.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Arrays;
/**
* Classic problem: computing the common friends of every pair of users
* A:B,C,D,F,E,O
* B:A,C,E,K
* C:F,A,D,I
* D:A,E,F,L
* E:B,C,D,M,L
* F:A,B,C,D,E,O,M
* G:A,C,D,E,F
* H:A,C,D,E,O
* I:A,O
* J:B,O
* K:A,C,D
* L:D,E,F
* M:E,F,G
* O:A,H,I,J
*
* For every pair of users that share common friends, output the pair and the friends they have in common.
* For example:
* a-b : c,e
*/
public class SameFriendMapReduce {
static class FriendMapper extends Mapper<LongWritable, Text,Text,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
//each line is "user:friend,friend,..."
String[] userAndFriends = line.split(":");
String user = userAndFriends[0];
String[] friends = userAndFriends[1].split(",");
//sort so that a given pair always produces the same key, e.g. always "B-C" and never "C-B"
Arrays.sort(friends);
for (int i = 0; i < friends.length-1; i++) {
for (int j = i+1; j < friends.length; j++) {
//key: a pair taken from this user's friend list, value: the user
context.write(new Text(friends[i]+"-"+friends[j]), new Text(user));
}
}
}
}
static class FriendReducer extends Reducer<Text,Text,Text,Text>{
@Override
protected void reduce(Text pair, Iterable<Text> names, Context context) throws IOException, InterruptedException {
//concatenate everyone grouped under this pair key
StringBuilder buf = new StringBuilder();
for (Text name : names) {
if (buf.length() > 0) {
buf.append(",");
}
buf.append(name);
}
context.write(pair, new Text(buf.toString()));
}
}
public static void main(String[] args) throws Exception {
//point the job at the target HDFS / YARN environment
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.136.128:9000");
conf.set("yarn.resourcemanager.hostname", "hadoop1");
conf.set("mapreduce.reduce.maxattempts","3");
Job job = Job.getInstance(conf);
job.setJarByClass(SameFriendMapReduce.class);
//set the mapper and reducer implementations
job.setMapperClass(FriendMapper.class);
job.setReducerClass(FriendReducer.class);
//declare the map output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
//input and output paths on HDFS
FileInputFormat.setInputPaths(job,new Path("/friend/input"));
FileOutputFormat.setOutputPath(job,new Path("/friend/output"));
//submit the job configuration and the jar containing these classes to YARN, then wait for it to finish
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
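A caveat on correctness: the job above keys on pairs drawn from each user's own friend list, so for a key like A-B the reducer collects the users whose lists contain both A and B (here only F). That equals the common friends of A and B only when friendship is mutual, and the sample data is not fully symmetric, so the single job will not reproduce the "a-b : c,e" answer (the intersection of A's and B's lists) stated above. The classic solution chains two jobs: a first pass inverts every line into (friend, user) records, and a second pass, shaped just like SameFriendMapReduce, pairs up the users grouped under each friend. Below is a minimal sketch of the first pass (class names are illustrative); since TextOutputFormat separates key and value with a tab, the second job would split its input on "\t" instead of ":".
static class InvertMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //"A:B,C,D" -> emit (B,A), (C,A), (D,A): each friend points back to the user who listed them
        String[] parts = value.toString().split(":");
        String user = parts[0];
        for (String friend : parts[1].split(",")) {
            context.write(new Text(friend), new Text(user));
        }
    }
}

static class InvertReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text friend, Iterable<Text> users, Context context) throws IOException, InterruptedException {
        //output: friend <TAB> comma-separated users who all list this friend,
        //which is the shape the pair-building second pass consumes
        StringBuilder buf = new StringBuilder();
        for (Text user : users) {
            if (buf.length() > 0) buf.append(",");
            buf.append(user);
        }
        context.write(friend, new Text(buf.toString()));
    }
}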