[Big Data Development] MapReduce: a MapReduce template, computing average scores, multiple-file output, partitioned sorting, finding common friends, single-table self-join, and grouped Top-N

Preparation

Dependencies (pom.xml)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.qfedu.bigdata</groupId>
    <artifactId>NZ2002Demo</artifactId>
    <version>1.0</version>

    <dependencies>
    <!-- JDK tools dependency -->
    <dependency>
        <groupId>jdk.tools</groupId>
        <artifactId>jdk.tools</artifactId>
        <version>1.8.0</version>
        <scope>system</scope>
        <systemPath>${env.JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.6</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.6</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.6</version>
    </dependency>

        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>  <!-- defaults to 1.5; must be "1.8", not "1.8.0" -->
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

log4j.properties

#log4j.logger.org.springframework=OFF  
#log4j.logger.com.opensymphony.xwork2=OFF  
#log4j.logger.com.mybatis=OFF  

log4j.rootLogger=INFO,stdout,warn,error,info,druid
#log4j.rootLogger=INFO,stdout,warn,error,info
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%c] %5p - %m%n

#error
log4j.appender.error=org.apache.log4j.RollingFileAppender
log4j.appender.error.File= logs/tz-dms-error.log
log4j.appender.error.MaxBackupIndex=10
log4j.appender.error.MaxFileSize=512KB
log4j.appender.error.Threshold=ERROR
log4j.appender.error.layout=org.apache.log4j.PatternLayout
log4j.appender.error.layout.ConversionPattern=%d - [%l] %5p [%c] - %m%n

#warn
log4j.appender.warn=org.apache.log4j.RollingFileAppender
log4j.appender.warn.File= logs/tz-dms-warn.log
log4j.appender.warn.MaxBackupIndex=10
log4j.appender.warn.MaxFileSize=1024KB
log4j.appender.warn.Threshold=WARN
log4j.appender.warn.layout=org.apache.log4j.PatternLayout
log4j.appender.warn.layout.ConversionPattern=%d %5p [%c] - %m%n

#info
log4j.appender.info=org.apache.log4j.RollingFileAppender
log4j.appender.info.File= logs/tz-dms-info.log
log4j.appender.info.MaxBackupIndex=10
log4j.appender.info.MaxFileSize=1024KB
log4j.appender.info.Threshold=INFO
log4j.appender.info.layout=org.apache.log4j.PatternLayout
log4j.appender.info.layout.ConversionPattern=%d %5p [%c] - %m%n

#debug
log4j.appender.debug=org.apache.log4j.RollingFileAppender
log4j.appender.debug.File= logs/tz-dms-debug.log
log4j.appender.debug.MaxBackupIndex=10
log4j.appender.debug.MaxFileSize=1024KB
log4j.appender.debug.Threshold=DEBUG
log4j.appender.debug.layout=org.apache.log4j.PatternLayout
log4j.appender.debug.layout.ConversionPattern=%d %5p [%c] - %m%n

###Show the SQL statements
#log4j.logger.java.sql.Connection=DEBUG
#log4j.logger.java.sql.Statement=DEBUG
#log4j.logger.java.sql.PreparedStatement=DEBUG

#druid
log4j.appender.druid= org.apache.log4j.DailyRollingFileAppender 
log4j.appender.druid.file= logs/tz-dms-druid.log
log4j.appender.druid.layout= org.apache.log4j.PatternLayout 
log4j.appender.druid.layout.ConversionPattern= [druid] %d [%-15.15t] %-5p %-30.30c {1} - %m%n   
log4j.appender.druid.DatePattern= yyyy-MM-dd'.log'
log4j.appender.druid.MaxFileSize=40MB
log4j.appender.druid.MaxBackupIndex=40
log4j.appender.druid.append= true
log4j.appender.druid.ImmediateFlush= true

log4j.logger.druid.sql=info,druid
log4j.logger.druid.sql.DataSource=info,druid
log4j.logger.druid.sql.Connection=info,druid
#debug level shows all SQL statements
#log4j.logger.druid.sql.Statement=debug,druid  
#log4j.logger.druid.sql.ResultSet=debug,info,druid

log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.MaxBackupIndex=10
log4j.appender.logfile.MaxFileSize=1024KB
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR

MapReduce Template

import com.qfedu.bigdata.HdfsUtils.hdfsUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class model01 {
    static class MyMapper extends Mapper<LongWritable, Text,Text,Text>{
        /**
         * Called only once, before any map() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per input record, typically once for every line that is read
         * @param key
         * @param value
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            super.map(key, value, context);
        }

        /**
         * Called only once, after all map() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    static class MyReducer extends Reducer<Text,Text,Text,Text>{
        /**
         * Called only once, before any reduce() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per distinct key produced by the map phase:
         * the framework groups kv pairs with the same key and calls reduce() once per group
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            super.reduce(key, values, context);
        }

        /**
         * Called only once, after all reduce() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        //user-defined configuration goes here

        //get the Job instance
        Job job = Job.getInstance(conf, "model01");

        //configure the job
        job.setJarByClass(model01.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //
//        job.setCombinerClass();
//        job.setPartitionerClass();
//        job.setGroupingComparatorClass();
//        job.setNumReduceTasks();

        FileInputFormat.setInputPaths(job,new Path(args[0]));

        //check whether the output directory already exists and delete it if so
//        FileSystem fs = FileSystem.get(conf);
        Path output = new Path(args[1]);

//        if (fs.exists(output)){
//            fs.delete(output,true);
//        }

        FileOutputFormat.setOutputPath(job,output);

        //submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true)?0:1);
    }
}

hdfsUtil

package com.qfedu.bigdata.HdfsUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class hdfsUtil {
    private static final Logger logger =Logger.getLogger(hdfsUtil.class);

    private static final String CONNECTSTR = "fs.defaultFS";
    private static final String CONNECTVALUE = "hdfs://host01:9000";
    private static final String HADOOP_USER_NAME ="root";

    public static FileSystem getFS(){
        Configuration conf = new Configuration();

        conf.set(CONNECTSTR,CONNECTVALUE);

        FileSystem fs = null;
        try {
            fs = FileSystem.get(new URI(CONNECTVALUE),conf,HADOOP_USER_NAME);
        } catch (IOException e) {
            logger.error("Failed to get the HDFS FileSystem client.",e);
        } catch (InterruptedException e) {
            logger.error("Failed to get the HDFS FileSystem client.",e);
        } catch (URISyntaxException e) {
            logger.error("Failed to build the HDFS URI.",e);
        }

        return fs;
    }

    public static void closeFS(FileSystem fs){
        //nothing to close if the caller passed in null
        if(fs == null ){
            return;
        }

        try {
            fs.close();
        } catch (IOException e) {
            // nothing to do; ignore failures on close
        }
    }
}
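
The hdfsUtil class is not actually called anywhere above; the output-directory cleanup in the template is left commented out. Below is a minimal sketch of how the two could be combined in a small driver before submitting a job (the class name outputCleanupDemo and the path are placeholders, and it assumes the host01:9000 namenode configured in hdfsUtil is reachable):

import com.qfedu.bigdata.HdfsUtils.hdfsUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class outputCleanupDemo {
    public static void main(String[] args) {
        //placeholder output directory; use the job's real output path
        Path output = new Path("/user/root/wordcount/output");

        FileSystem fs = hdfsUtil.getFS();
        try {
            //FileOutputFormat refuses to start a job whose output directory already exists,
            //so delete it (recursively) beforehand
            if (fs != null && fs.exists(output)) {
                fs.delete(output, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            hdfsUtil.closeFS(fs);
        }
    }
}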

Example 1: Computing average scores

File 1: chinese
zs01 60
zs02 62
zs03 64
zs04 66
zs05 68
zs06 70
zs07 72
zs08 74
zs09 76
zs10 78

File 2: english
zs01 70
zs02 73
zs03 76
zs04 79
zs05 82
zs06 85
zs07 88
zs08 91
zs09 94
zs10 97

File 3: math
zs01 80
zs02 81
zs03 82
zs04 83
zs05 84
zs06 85
zs07 86
zs08 87
zs09 88
zs10 89

Code:
package com.bgdata.mrDemo;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;


import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class avgDemo {
    public static class myMapper extends Mapper<LongWritable,Text, Text,Text>
    {
        private Text k = new Text();
        private Text v = new Text();

        protected void setup(Context context) {
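            //use the name of the file backing this split (chinese / english / math) as the key,
            //so every score from one subject file is aggregated under that subject in the reducer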
            InputSplit inputSplit = context.getInputSplit();
            FileSplit fileSplit = (FileSplit) inputSplit;
            String name = fileSplit.getPath().getName();
            k.set(name);
        }

        protected void map(LongWritable key,Text values,Context context)throws IOException,InterruptedException{
            String line = values.toString();
            String[] str = line.split(" ");
            v.set(str[1]);

            context.write(k,v);
        }

        @Override
        protected void cleanup(Context context)throws IOException,InterruptedException
        {
            super.cleanup(context);
        }
    }


    public static class myReducer extends Reducer<Text,Text,Text,Text>
    {
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
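            //sum all the scores for this subject (the key is the input file name) and count them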
            double count = 0;
            double sum = 0;
            for (Text value : values) {
                String s = value.toString();
                double d = Double.parseDouble(s);
                sum+=d;
                count++;
            }
            double avg=sum/count;
            context.write(key,new Text(avg+""));
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf,"avgDemo");
        //configure the job
        job.setJarByClass(avgDemo.class);

        job.setMapperClass(myMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(myReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);


        FileInputFormat.setInputPaths(job,new Path("F:\\BgMaven\\bgdata\\wordCount\\subjectScore\\chinese"),
                new Path("F:\\BgMaven\\bgdata\\wordCount\\subjectScore\\english"),new Path("F:\\BgMaven\\bgdata\\wordCount\\subjectScore\\math"));
        FileOutputFormat.setOutputPath(job,new Path("F:\\BgMaven\\bgdata\\wordCount\\subjectScoreOutput"));

        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);

    }
}


Result:
chinese	69.0
english	83.5
math	84.5

Example 2: Multiple-file output

Words whose first character is a-z go into one file (named output "az")
Words whose first character is A-Z go into one file (named output "azz")
Words whose first character is 0-9 go into one file (named output "09")
All other words go into one file (named output "others")

Input file:
hello world
Hello World
123nihao
QQ
163.com
@qq.com
(123)
&123
*abc
hi gaoyuanyuan
hello jiajingwen


Code:
package com.bgdata.mrDemo;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class MultipleDemo {

    public static class myMapper extends Mapper<LongWritable,Text,Text,Text>
    {
        private Text k = new Text();
        private Text v = new Text("1");

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            StringTokenizer st = new StringTokenizer(line);
            while(st.hasMoreElements())
            {
                k.set(st.nextToken());
                context.write(k,v);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }


    public static class myReducer extends Reducer<Text,Text,Text,Text>
    {
        MultipleOutputs<Text,Text>  mos = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            mos = new MultipleOutputs<>(context);
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }

            String word = key.toString();
            String firstChar = word.substring(0,1);
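            //route the word by its first character; the base names below must match the names
            //registered with MultipleOutputs.addNamedOutput() in main(), giving files like az-r-00000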
            if(firstChar.matches("[a-z]"))
            {
                mos.write("az",key,new Text(sum + ""));
            }else if (firstChar.matches("[A-Z]"))
            {
                mos.write("azz",key,new Text(sum + ""));
            }else if (firstChar.matches("[0-9]"))
            {
                mos.write("09",key,new Text(sum + ""));
            }else
                mos.write("others",key,new Text(sum + ""));
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "MultipleDemo");
        job.setJarByClass(MultipleDemo.class);

        job.setMapperClass(myMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(myReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);


        MultipleOutputs.addNamedOutput(job, "az", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "azz", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "09", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "others", TextOutputFormat.class, Text.class, Text.class);

        FileInputFormat.setInputPaths(job,new Path("F:\\BgMaven\\bgdata\\wordCount\\multiFiles\\multiFile"));
        FileOutputFormat.setOutputPath(job,new Path("F:\\BgMaven\\bgdata\\wordCount\\multiFilesOutput"));

        System.exit(job.waitForCompletion(true)?0:1);
    }
}

az file
gaoyuanyuan	1
hello	2
hi	1
jiajingwen	1
world	1

azz file
Hello	1
QQ	1
World	1

09 file
123nihao	1
163.com	1

others file
&123	1
(123)	1
*abc	1
@qq.com	1

Example 3: Partitioning and sorting the words from Example 2

MyPartitioner.java

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * A custom partitioner must extend the Partitioner class
 * Its generic types must match the map output key/value types
 * getPartition() can only return an int
 * The number of partitions should equal the number of reduce tasks
 * HashPartitioner is used by default
 * Compute the return value with the modulo operator (%) so it always fits within numPartitions
 */
public class MyPartitioner extends Partitioner<Text,Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String firstChar = key.toString().substring(0,1);

        int num = 0;

        //decide the partition by the first character of the key
        if(firstChar.matches("[a-z]")){
            num = 0;
        } else if (firstChar.matches("[A-Z]")){
            num = 1;
        } else if (firstChar.matches("[0-9]")){
            num = 2;
        } else {
            num = 3;
        }
        return num % numPartitions;
    }
}


MultipleDemo.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MultipleDemo {
    static class MyMapper extends Mapper<LongWritable, Text,Text,Text>{
        private Text k = new Text();
        private Text v = new Text("1");
        /**
         * Called only once, before any map() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per input record, typically once for every line that is read
         * @param key
         * @param value
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] words = line.split(" ");

            for (String word:words) {
                k.set(word);
                context.write(k,v);
            }
        }

        /**
         * Called only once, after all map() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    static class MyReducer extends Reducer<Text,Text,Text,Text>{
        /**
         * Called only once, before any reduce() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per distinct key produced by the map phase:
         * the framework groups kv pairs with the same key and calls reduce() once per group
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            int count =0;

            for (Text v:values) {
                count+= Integer.parseInt(v.toString());
            }

            context.write(key,new Text(count + ""));
        }

        /**
         * Called only once, after all reduce() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        //user-defined configuration goes here

        //get the Job instance
        Job job = Job.getInstance(conf, "MultipleDemo");

        //configure the job
        job.setJarByClass(MultipleDemo.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //
//        job.setCombinerClass();
        job.setPartitionerClass(MyPartitioner.class);
//        job.setGroupingComparatorClass();
        job.setNumReduceTasks(2);


        FileInputFormat.setInputPaths(job,new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\multiFiles"));

        //check whether the output directory already exists and delete it if so
//        FileSystem fs = FileSystem.get(conf);
        Path output = new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\multiFilesoutput2");

//        if (fs.exists(output)){
//            fs.delete(output,true);
//        }

        FileOutputFormat.setOutputPath(job,output);

        //submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true)?0:1);
    }
}

Example 4: Finding common friends

data.txt

A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J

Find every pair of users that share at least one friend, and work out who their common friends are, e.g.:
A-B:C,E

A-C:D,F

Approach:
First work out, for each friend, which users have that friend. The intermediate result looks like:
C A,B,E,F,G,H,K

This takes two MapReduce jobs.

Step 1 (shareFriendsStep1)

map output:
key: friend
value: user

e.g. the input line A:B,C,D,F,E,O produces
B,A
C,A
D,A
...

reduce output:
key: friend
value: comma-joined list of all users who have that friend

Step 2 (shareFriendsStep2)

map input (one line of step 1 output):
C	A,B,E,F,G,H,K

map output:
key: every pairwise combination of those users, A-B | A-E | A-F | ...
value: the friend C

reduce output:
key: user pair
value: comma-joined list of their common friends

shareFriendsStep1

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class shareFriendsStep1 {
    static class MyMapper extends Mapper<LongWritable, Text,Text,Text>{
        private Text k = new Text();
        private Text v = new Text();
        /**
         * Called only once, before any map() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per input record, typically once for every line that is read
         * @param key
         * @param value
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] fields = line.split(":");

            String user = fields[0];
            String friends = fields[1];

            String[] split = friends.split(",");

            v.set(user);
            for (String friend :split) {
                k.set(friend);
                context.write(k,v);
            }
        }

        /**
         * Called only once, after all map() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    static class MyReducer extends Reducer<Text,Text,Text,Text>{
        /**
         * Called only once, before any reduce() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per distinct key produced by the map phase:
         * the framework groups kv pairs with the same key and calls reduce() once per group
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
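            //concatenate every user who has this friend (the key) into a comma-separated list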
            StringBuffer sb = new StringBuffer();
            for (Text user:values) {
                sb.append(user.toString() + ",");
            }

            String users = sb.toString();
            users = users.substring(0,users.length() - 1);

            context.write(key,new Text(users));
        }

        /**
         * Called only once, after all reduce() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        //user-defined configuration goes here

        //get the Job instance
        Job job = Job.getInstance(conf, "shareFriendsStep1");

        //configure the job
        job.setJarByClass(shareFriendsStep1.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //
//        job.setCombinerClass();
//        job.setPartitionerClass();
//        job.setGroupingComparatorClass();
//        job.setNumReduceTasks();

        FileInputFormat.setInputPaths(job,new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\sharedFriends"));

        //check whether the output directory already exists and delete it if so
//        FileSystem fs = FileSystem.get(conf);
        Path output = new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\sharedFriendsoutput1");

//        if (fs.exists(output)){
//            fs.delete(output,true);
//        }

        FileOutputFormat.setOutputPath(job,output);

        //submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true)?0:1);
    }
}


shareFriendsStep2

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Arrays;

public class shareFriendsStep2 {
    static class MyMapper extends Mapper<LongWritable, Text,Text,Text>{
        private Text k = new Text();
        private Text v = new Text();

        /**
         * Called only once, before any map() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per input record, typically once for every line that is read
         * @param key
         * @param value
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] fields = line.split("\t");

            String friend = fields[0];

            v.set(friend);

            String users = fields[1];

            String[] splits = users.split(",");

            Arrays.sort(splits);
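            //the users were just sorted so that each pair is always emitted in the same order (A-B, never B-A);
            //otherwise the same pair could show up under two different keys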

            for (int i = 0; i < splits.length - 1; i++) {
                for (int j = i + 1; j < splits.length; j++) {
                    String userList = splits[i] + "-" + splits[j];

                    k.set(userList);

                    context.write(k,v);
                }
            }

        }

        /**
         * Called only once, after all map() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    static class MyReducer extends Reducer<Text,Text,Text,Text>{
        /**
         * Called only once, before any reduce() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per distinct key produced by the map phase:
         * the framework groups kv pairs with the same key and calls reduce() once per group
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuffer sb = new StringBuffer();

            for (Text friend :values) {
                sb.append(friend.toString() + ",");
            }

            String friends = sb.toString();
            friends = friends.substring(0,friends.length() - 1);

            context.write(key,new Text(friends));
        }

        /**
         * Called only once, after all reduce() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        //user-defined configuration goes here

        //get the Job instance
        Job job = Job.getInstance(conf, "shareFriendsStep2");

        //configure the job
        job.setJarByClass(shareFriendsStep2.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //
//        job.setCombinerClass();
//        job.setPartitionerClass();
//        job.setGroupingComparatorClass();
//        job.setNumReduceTasks();

        FileInputFormat.setInputPaths(job,new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\sharedFriendsoutput1"));

        //check whether the output directory already exists and delete it if so
//        FileSystem fs = FileSystem.get(conf);
        Path output = new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\sharedFriendsoutput2");

//        if (fs.exists(output)){
//            fs.delete(output,true);
//        }

        FileOutputFormat.setOutputPath(job,output);

        //submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true)?0:1);
    }
}


Example 5: Single-table self-join

singleJoin

张三 王阿姨
张三 张大叔
张张 王阿姨
张张 张大叔
王阿姨 王奶奶
王阿姨 王大爷
张大叔 张奶奶
张大叔 张大爷

singleTableJoin.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class singleTableJoin {
    static class MyMapper extends Mapper<LongWritable, Text,Text,Text>{
        /**
         * Called only once, before any map() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per input record, typically once for every line that is read
         * @param key
         * @param value
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] fields = line.split(" ");

            String child = fields[0];
            String parent = fields[1];

            //emit two records for every child-parent line
            //"left table" record: key = parent, value = 1_<child>  (flag 1: the value is a child of the key)
            context.write(new Text(parent),new Text("1_"+child));

            //"right table" record: key = child, value = 2_<parent>  (flag 2: the value is a parent of the key)
            context.write(new Text(child),new Text("2_" + parent));
        }

        /**
         * Called only once, after all map() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    static class MyReducer extends Reducer<Text,Text,Text,Text>{
        /**
         * Called only once, before any reduce() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per distinct key produced by the map phase:
         * the framework groups kv pairs with the same key and calls reduce() once per group
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            //parents of the key person, i.e. grandparents of that person's children
            ArrayList<String> grandparent = new ArrayList<>();
            //children of the key person, i.e. grandchildren of that person's parents
            ArrayList<String> grandchild = new ArrayList<>();

            //iterate over the values and sort each record into the right list based on its flag
            for (Text v:values) {
                //relations[0] is the flag, relations[1] is the person
                String[] relations = v.toString().split("_");
                if (relations[0].equals("1")){
                    grandchild.add(relations[1]);
                } else {
                    grandparent.add(relations[1]);
                }
            }

            //cross join: pair every grandchild with every grandparent
            for (int i = 0; i < grandchild.size(); i++) {
                for (int j = 0; j < grandparent.size(); j++) {
                    context.write(new Text(grandchild.get(i)),new Text(grandparent.get(j)));
                }
            }
        }

        /**
         * Called only once, after all reduce() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        //user-defined configuration goes here

        //get the Job instance
        Job job = Job.getInstance(conf, "singleTableJoin");

        //configure the job
        job.setJarByClass(singleTableJoin.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //
//        job.setCombinerClass();
//        job.setPartitionerClass();
//        job.setGroupingComparatorClass();
//        job.setNumReduceTasks();

        FileInputFormat.setInputPaths(job,new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\singleJoin"));

        //check whether the output directory already exists and delete it if so
//        FileSystem fs = FileSystem.get(conf);
        Path output = new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\singleJoinoutput");

//        if (fs.exists(output)){
//            fs.delete(output,true);
//        }

        FileOutputFormat.setOutputPath(job,output);

        //submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true)?0:1);
    }
}

Example 6: Grouped Top-N

1. Define a custom data type, orderBean, whose compareTo() sorts by orderId and then by amount in descending order

2. Define a custom Partitioner that partitions by orderId

3. Define a custom GroupingComparator that groups records by orderId

map output:

key: orderBean

value: orderBean

The input (gpinput) consists of comma-separated lines in the form orderId,itemId,amount.

orderBean.java

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class orderBean implements WritableComparable<orderBean> {
    private String orderId;
    private String itemId;
    private double amount;

    public orderBean() {
    }

    public orderBean(String orderId, String itemId, double amount) {
        this.orderId = orderId;
        this.itemId = itemId;
        this.amount = amount;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getItemId() {
        return itemId;
    }

    public void setItemId(String itemId) {
        this.itemId = itemId;
    }

    public double getAmount() {
        return amount;
    }

    public void setAmount(double amount) {
        this.amount = amount;
    }

    @Override
    public int compareTo(orderBean o) {
        int tmp = orderId.compareTo(o.getOrderId());
        if (tmp == 0){
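            //within the same order, larger amounts sort first (descending by amount)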
            tmp = amount - o.getAmount() > 0 ?-1:1;
        }

        return tmp;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeUTF(itemId);
        out.writeDouble(amount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        orderId = in.readUTF();
        itemId = in.readUTF();
        amount = in.readDouble();
    }

    @Override
    public String toString() {
        return orderId + ',' +
                itemId + ',' +
                amount;
    }
}

orderGroupingComparator.java

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * 1. A comparator generally implements RawComparator
 * 2. There are two kinds of comparison: object comparison and raw byte comparison
 * 3. For byte comparison, the default is to compare on the first field of the class
 * 4. For byte comparison, the length arguments should match the serialized length of the corresponding data types
 */
public class orderGroupingComparator extends WritableComparator {
    //controls how kv pairs are grouped on the reduce side during the shuffle
    protected  orderGroupingComparator(){
        super(orderBean.class,true);
    }

    /**
     * Object comparison, analogous to the key's compareTo(),
     * but only orderId is compared so all beans from the same order form a single reduce group
     * @param a
     * @param b
     * @return
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        orderBean aBean = (orderBean)a;
        orderBean bBean = (orderBean)b;
        return aBean.getOrderId().compareTo(bBean.getOrderId());
    }
}

OrderPartitioner.java

import org.apache.hadoop.mapreduce.Partitioner;

public class OrderPartitioner extends Partitioner<orderBean,orderBean> {
    @Override
    public int getPartition(orderBean key, orderBean value, int numPartitions) {
        //send beans with the same orderId to the same reduce task
        return (key.getOrderId().hashCode() & Integer.MAX_VALUE ) % numPartitions;
    }
}

TOPN.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TOPN {
    static class MyMapper extends Mapper<LongWritable, Text,orderBean,orderBean>{
        /**
         * Called only once, before any map() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        /**
         * Called once per input record, typically once for every line that is read
         * @param key
         * @param value
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] fields = line.split(",");

            String orderId = fields[0];
            String itemId = fields[1];
            double amount = Double.parseDouble(fields[2]);

            orderBean bean = new orderBean(orderId, itemId, amount);

            context.write(bean,bean);
        }

        /**
         * Called only once, after all map() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    static class MyReducer extends Reducer<orderBean,orderBean,orderBean, NullWritable>{
        private int topN = 0;
        /**
         * Called only once, before any reduce() calls
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            topN = Integer.parseInt( context.getConfiguration().get("gaoyuanyuan"));
        }

        /**
         * Called once per distinct key produced by the map phase:
         * the framework groups kv pairs with the same key and calls reduce() once per group
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(orderBean key, Iterable<orderBean> values, Context context) throws IOException, InterruptedException {
            //emit at most topN beans for this group; values arrive sorted by amount in descending order
            int count = topN;
            for (orderBean bean:values) {
                context.write(bean,NullWritable.get());
                count --;

                if (count == 0 ){
                    break;
                }
            }


        }

        /**
         * Called only once, after all reduce() calls have finished
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        //supplying the topN parameter from outside the program
        //option 1: pass the value in on the command line
//        conf.set("topn",args[0]);

        conf.addResource("MyParams.xml");
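        //option 2 (used here): load a custom resource file from the classpath; a sketch of MyParams.xml follows the code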

        //user-defined configuration goes here

        //get the Job instance
        Job job = Job.getInstance(conf, "TOPN");

        //configure the job
        job.setJarByClass(TOPN.class);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(orderBean.class);
        job.setMapOutputValueClass(orderBean.class);

        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(orderBean.class);
        job.setOutputValueClass(NullWritable.class);

        //
//        job.setCombinerClass();
        job.setPartitionerClass(OrderPartitioner.class);
        job.setGroupingComparatorClass(orderGroupingComparator.class);
//        job.setNumReduceTasks();

        FileInputFormat.setInputPaths(job,new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\gpinput"));

        //check whether the output directory already exists and delete it if so
//        FileSystem fs = FileSystem.get(conf);
        Path output = new Path("F:\\ideaproject\\NZ2002Demo\\wordcount\\gpoutput1");

//        if (fs.exists(output)){
//            fs.delete(output,true);
//        }

        FileOutputFormat.setOutputPath(job,output);

        //submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true)?0:1);
    }
}
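
conf.addResource("MyParams.xml") loads an extra configuration file from the classpath, but the file itself is not shown in the original post. A minimal sketch of what it might contain follows; the only real requirement is that the property name matches the key read in MyReducer.setup() (the value 3 here is an arbitrary example):

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property>
        <name>gaoyuanyuan</name>
        <value>3</value>
    </property>
</configuration>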
