Table of Contents
MR Programming Examples (Part 1)
Diagram: the relationship between map tasks and input splits
Note: by the time a map task is dispatched to a DataNode to run, the source of the data it will process (HDFS, MySQL, ...) has already been determined.
Inverted Index Example (1)
Goal: count how many times each word appears in each file.
Test data
hello tom
hello jim
hello kitty
hello rose
hello jerry
hello jim
hello kitty
hello jack
hello jerry
hello java
hello c++
hello c++
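Because the final index has to group by word across files, the work is split into two MR passes: the first pass counts occurrences of each <word-fileName> pair, and the second pass regroups that output by word to list each word's count per file.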
First MR pass
package com.initialize.index;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class IndexStepOne {
public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
//produce <hello-fileName, 1> pairs
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//get the name of the file that the current line belongs to from the input split info
FileSplit inputSplit = (FileSplit)context.getInputSplit();
String fileName = inputSplit.getPath().getName();
String[] words = value.toString().split(" ");
for(String w : words){
//emit "word-fileName" as the key and 1 as the value
context.write(new Text(w + "-" + fileName), new IntWritable(1));
}
}
}
public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//by default only core-default.xml and core-site.xml are loaded
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepOne.class);
job.setMapperClass(IndexStepOneMapper.class);
job.setReducerClass(IndexStepOneReducer.class);
job.setNumReduceTasks(3);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output1"));
job.waitForCompletion(true);
}
}
Resulting output files:
Second MR pass
package com.initialize.index;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class IndexStepTwo {
public static class IndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text>{
//<hello-a.txt 4><hello-b.txt 4>
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("-");
context.write(new Text(split[0]), new Text(split[1].replaceAll("\t", "-->")));
}
}
public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: <hello,a.txt-->4> <hello,b.txt-->4> <hello,c.txt-->4>
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//StringBuffer is thread-safe and StringBuilder is not; when thread safety is not a concern, StringBuilder is faster
StringBuilder sb = new StringBuilder();
for(Text value : values){
sb.append(value.toString()).append("\t");
}
context.write(key, new Text(sb.toString()));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepTwo.class);
job.setMapperClass(IndexStepTwoMapper.class);
job.setReducerClass(IndexStepTwoReducer.class);
job.setNumReduceTasks(1);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\output1"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output2"));
job.waitForCompletion(true);
}
}
Resulting output files:
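Each line of the final index follows the format built by the reducer above: a word, then tab-separated fileName-->count entries, e.g. (file names here are illustrative): hello	a.txt-->4	b.txt-->4	c.txt-->4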
Order Top-N (2)
Goal: for each order, find the line items with the largest total amounts.
Test data
order001,u001,小米6,1999.9,2
order001,u001,雀巢咖啡,99.0,2
order001,u001,安慕希,250.0,2
order001,u001,经典红双喜,200.0,4
order001,u001,防水电脑包,400.0,2
order002,u002,小米手环,199.0,3
order002,u002,榴莲,15.0,10
order002,u002,苹果,4.5,20
order002,u002,肥皂,10.0,40
order003,u001,小米6,1999.9,2
order003,u001,雀巢咖啡,99.0,2
order003,u001,安慕希,250.0,2
order003,u001,经典红双喜,200.0,4
order003,u001,防水电脑包,400.0,2
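Here the amount of a line item is price × quantity (the amountFee field below). For order001, for example, the largest amounts are 1999.9×2=3999.8 for 小米6, followed by 200.0×4=800.0 and 400.0×2=800.0 (that tie is broken by product name).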
package com.initialize.order.topn;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class OrderBean implements WritableComparable<OrderBean> {
private String orderId;
private String userId;
private String pdtName;
private float price;
private int number;
private float amountFee;
@Override
public String toString() {
return orderId + "," + userId + "," + pdtName + "," + price + "," + number + "," + amountFee;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getPdtName() {
return pdtName;
}
public void setPdtName(String pdtName) {
this.pdtName = pdtName;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public int getNumber() {
return number;
}
public void setNumber(int number) {
this.number = number;
}
public float getAmountFee() {
return amountFee;
}
public void setAmountFee(float amountFee) {
this.amountFee = amountFee;
}
public void set(String orderId, String userId, String pdtName, float price, int number) {
this.orderId = orderId;
this.userId = userId;
this.pdtName = pdtName;
this.price = price;
this.number = number;
this.amountFee = price * number;
}
/**
* Comparison rule: compare by total amount in descending order first; if the amounts are equal, compare by product name.
* @param o
* @return
*/
@Override
public int compareTo(OrderBean o) {
int c = Float.compare(o.getAmountFee(), this.getAmountFee()); //descending by total amount
return c != 0 ? c : this.getPdtName().compareTo(o.getPdtName());
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.pdtName);
out.writeFloat(this.price);
out.writeInt(this.number);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.pdtName = in.readUTF();
this.price = in.readFloat();
this.number = in.readInt();
this.amountFee = this.price * this.number;
}
}
package com.initialize.order.topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
public class OrderTopn {
public static class OrderTopnMapper extends Mapper<LongWritable, Text, Text, OrderBean>{
OrderBean orderBean = new OrderBean();
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
orderBean.set(fields[0], fields[1], fields[2], Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
k.set(fields[0]);
//the kv objects handed to the map task here are serialized and stored by the map task, so there is no need to worry about them being overwritten on the next call.
context.write(k, orderBean);
}
}
public static class OrderTopnReducer extends Reducer<Text, OrderBean, OrderBean, NullWritable>{
@Override
protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
//get the top-n parameter
int topn = context.getConfiguration().getInt("order.top.n", 3);
ArrayList<OrderBean> beanList = new ArrayList<>();
//the values iterator provided by the reduce task returns the same object on every iteration; only its fields are set to different values
for(OrderBean orderBean : values){
//construct a new object to hold the values of this iteration
OrderBean newBean = new OrderBean();
newBean.set(orderBean.getOrderId(), orderBean.getUserId(), orderBean.getPdtName(), orderBean.getPrice(), orderBean.getNumber());
beanList.add(newBean);
}
//sort the orderBean objects in beanList (by total amount in descending order; if the amounts are equal, by product name)
Collections.sort(beanList);
for(int i=0;i<topn && i<beanList.size();i++){
context.write(beanList.get(i), NullWritable.get());
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.setInt("order.top.n", 2);
Job job = Job.getInstance(conf);
job.setJarByClass(OrderBean.class);
job.setMapperClass(OrderTopnMapper.class);
job.setReducerClass(OrderTopnReducer.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(OrderBean.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\aaa.txt"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
View the results:
GroupingComparator example: grouped Top-N (3)
package com.initialize.order.topn.grouping;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class OrderBean implements WritableComparable<OrderBean> {
private String orderId;
private String userId;
private String pdtName;
private float price;
private int number;
private float amountFee;
@Override
public String toString() {
return orderId + "," + userId + "," + pdtName + "," + price + "," + number + "," + amountFee;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getPdtName() {
return pdtName;
}
public void setPdtName(String pdtName) {
this.pdtName = pdtName;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public int getNumber() {
return number;
}
public void setNumber(int number) {
this.number = number;
}
public float getAmountFee() {
return amountFee;
}
public void setAmountFee(float amountFee) {
this.amountFee = amountFee;
}
public void set(String orderId, String userId, String pdtName, float price, int number) {
this.orderId = orderId;
this.userId = userId;
this.pdtName = pdtName;
this.price = price;
this.number = number;
this.amountFee = price * number;
}
/**
* Comparison rule: sort by orderId first; within the same orderId, sort by total amount in descending order.
* @param o
* @return
*/
@Override
public int compareTo(OrderBean o) {
int c = this.orderId.compareTo(o.getOrderId());
return c != 0 ? c : Float.compare(o.getAmountFee(), this.getAmountFee());
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.pdtName);
out.writeFloat(this.price);
out.writeInt(this.number);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.pdtName = in.readUTF();
this.price = in.readFloat();
this.number = in.readInt();
this.amountFee = this.price * this.number;
}
}
package com.initialize.order.topn.grouping;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* The reduce task calls this comparator to decide which records belong to the same group:
* records with the same orderId are treated as one group.
*/
public class OrderIdGroupingComparator extends WritableComparator {
public OrderIdGroupingComparator(){
super(OrderBean.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
OrderBean o1 = (OrderBean)a;
OrderBean o2 = (OrderBean)b;
return o1.getOrderId().compareTo(o2.getOrderId());
}
}
package com.initialize.order.topn.grouping;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* The map task calls this method to send records with the same orderId to the same partition.
*/
public class OrderIdPartitioner extends Partitioner<OrderBean, NullWritable> {
@Override
public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
package com.initialize.order.topn.grouping;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class OrderTopn {
public static class OrderTopnMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>{
OrderBean orderBean = new OrderBean();
NullWritable v = NullWritable.get();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
orderBean.set(fields[0], fields[1], fields[2], Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
//the kv objects handed to the map task here are serialized and stored by the map task, so there is no need to worry about them being overwritten on the next call.
context.write(orderBean, v);
}
}
public static class OrderTopnReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable>{
/**
* Although the reduce method receives only one key parameter, each time the values iterator advances, the contents of that key object change as well.
*/
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
//get the top-n parameter
int topn = context.getConfiguration().getInt("order.top.n", 3);
int i=0;
for(NullWritable v : values){
context.write(key, v);
if(++i == topn) return;
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
conf.setInt("order.top.n", 2);
Job job = Job.getInstance(conf);
job.setJarByClass(OrderBean.class);
job.setMapperClass(OrderTopnMapper.class);
job.setReducerClass(OrderTopnReducer.class);
job.setPartitionerClass(OrderIdPartitioner.class);
job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(OrderBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\aaa.txt"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output"));
job.waitForCompletion(true);
}
}
Common Friends (4)
Goal: find which pairs of people have common friends, and who those common friends are.
Test data:
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
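For example, in the data above A's friend list is B,C,D,F,E,O and B's is A,C,E,K, so the pair A-B should come out with common friends C and E.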
package com.initialize.friend;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
/**
* Find the common friends of every pair of people
*/
public class CommonFriendsOne {
public static class CommonFriendOneMapper extends Mapper<LongWritable, Text, Text, Text>{
Text k = new Text();
Text v = new Text();
//A:B,C,D,F,E,O
//output: B->A C->A D->A
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] userAndFriends = value.toString().split(":");
String user = userAndFriends[0];
String[] friends = userAndFriends[1].split(",");
v.set(user);
for(String f : friends){
k.set(f);
context.write(k, v);
}
}
}
public static class CommonFriendOneReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: B --> A E F J
//another group: C --> B F E J...
//the key person is a common friend of all the people in the values
@Override
protected void reduce(Text friend, Iterable<Text> values, Context context) throws IOException, InterruptedException {
ArrayList<String> userList = new ArrayList<>();
for(Text user : values){
userList.add(user.toString());
}
Collections.sort(userList);
for(int i=0;i<userList.size()-1;i++){
for(int j=i+1;j<userList.size();j++){
context.write(new Text(userList.get(i)+"-"+userList.get(j)),friend);
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(CommonFriendsOne.class);
job.setMapperClass(CommonFriendOneMapper.class);
job.setReducerClass(CommonFriendOneReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\xx.txt"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
package com.initialize.friend;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
/**
* Find the common friends of every pair of people
*/
public class CommonFriendsTwo {
public static class CommonFriendsTwoMapper extends Mapper<LongWritable, Text, Text, Text>{
Text k = new Text();
Text v = new Text();
//B-C A
// B-D A
// B-F A
// B-G A
// B-H A
// B-I A
// B-K A
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] current = value.toString().split("\t");
k.set(current[0]);
v.set(current[1]);
context.write(k, v);
}
}
public static class CommonFriendsTwoReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: B-G  A C D E
@Override
protected void reduce(Text friend, Iterable<Text> values, Context context) throws IOException, InterruptedException {
StringBuilder friends = new StringBuilder();
for(Text user : values){
friends.append("-->" + user + " ");
}
context.write(friend, new Text(friends.toString()));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(CommonFriendsTwo.class);
job.setMapperClass(CommonFriendsTwoMapper.class);
job.setReducerClass(CommonFriendsTwoReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output2"));
job.waitForCompletion(true);
}
}
Run results:
Replacing the default text input/output formats with SequenceFile input/output formats (5)
package com.initialize.index.sequence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
public class IndexStepOne {
public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
//produce <hello-fileName, 1> pairs
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//get the name of the file that the current line belongs to from the input split info
FileSplit inputSplit = (FileSplit)context.getInputSplit();
String fileName = inputSplit.getPath().getName();
String[] words = value.toString().split(" ");
for(String w : words){
//emit "word-fileName" as the key and 1 as the value
context.write(new Text(w + "-" + fileName), new IntWritable(1));
}
}
}
public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//by default only core-default.xml and core-site.xml are loaded
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepOne.class);
job.setMapperClass(IndexStepOneMapper.class);
job.setReducerClass(IndexStepOneReducer.class);
job.setNumReduceTasks(3);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// job.setInputFormatClass(TextInputFormat.class); //the default input format
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output1"));
job.waitForCompletion(true);
}
}
Intermediate results:
The SequenceFile data structure is as shown in the figure above.
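To peek inside the intermediate SequenceFile output, a minimal reader sketch can be used (not part of the original post; the part file path below is an assumption based on the output directory and the three reduce tasks configured above):
package com.initialize.index.sequence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class SeqFilePeek {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //with three reduce tasks, the job above writes part-r-00000 .. part-r-00002
        Path path = new Path("C:\\Users\\Desktop\\output1\\part-r-00000");
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            IntWritable value = new IntWritable();
            //each record is a <word-fileName, count> pair written by IndexStepOneReducer
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        }
    }
}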
package com.initialize.index.sequence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class IndexStepTwo {
public static class IndexStepTwoMapper extends Mapper<Text, IntWritable, Text, Text>{
@Override
protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
String[] split = key.toString().split("-");
context.write(new Text(split[0]), new Text(split[1] + "-->" + value));
}
}
public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text>{
//one group of data: <hello,a.txt-->4> <hello,b.txt-->4> <hello,c.txt-->4>
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//StringBuffer is thread-safe and StringBuilder is not; when thread safety is not a concern, StringBuilder is faster
StringBuilder sb = new StringBuilder();
for(Text value : values){
sb.append(value.toString()).append("\t");
}
context.write(key, new Text(sb.toString()));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepTwo.class);
job.setMapperClass(IndexStepTwoMapper.class);
job.setReducerClass(IndexStepTwoReducer.class);
job.setNumReduceTasks(1);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// job.setInputFormatClass(TextInputFormat.class); //the default input format
job.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\output1"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output2"));
job.waitForCompletion(true);
}
}
Result: correct.
How MapReduce works internally (core workflow)
The concrete implementation in Hadoop
MR Programming Examples (Part 2)
Code implementation of the join algorithm (1)
Implements: select a.*, b.* from a join b on a.uid = b.uid;
Test data:
order.txt
order001,u001
order002,u001
order003,u005
order004,u002
order005,u003
order006,u004
user.txt
u001,senge,18,angelababy
u002,laozhao,48,ruhua
u003,xiaoxu,16,chunge
u004,laoyang,28,zengge
u005,nana,14,huangbo
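For example, joining order001,u001 with the user record u001,senge,18,angelababy should produce the line order001,u001,18,senge,angelababy (following the field order of JoinBean.toString() below).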
package com.initialize.join;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class JoinBean implements Writable {
private String orderId;
private String userId;
private String userName;
private int userAge;
private String userFriend;
private String tableName;
public void set(String orderId, String userId, String userName, int userAge, String userFriend, String tableName) {
this.orderId = orderId;
this.userId = userId;
this.userName = userName;
this.userAge = userAge;
this.userFriend = userFriend;
this.tableName = tableName;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getUserName() {
return userName;
}
public void setUserName(String userName) {
this.userName = userName;
}
public int getUserAge() {
return userAge;
}
public void setUserAge(int userAge) {
this.userAge = userAge;
}
public String getUserFriend() {
return userFriend;
}
public void setUserFriend(String userFriend) {
this.userFriend = userFriend;
}
public String getTableName() {
return tableName;
}
public void setTableName(String tableName) {
this.tableName = tableName;
}
@Override
public String toString() {
return this.orderId + "," + this.userId + "," + this.userAge + "," + this.userName + "," + this.userFriend;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.userName);
out.writeInt(this.userAge);
out.writeUTF(this.userFriend);
out.writeUTF(this.tableName);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.userName = in.readUTF();
this.userAge = in.readInt();
this.userFriend = in.readUTF();
this.tableName = in.readUTF();
}
}
package com.initialize.join;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
/**
* This example is the least efficient way to implement the join.
*
* A more efficient version can be built with a Partitioner + compareTo + GroupingComparator combination (a sketch of that ordering idea follows this class).
*/
public class ReduceSideJoin {
public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean>{
String fileName = null;
JoinBean bean = new JoinBean();
Text k = new Text();
/**
* Before processing any data, the map task first calls setup() once; only then does it call map() repeatedly, once per input line.
*/
@Override
protected void setup(Context context) throws IOException, InterruptedException {
FileSplit inputSplit = (FileSplit)context.getInputSplit();
fileName = inputSplit.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
if(fileName.startsWith("order")){
bean.set(fields[0], fields[1], "NULL", -1, "NULL", "order");
}else{
bean.set("NULL", fields[0], fields[1], Integer.parseInt(fields[2]), fields[3], "user");
}
k.set(bean.getUserId());
context.write(k, bean);
}
}
public static class ReduceSideJoinReducer extends Reducer<Text, JoinBean, JoinBean, NullWritable>{
@Override
protected void reduce(Text key, Iterable<JoinBean> beans, Context context) throws IOException, InterruptedException {
//cache for the order records
ArrayList<JoinBean> orderList = new ArrayList<>();
JoinBean userBean = null; //cache for the user record
//Keeping this much of the files' data in memory consumes a lot of memory.
//If the first record out of the iterator were the user record, there would be no need to waste memory caching all the order records.
//As long as the sort order is controlled so that the user record comes first in each group, the order records never need to be cached.
try {
//separate the two kinds of records
for(JoinBean bean : beans){
if("order".equals(bean.getTableName())){
JoinBean newBean = new JoinBean();
BeanUtils.copyProperties(newBean, bean);
orderList.add(newBean);
}else{
userBean = new JoinBean();
BeanUtils.copyProperties(userBean, bean);
}
}
//join the records and write them out
for(JoinBean bean : orderList){
bean.setUserName(userBean.getUserName());
bean.setUserAge(userBean.getUserAge());
bean.setUserFriend(userBean.getUserFriend());
context.write(bean, NullWritable.get());
}
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ReduceSideJoin.class);
job.setMapperClass(ReduceSideJoinMapper.class);
job.setReducerClass(ReduceSideJoinReducer.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(JoinBean.class);
job.setOutputKeyClass(JoinBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output"));
job.waitForCompletion(true);
}
}
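As the class comment notes, the reduce-side caching can be avoided with a Partitioner + compareTo + GroupingComparator combination. Below is a minimal sketch of the ordering idea only (the JoinKey class, its fields, and the flag convention are hypothetical, not part of the original code):
package com.initialize.join;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class JoinKey implements WritableComparable<JoinKey> {
    private String userId;
    private int tableFlag; //0 = user record, 1 = order record (assumption for this sketch)
    public void set(String userId, int tableFlag) {
        this.userId = userId;
        this.tableFlag = tableFlag;
    }
    public String getUserId() {
        return userId;
    }
    @Override
    public int compareTo(JoinKey o) {
        int c = this.userId.compareTo(o.userId);
        //same userId: the single user record (flag 0) sorts ahead of all its order records (flag 1)
        return c != 0 ? c : Integer.compare(this.tableFlag, o.tableFlag);
    }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeInt(tableFlag);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        userId = in.readUTF();
        tableFlag = in.readInt();
    }
    //group by userId only, so the user record and its order records arrive in one reduce() call
    public static class UserIdGroupingComparator extends WritableComparator {
        public UserIdGroupingComparator() {
            super(JoinKey.class, true);
        }
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return ((JoinKey) a).getUserId().compareTo(((JoinKey) b).getUserId());
        }
    }
}
A matching Partitioner would partition on getUserId() alone (in the same spirit as OrderIdPartitioner above), so all records for one userId reach the same reducer, the user record is seen first, and its fields can be joined onto the following order records in a single pass with no caching.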
MR data skew: using the Combiner component (2)
Local aggregation in the map task to mitigate data skew
Use the Combiner component to aggregate locally on the map side, reducing the amount of data sent over the network and thus easing data skew. A Combiner is essentially a Reducer and must extend the Reducer class.
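Note that a Combiner may run zero, one, or several times over the map output, so it is only safe for aggregations such as summing counts, where applying the reduce logic partially and repeatedly still gives the same final result.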
package com.initialize.wc.skew;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Random;
public class SkewWordCount {
public static class SkewWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
Random random = new Random();
Text k = new Text();
IntWritable v = new IntWritable(1);
int numReduceTasks = 0;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
numReduceTasks = context.getNumReduceTasks();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] words = value.toString().split(" ");
for(String w : words){
k.set(w + "\001" + random.nextInt(numReduceTasks));
context.write(k, v);
}
}
}
public static class SkewWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
IntWritable v = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
v.set(count);
context.write(key, v);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SkewWordCount.class);
job.setMapperClass(SkewWordCountMapper.class);
job.setReducerClass(SkewWordCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//set the class containing the map-side local aggregation (combiner) logic
//the map task calls this class to aggregate locally; the specified class must extend the Reducer class
job.setCombinerClass(SkewWordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output1"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
A general solution to data skew: scattering the skewed keys
After the first MR pass produces its output, run a second MR pass over that result.
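For example, the first pass appends a random reduce-task number to each word (separated by the '\001' character), so a hot word like hello is spread over several keys with partial counts; the second pass below splits on '\001', drops the suffix, and sums the partial counts back into one total per word.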
package com.initialize.wc.skew;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class SkewWordCount2 {
public static class SkewWordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
Text k = new Text();
IntWritable v = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] wordAndCount = value.toString().split("\t");
v.set(Integer.parseInt(wordAndCount[1]));
k.set(wordAndCount[0].split("\001")[0]);
context.write(k, v);
}
}
public static class SkewWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
IntWritable v = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
v.set(count);
context.write(key, v);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SkewWordCount2.class);
job.setMapperClass(SkewWordCountMapper.class);
job.setReducerClass(SkewWordCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(SkewWordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\output1"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output2"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
How a MapReduce program is launched and run on YARN
Package the MR program into a jar and run it on Linux (an example submit command follows the steps below).
1. RunJar starts.
2. MRAppMaster starts (launched on one of the worker/DataNode nodes).
3. YarnChild processes start (launched on multiple worker nodes, running the map tasks).
4. After all the map-task YarnChild processes finish, YarnChild processes for the reduce tasks start (launched on multiple worker nodes).
5. After the reduce-task YarnChild processes finish, the MRAppMaster shuts down.
6. RunJar exits.
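A hedged example of the submit command (the jar name is illustrative; it assumes the driver's input/output paths point to HDFS and the cluster's mapred-site.xml sets mapreduce.framework.name to yarn):
hadoop jar mr-examples.jar com.initialize.wc.skew.SkewWordCount
The hadoop jar command is what starts the RunJar JVM in step 1; RunJar unpacks the jar and invokes the driver's main(), which submits the job and then blocks in waitForCompletion(true) until it finishes.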