MapReduce进阶(3)------------------Join功能的实现(连接不同的文件),资料在最后

1.用途:有时候需要拼接两个文件的东西,比如用户文件里面有用户信息,还有购物文件里面有用户的购物信息,这时候需要将两个不同的文件拼接起来。
2.思路:首先将文件的内容读出来放到一个容器里面方便使用,其次通过两个文件的相同属性,将同一个用户的不同订单连接起来
3.code:(后面有一种常见的特殊情况,更加节省时间和空间)
(1)

//JoinBean代码,里面实现了自定义类的序列化
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * One joined record: a user's attributes together with one of that user's ratings.
 *
 * <p>Implements Hadoop's {@link Writable}. All fields are plain strings and are serialized
 * with {@link DataOutput#writeUTF(String)} in the fixed order
 * uid, age, gender, movieId, rating, table; {@link #readFields(DataInput)} reads them back in
 * exactly that order, so the two methods must always be changed together.
 *
 * <p>{@code writeUTF} throws {@code NullPointerException} for a {@code null} argument, so an
 * unset field is written as the empty string instead. A bean that has been serialized and
 * read back therefore holds {@code ""} wherever the original held {@code null}.
 */
public class JoinBean implements Writable{
    private String uid;
    private String age;
    private String gender;
    private String movieId;
    private String rating;

    // Presumably names the source file/table the record came from — verify against callers.
    private String table;

    /**
     * Assigns every field in one call.
     *
     * @param uid     user id (the join key)
     * @param age     user age
     * @param gender  user gender
     * @param movieId id of the rated movie
     * @param rating  rating value
     * @param table   source-table tag
     */
    public void set(String uid, String age, String gender, String movieId, String rating, String table) {
        this.uid = uid;
        this.age = age;
        this.gender = gender;
        this.movieId = movieId;
        this.rating = rating;
        this.table = table;
    }

    /**
     * Serializes all six fields, in declaration order, as modified-UTF-8 strings.
     *
     * @param out sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        // orEmpty(...) guards against writeUTF's NullPointerException on unset fields.
        out.writeUTF(orEmpty(uid));
        out.writeUTF(orEmpty(age));
        out.writeUTF(orEmpty(gender));
        out.writeUTF(orEmpty(movieId));
        out.writeUTF(orEmpty(rating));
        out.writeUTF(orEmpty(table));
    }

    /**
     * Restores all six fields; the read order mirrors {@link #write(DataOutput)}.
     *
     * @param in source to read from
     * @throws IOException if the underlying stream fails or ends early
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        uid = in.readUTF();
        age = in.readUTF();
        gender = in.readUTF();
        movieId = in.readUTF();
        rating = in.readUTF();
        table = in.readUTF();
    }

    /** Returns a human-readable dump of all fields. */
    @Override
    public String toString() {
        return "JoinBean [uid=" + uid + ", age=" + age + ", gender=" + gender + ", movieId=" + movieId + ", rating="
                + rating + ", table=" + table + "]";
    }

    /** Maps {@code null} to the empty string so the value is safe to pass to {@code writeUTF}. */
    private static String orEmpty(String value) {
        return value == null ? "" : value;
    }

    /** @return the user id */
    public String getUid() {
        return uid;
    }

    /** @param uid the user id */
    public void setUid(String uid) {
        this.uid = uid;
    }

    /** @return the user age */
    public String getAge() {
        return age;
    }

    /** @param age the user age */
    public void setAge(String age) {
        this.age = age;
    }

    /** @return the user gender */
    public String getGender() {
        return gender;
    }

    /** @param gender the user gender */
    public void setGender(String gender) {
        this.gender = gender;
    }

    /** @return the id of the rated movie */
    public String getMovieId() {
        return movieId;
    }

    /** @param movieId the id of the rated movie */
    public void setMovieId(String movieId) {
        this.movieId = movieId;
    }

    /** @return the rating value */
    public String getRating() {
        return rating;
    }

    /** @param rating the rating value */
    public void setRating(String rating) {
        this.rating = rating;
    }

    /** @return the source-table tag */
    public String getTable() {
        return table;
    }

    /** @param table the source-table tag */
    public void setTable(String table) {
        this.table = table;
    }

}

(2)JoinMR(使用在eclipse上通过集群运行的方式启动,可以参考前面文章的启动集群的三种方式)

JoinMR

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-side join of a ratings file with a users file on their shared user id.
 *
 * <p>The users file (the "small table") is read into an in-memory lookup once per map task in
 * {@code setup}. Each ratings line is then joined against that lookup directly in {@code map},
 * so the job needs no reduce phase.
 *
 * <p>Both files are split on {@code ::} with the user id in the first column. From the way the
 * columns are handed to {@code JoinBean.set}, a users line is read as
 * {@code uid::gender::age[::...]} and a ratings line as {@code uid::movieId::rating[::...]}.
 *
 * @author Hailong
 */
public class JoinMR {
    /** Separator between columns in both input files. */
    private static final String FIELD_SEPARATOR = "::";

    /** Counter group under which skipped records are reported. */
    private static final String COUNTER_GROUP = "JoinMR";

    /** Joins every ratings line with the matching user line held in memory. */
    public static class MapTask extends Mapper<LongWritable, Text, JoinBean, NullWritable>{
        /** user id -> complete line of the users file; filled once in {@code setup}. */
        Map<String,String> map = new HashMap<>();

        /**
         * Loads the users file named by the {@code smallTableName} configuration key.
         *
         * @param context task context supplying the job configuration
         * @throws IOException if the users file cannot be opened or read
         */
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            String smallTableName = conf.get("smallTableName");
            // The FileSystem handle is deliberately not closed: FileSystem.get typically
            // returns a shared cached instance — NOTE(review): confirm for this deployment.
            FileSystem fs = FileSystem.get(conf);
            // try-with-resources closes the reader and, through it, the HDFS input stream.
            // NOTE(review): UTF-8 is assumed for the users file instead of the platform default.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(smallTableName)), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    String[] split = line.split(FIELD_SEPARATOR);
                    map.put(split[0], line);
                }
            }
        }

        /**
         * Emits one {@code JoinBean} per ratings line whose user id is present in the lookup.
         *
         * <p>Lines that are malformed or have no matching user are skipped (inner-join
         * semantics) and counted under the {@code JoinMR} counter group rather than
         * failing the task.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of the ratings file
         * @param context sink for the joined record and for the skip counters
         */
        @Override
        protected void map(LongWritable key, Text value,Context context)
                throws IOException, InterruptedException {
            String[] split = value.toString().split(FIELD_SEPARATOR);
            if (split.length < 3) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_RATING_LINE").increment(1);
                return;
            }

            String userLine = map.get(split[0]);
            if (userLine == null) {
                context.getCounter(COUNTER_GROUP, "UNMATCHED_USER").increment(1);
                return;
            }

            String[] line = userLine.split(FIELD_SEPARATOR);
            if (line.length < 3) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_USER_LINE").increment(1);
                return;
            }

            JoinBean joinBean = new JoinBean();
            // set(uid, age, gender, movieId, rating, table): users column 2 is passed as age,
            // column 1 as gender; "null" is the literal placeholder for the table tag.
            joinBean.set(split[0], line[2], line[1], split[1], split[2], "null");
            context.write(joinBean,NullWritable.get());
        }
    }

    /**
     * Configures and submits the join job to the cluster and waits for it to finish.
     *
     * <p>An existing output directory is deleted recursively before submission.
     *
     * @param args {@code args[0]} ratings input path, {@code args[1]} output path,
     *             {@code args[2]} users ("small table") file path
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws Exception if job setup, submission or execution fails
     */
    public static void main(String[] args) throws Exception{
        if (args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: JoinMR <ratings input path> <output path> <users file path>");
        }

        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        // 1. Default file system the job talks to.
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        // 2. Where the job is submitted for execution.
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "hadoop01");
        // 3. Required when the submitting client runs on Windows.
        conf.set("mapreduce.app-submission.cross-platform","true");
        // Handed to every map task so setup() knows which file to preload.
        conf.set("smallTableName", args[2]);

        Job job = Job.getInstance(conf);
        // Location of the jar that contains the job classes.
        job.setJar("C:\\Users\\Hailong\\Desktop\\w2.jar");
        job.setMapperClass(MapTask.class);
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Remove a stale output directory, otherwise the job refuses to start.
        // The check targets the path actually used as output, and only deletes when it exists.
        Path outputPath = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, outputPath);

        // Map-only job: the join is finished in the mapper. Running reducers would force a
        // shuffle/sort on JoinBean keys, which only implement Writable, not WritableComparable.
        job.setNumReduceTasks(0);

        boolean completion = job.waitForCompletion(true);
        System.out.println(completion?"程序执行完毕,没毛病!!!":"程序有问题,程序出bug了,赶紧加班调试!!!");
    }

}

链接:https://pan.baidu.com/s/1XelYCYwnTKoQWZdLKWV5GA 密码:5ax5

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值