1. Requirement
When joining a large table with a small table, the small table can be shipped to the map side and loaded into memory, so each join lookup can be done quickly without a reduce-side shuffle of both tables.
For example: for each user's shopping list, compute the total amount paid per product.
2. Approach
The product table file can be shipped to every map task's working directory via addCacheFile and loaded into an in-memory HashMap; each incoming user order line is then matched against that HashMap.
Partition by user; group by user and product.
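To make the data flow concrete, here is a hypothetical sketch of the file layouts the code below assumes (all file names, products, and numbers are invented; fields are tab-separated, matching the `splitField` setting and the `split("\t")` calls in the code):

```
# product.txt — the small table shipped via addCacheFile: productName \t unitPrice
iphone    5000
book      30

# order lines under /joininmapper/input1/ and input2/: userId \t productName \t quantity
u001      iphone    2
u001      book      3
u002      iphone    1
```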
3. Code
1. ShopingBean.class
```java
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @Author liufu
 */
public class ShopingBean implements WritableComparable<ShopingBean> {
    private String userId;
    private String productName;
    private int payCount;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(productName);
        out.writeInt(payCount);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Read the fields in exactly the order they were written
        this.userId = in.readUTF();
        this.productName = in.readUTF();
        this.payCount = in.readInt();
    }

    @Override
    public int compareTo(ShopingBean o) {
        // Sort by userId first (descending)
        int userIdCompare = o.getUserId().compareTo(this.userId);
        if (userIdCompare != 0) {
            return userIdCompare;
        }
        // Then by productName within the same user (descending)
        int productCompare = o.getProductName().compareTo(this.productName);
        if (productCompare != 0) {
            return productCompare;
        }
        // Finally by payCount within the same user and product (descending);
        // Integer.compare returns 0 when they are equal, respecting the contract
        return Integer.compare(o.getPayCount(), this.payCount);
    }

    public String getUserId() { return userId; }

    public void setUserId(String userId) { this.userId = userId; }

    public String getProductName() { return productName; }

    public void setProductName(String productName) { this.productName = productName; }

    public int getPayCount() { return payCount; }

    public void setPayCount(int payCount) { this.payCount = payCount; }
}
```
2. JoinInMapMapper.class
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;

/**
 * @Author liufu
 */
public class JoinInMapMapper extends Mapper<LongWritable, Text, ShopingBean, IntWritable> {
    HashMap<String, Integer> productMap = null;
    ShopingBean k = null;
    IntWritable v = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        productMap = new HashMap<>();
        k = new ShopingBean();
        v = new IntWritable();
        // Get the conf object created on the driver side (JoinInMapRun)
        Configuration conf = context.getConfiguration();
        String addCacheFile = conf.get("addCacheFile");
        String splitField = conf.get("splitField");
        // Read the small file shipped via addCacheFile (localized into the
        // task's working directory) and load it into the in-memory HashMap
        BufferedReader bf = new BufferedReader(new FileReader(addCacheFile));
        String tmpLine;
        while ((tmpLine = bf.readLine()) != null) {
            String[] fields = tmpLine.split(splitField);
            productMap.put(fields[0], Integer.parseInt(fields[1]));
        }
        bf.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String orderLine = value.toString();
        String[] fields = orderLine.split("\t");
        k.setUserId(fields[0]);
        k.setProductName(fields[1]);
        // The join: look up this product's unit price in the in-memory map
        Integer price = productMap.get(fields[1]);
        if (price == null) {
            // Product missing from the small table; skip this order line
            return;
        }
        k.setPayCount(price * Integer.parseInt(fields[2]));
        v.set(k.getPayCount());
        context.write(k, v);
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        productMap.clear();
        productMap = null;
        k = null;
        v = null;
    }
}
```
3. JoinInMapReducer.class
```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @Author liufu
 */
public class JoinInMapReducer extends Reducer<ShopingBean, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(ShopingBean key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // All values in one call belong to the same (userId, productName)
        // group, as defined by the grouping comparator below
        int allCount = 0;
        for (IntWritable count : values) {
            allCount += count.get();
        }
        context.write(new Text(key.getUserId() + " " + key.getProductName()), new IntWritable(allCount));
    }
}
```
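With the hypothetical sample data sketched in section 2, the reducer would write one line per (user, product) group; the exact line order across output files depends on the partitioner and the descending sort:

```
u002 iphone    5000
u001 iphone    10000
u001 book      90
```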
4. JoinInMapPartitioner.class
```java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @Author liufu
 */
public class JoinInMapPartitioner extends Partitioner<ShopingBean, IntWritable> {
    // The key and value types match the map output types
    @Override
    public int getPartition(ShopingBean shopingBean, IntWritable intWritable, int numPartitions) {
        // & Integer.MAX_VALUE clears the sign bit so a negative hashCode cannot
        // yield a negative partition; the parentheses matter, because % binds
        // tighter than &
        return (shopingBean.getUserId().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
```
5. JoinInMapGroupComparetor.class
```java
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * @Author liufu
 */
public class JoinInMapGroupComparetor extends WritableComparator {
    // This constructor is required; without it compare() throws a
    // NullPointerException, because no ShopingBean instances get created
    // for deserialization and the casts below have nothing to work on
    public JoinInMapGroupComparetor() {
        super(ShopingBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        ShopingBean pre = (ShopingBean) a;
        ShopingBean after = (ShopingBean) b;
        // Group by (userId, productName): only a return value of 0 means
        // "same group"; payCount is deliberately ignored here so that all
        // payments for one user's product land in a single reduce() call
        int userIdCompare = pre.getUserId().compareTo(after.getUserId());
        if (userIdCompare != 0) {
            return userIdCompare;
        }
        return pre.getProductName().compareTo(after.getProductName());
    }
}
```
6. JoinInMapRun.class
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * @Author liufu
 */
public class JoinInMapRun {
    public static void main(String[] args) throws IOException, URISyntaxException {
        Configuration conf = new Configuration();
        conf.set("addCacheFile", "product.txt");
        conf.set("splitField", "\t");

        Job job = Job.getInstance(conf, "joinInMapper");
        // Locate the jar on the classpath via the driver class
        job.setJarByClass(JoinInMapRun.class);
        // Ship the small table to every map task's working directory
        job.addCacheFile(new URI("hdfs://192.168.0.186:9000/joininmapper/product.txt"));

        job.setPartitionerClass(JoinInMapPartitioner.class);
        job.setGroupingComparatorClass(JoinInMapGroupComparetor.class);

        // Map-side and reduce-side classes for the job
        job.setMapperClass(JoinInMapMapper.class);
        job.setReducerClass(JoinInMapReducer.class);

        // Declare the map and reduce output types so the framework can
        // instantiate them by reflection
        job.setMapOutputKeyClass(ShopingBean.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // How the job reads and writes its data
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Where the data comes from; setInputPaths and addInputPath both work
        FileInputFormat.setInputPaths(job, new Path("/joininmapper/input1/"), new Path("/joininmapper/input2/"));
        // Where the output goes
        FileOutputFormat.setOutputPath(job, new Path("/joininmapper/output/"));

        try {
            boolean b = job.waitForCompletion(true);
            System.exit(b ? 0 : 1);
        } catch (InterruptedException | ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
```
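For completeness, a minimal sketch of how the job might be submitted, assuming the classes are packaged into a jar named join-in-map.jar (the jar name and local file names are assumptions; the HDFS paths come from the driver above):

```
# The small table must exist at the URI passed to addCacheFile
hdfs dfs -put product.txt hdfs://192.168.0.186:9000/joininmapper/product.txt
hdfs dfs -put orders1.txt /joininmapper/input1/
hdfs dfs -put orders2.txt /joininmapper/input2/

# Submit the job; results appear under /joininmapper/output/part-r-00000 etc.
hadoop jar join-in-map.jar JoinInMapRun
```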