mapreduce-全局排序 -夜幕思年华

  • 脚本数据
    生成随机数到文件
  for k in $(seq 1 10000)
  do
  echo $RANDOM>>test_data.txt;
  done

创建目录
hdfs dfs -mkdir /data
上传脚本文件
hdfs dfs -put test_data.txt /data
package com.doop.gw.hadoop.reducer.order;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.junit.Test;


public class TotalSort extends Configured implements Tool {
  static final String INPUT_PATH = "hdfs://192.168.33.104:8020/data/test_data.txt";
  static final String OUT_PATH = "hdfs://192.168.33.104:8020/total_sort_output";
  static final String OUT_PATH_1 = "hdfs://192.168.33.104:8020/total_sort_partitions";

  //实现一个Kye比较器,用于比较两个key的大小,将key由字符串转化为Integer,然后进行比较。
  public static class KeyComparator extends WritableComparator {
    protected KeyComparator() {
      super(Text.class, true);
    }

    @Override
    public int compare(WritableComparable writableComparable1, WritableComparable writableComparable2) {
      int num1 = Integer.parseInt(writableComparable1.toString());
      int num2 = Integer.parseInt(writableComparable2.toString());
      return num1 - num2;
    }
  }
  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
    Path outPath = new Path(OUT_PATH);
    if (fileSystem.exists(outPath)) {
      fileSystem.delete(outPath, true);
    }

    conf.set("mapreduce.totalorderpartitioner.naturalorder", "false");
    Job job = Job.getInstance(conf, "Total Sort app");
    job.setJarByClass(TotalSort.class);

    //设置读取文件的路径,都是从HDFS中读取。读取文件路径从脚本文件中传进来
    FileInputFormat.addInputPath(job,new Path(INPUT_PATH));
    //设置mapreduce程序的输出路径,MapReduce的结果都是输入到文件中
    FileOutputFormat.setOutputPath(job,new Path(OUT_PATH));

    //设置比较器,用于比较数据的大小,然后按顺序排序,该例子主要用于比较两个key的大小
    job.setSortComparatorClass(KeyComparator.class);


    job.setNumReduceTasks(3);//设置reduce数量

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    //设置保存partitions文件的路径
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(OUT_PATH_1));

    //key值采样,0.01是采样率,
    InputSampler.Sampler<Text, Text> sampler = new InputSampler.RandomSampler<>(0.01, 1000, 100);
    //将采样数据写入到分区文件中
    InputSampler.writePartitionFile(job, sampler);

    job.setMapperClass(TotalSortMap.class);
    job.setReducerClass(TotalSortReduce.class);
    //设置分区类。
    job.setPartitionerClass(TotalOrderPartitioner.class);
    return job.waitForCompletion(true) ? 0 : 1;
  }
  public static void main(String[] args)throws Exception{
    int exitCode = ToolRunner.run(new TotalSort(), args);
    System.exit(exitCode);
  }
  /**
   * 读取文件
   */
  @Test
  public void testRead() {
    Configuration conf = new Configuration();
    try {
      // 根据配置文件创建HDFS对象
      conf.set("fs.defaultFS", "hdfs://192.168.33.104:8020"); //远程需要设定
      FileSystem fs = FileSystem.get(conf);
      Path path = new Path("hdfs://192.168.33.104:8020/total_sort_output/part-r-00000");
      FSDataInputStream fis = fs.open(path);
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      IOUtils.copyBytes(fis, baos, 1024);
      fis.close();
      System.out.println(new String(baos.toByteArray()));
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
package com.doop.gw.hadoop.reducer.order;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class TotalSortMap extends Mapper<Text, Text, Text, IntWritable> {
  @Override
  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    context.write(key, new IntWritable(Integer.parseInt(key.toString())));
  }
}
package com.doop.gw.hadoop.reducer.order;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TotalSortReduce extends Reducer<Text, IntWritable, IntWritable, NullWritable> {

  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    for (IntWritable value : values) {
      context.write(value, NullWritable.get());
    }
  }
}
  • 执行过程中遇到没有权限
    hadoop user=xiaocai, access=WRITE, inode="/":root:supergroup:drw

  • 解决
    修改hdfs-site.xml 增加

    <property>
          <name>dfs.permissions</name>
          <value>false</value>
    </property>
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值