Total order sorting of big data in Hadoop

Key points:

Sampling: InputSampler.RandomSampler
Partitioning: TotalOrderPartitioner (sketched just below)
Custom InputFormat: MyKeyValueTextInputFormat
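
Conceptually, TotalOrderPartitioner routes each key to the reducer whose key range contains it, using the cut points that the sampler wrote to the partition file. A simplified sketch of the idea for long keys (not Hadoop's actual implementation, which works with arbitrary comparators and builds a trie for BinaryComparable keys; the class name is made up for illustration):

import java.util.Arrays;

public class PartitionSketch {
    // boundaries: the numReduceTasks - 1 sorted cut points from the partition file
    static int partitionFor(long key, long[] boundaries) {
        int i = Arrays.binarySearch(boundaries, key);
        // found: a key equal to a boundary goes to the partition above it;
        // not found: the insertion point is exactly the target partition index
        return i >= 0 ? i + 1 : -(i + 1);
    }

    public static void main(String[] args) {
        long[] boundaries = {100L, 200L, 300L};             // 4 reducers
        System.out.println(partitionFor(50L, boundaries));  // 0
        System.out.println(partitionFor(200L, boundaries)); // 2
        System.out.println(partitionFor(999L, boundaries)); // 3
    }
}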

Other programs of this kind found online use KeyValueTextInputFormat directly. When I tried it, the output ended up in dictionary (lexicographic) order, not in a true numeric total order. The likely cause is that KeyValueTextInputFormat produces keys of type Text: the sampled partition file then also holds Text keys, and the map output key is Text as well, so partition boundaries are compared as Text and a numeric total order is impossible. I therefore wrote my own MyKeyValueTextInputFormat that emits LongWritable keys.
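
To see the problem concretely, here is a tiny standalone demo (not part of the job; the class name is made up for illustration): Text compares raw bytes, so "9" sorts after "10", while LongWritable compares numeric values.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class KeyOrderDemo {
    public static void main(String[] args) {
        // Text is compared byte by byte: '9' (0x39) > '1' (0x31), so "9" > "10"
        System.out.println(new Text("9").compareTo(new Text("10")) > 0);             // true
        // LongWritable is compared numerically: 9 < 10
        System.out.println(new LongWritable(9).compareTo(new LongWritable(10)) < 0); // true
    }
}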

The code is as follows:
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MyKeyValueTextInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
        context.setStatus(inputSplit.toString());
        return new MyKeyValueLineRecordReader(context.getConfiguration());
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // only uncompressed files or splittable codecs may be split
        CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        return codec == null || codec instanceof SplittableCompressionCodec;
    }
}


The matching record reader parses each line into a LongWritable key and a Text value:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class MyKeyValueLineRecordReader extends RecordReader<LongWritable, Text> {
    public static final String KEY_VALUE_SEPERATOR = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";
    private final LineRecordReader lineRecordReader = new LineRecordReader();
    private byte separator = (byte) '\t';
    private Text innerValue;
    private LongWritable key;
    private Text value;

    public Class getKeyClass() {
        return LongWritable.class; // the keys produced here are LongWritable, not Text
    }

    public MyKeyValueLineRecordReader(Configuration conf) throws IOException {
        String sepStr = conf.get(KEY_VALUE_SEPERATOR, "\t");
        this.separator = (byte) sepStr.charAt(0);
    }

    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        this.lineRecordReader.initialize(genericSplit, context);
    }

    public static int findSeparator(byte[] utf, int start, int length, byte sep) {
        for(int i = start; i < start + length; ++i) {
            if(utf[i] == sep) {
                return i;
            }
        }

        return -1;
    }

    public static void setKeyValue(LongWritable key, Text value, Text line, int lineLen, int pos) {
        String str = line.toString();
        if (pos == -1) {
            // no separator found: the whole line is the numeric key, the value is empty
            key.set(Long.parseLong(str));
            value.set("");
        } else {
            // split at the separator: the prefix is the numeric key, the rest is the value
            key.set(Long.parseLong(str.substring(0, pos)));
            value.set(str.substring(pos + 1));
        }
    }

    public synchronized boolean nextKeyValue() throws IOException {
        if (!this.lineRecordReader.nextKeyValue()) {
            return false;
        }
        this.innerValue = this.lineRecordReader.getCurrentValue();
        // check for an empty line before touching its contents
        if (this.innerValue == null || this.innerValue.getLength() == 0) {
            return false;
        }
        if (this.key == null) {
            this.key = new LongWritable();
        }
        if (this.value == null) {
            this.value = new Text();
        }
        int lineLen = this.innerValue.getLength();
        int pos = findSeparator(this.innerValue.getBytes(), 0, lineLen, this.separator);
        setKeyValue(this.key, this.value, this.innerValue, lineLen, pos);
        return true;
    }

    public LongWritable getCurrentKey() {
        return this.key;
    }

    public Text getCurrentValue() {
        return this.value;
    }

    public float getProgress() throws IOException {
        return this.lineRecordReader.getProgress();
    }

    public synchronized void close() throws IOException {
        this.lineRecordReader.close();
    }
}

The driver wires everything together: it samples the input, writes the partition file for TotalOrderPartitioner, and runs a pass-through map and reduce over the LongWritable keys.

import java.io.IOException;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.*;
import org.apache.hadoop.util.*;

public class TotalSortV3 extends Configured implements Tool {

    static class SimpleMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(key, key);
        }
    }

    // The context parameter must be Reducer's own Context; with Mapper.Context the
    // method would not override reduce() and would never be called.
    static class SimpleReducer extends Reducer<LongWritable, LongWritable, LongWritable, NullWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            for (LongWritable value : values)
                context.write(value, NullWritable.get());
        }
    }



    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "TotalSortV3");
        job.setJarByClass(TotalSortV3.class);

        // Don't use the default TextInputFormat: it makes each line's byte offset the
        // key, so the sampler would sample offsets rather than the line contents.
        job.setInputFormatClass(MyKeyValueTextInputFormat.class);

        FileInputFormat.addInputPath(job, new Path("hdfs://192.168.12.150:9000/datasort/input/hadoopsortdata100000.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.12.150:9000/datasort/output3/result"));
        job.setNumReduceTasks(10);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Sample each record with probability 0.01, keep at most 1000 samples from at
        // most 100 splits; writePartitionFile stores numReduceTasks - 1 = 9 cut points.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("hdfs://192.168.12.150:9000/datasort/output3/_partition"));
        InputSampler.Sampler<LongWritable, Text> sampler = new InputSampler.RandomSampler<LongWritable, Text>(0.01, 1000, 100);
        InputSampler.writePartitionFile(job, sampler);

        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setMapperClass(SimpleMapper.class);
        job.setReducerClass(SimpleReducer.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new TotalSortV3(), args);
        System.exit(exitCode);
    }
}

Generating the test data:
[hadoop@master ~]$ cat hadoopsortdata100000.sh
#!/bin/bash
# write 100000 random integers (0-32767, from $RANDOM), one per line
i=1
while ((i <= 100000))
do
  echo $RANDOM >> hadoopsortdata100000.txt
  let i++
done
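
After the job finishes, the reducer outputs can be checked for global order: concatenated in part-file order (part-r-00000 through part-r-00009), the numbers must be non-decreasing. A minimal sketch, reusing the HDFS URI and output path from the job above (the class name is made up for illustration):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class VerifyTotalOrder {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.12.150:9000"), new Configuration());
        long prev = Long.MIN_VALUE;
        // globStatus returns matches sorted by name, so the parts come in reducer order
        for (FileStatus st : fs.globStatus(new Path("/datasort/output3/result/part-r-*"))) {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(st.getPath())))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    long cur = Long.parseLong(line.trim());
                    if (cur < prev) {
                        throw new IllegalStateException("not totally ordered: " + cur + " after " + prev);
                    }
                    prev = cur;
                }
            }
        }
        System.out.println("output is totally ordered");
    }
}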
