Hadoop Total Order Sort: Principles and Code

There are three ways to implement a total order sort in Hadoop:
1. Use a single reducer.
2. Write a custom partitioner.
3. Use a Hadoop sampler (random sampling or split sampling).
The sampler draws sample records, computes the split points between partitions, and writes them into a partition file. The partition file is a sequence file that contains only keys; the values are null (NullWritable).

Part 1 (a single reducer)
With only one reduce task every key is shuffled to the same reducer, and MapReduce sorts keys before calling reduce, so the single output file is already totally ordered.
Prepare the mapper, reducer, and test (driver) classes.

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WCMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is "year temperature"; emit (year, temperature).
        String line = value.toString();
        String[] arr = line.split(" ");
        context.write(new IntWritable(Integer.parseInt(arr[0])), new IntWritable(Integer.parseInt(arr[1])));
    }
}

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Keep the maximum temperature for each year.
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}

package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCTest {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);
        job.setJobName("WCTest");

        job.setJarByClass(WCTest.class);
        job.setInputFormatClass(TextInputFormat.class);

        args = new String[]{"file:///d:/mr/a.txt", "file:///d:/mr/out"};
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReduce.class);

        // A single reducer: all keys go to one reduce task, so its output is totally sorted.
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        job.waitForCompletion(true);
    }

}

Prepare the weather data.

package com.MaxTemp;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class Demo {
    public static void main(String[] args) throws IOException {
        File file = new File("d:/mr/a.txt");
        if (file.exists()) {
            System.out.println("file already exists");
        }
        FileWriter fw = new FileWriter(file);
        // Write one "year temperature" pair per line.
        for (int x = 1960; x <= 2060; x++) {
            for (int y = -30; y <= 60; y++) {
                fw.write(x + " " + y);
                fw.write("\n");
            }
        }
        fw.close();
    }
}

Run the driver class; the single output file (part-r-00000) is sorted by year.

Part 2 (a custom partitioner)
Keys are routed to reducers by range, so each reducer's output is sorted and the output files, taken in partition order, together form a total order.
Prepare the mapper, reducer, partitioner, and test (driver) classes. (The driver below references a YearPartitioner; a sketch of it follows the reducer.)

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WCMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is "year temperature"; emit (year, temperature).
        String line = value.toString();
        String[] arr = line.split(" ");
        context.write(new IntWritable(Integer.parseInt(arr[0])), new IntWritable(Integer.parseInt(arr[1])));
    }
}

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Keep the maximum temperature for each year.
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}
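
The driver in this part references a YearPartitioner class that the original post does not show. Below is a minimal sketch, assuming the keys are the years 1960-2060 generated above; the cut-off years 1994 and 2027 are arbitrary choices that split the keys into three ascending ranges, one per reducer.

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Minimal range partitioner sketch (not shown in the original post).
// The cut-off years 1994 and 2027 are assumptions that split 1960-2060
// into three roughly equal, ascending ranges, one per reducer.
public class YearPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        int year = key.get();
        if (year < 1994) {
            return 0;      // reducer 0: years before 1994
        } else if (year < 2027) {
            return 1;      // reducer 1: 1994 - 2026
        }
        return 2;          // reducer 2: 2027 and later
    }
}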

package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCTest {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);
        job.setJobName("WCTest");

        job.setJarByClass(WCTest.class);
        job.setInputFormatClass(TextInputFormat.class);

        args = new String[]{"file:///d:/mr/a.txt", "file:///d:/mr/out"};
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Route keys to reducers by year range so the concatenated outputs are totally sorted.
        job.setPartitionerClass(YearPartitioner.class);
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReduce.class);

        job.setNumReduceTasks(3);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        job.waitForCompletion(true);
    }

}

Prepare the weather data (same generator as in Part 1).

package com.MaxTemp;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class Demo {
    public static void main(String[] args) throws IOException {
        File file = new File("d:/mr/a.txt");
        if (file.exists()) {
            System.out.println("file already exists");
        }
        FileWriter fw = new FileWriter(file);
        // Write one "year temperature" pair per line.
        for (int x = 1960; x <= 2060; x++) {
            for (int y = -30; y <= 60; y++) {
                fw.write(x + " " + y);
                fw.write("\n");
            }
        }
        fw.close();
    }
}

Run the driver class. Each of the three output files is sorted, and reading part-r-00000, part-r-00001, part-r-00002 in that order gives the global sort; the sketch below shows one way to check this.
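
A minimal verification sketch (the SortCheck class is hypothetical, not part of the original post): it reads the three part files in partition order, assuming the d:/mr/out output directory from the driver above and the tab separator that TextOutputFormat writes by default, and checks that the year column never decreases.

package com.MaxTemp;

import java.io.BufferedReader;
import java.io.FileReader;

// Hypothetical helper: reads the reducer outputs in partition order and checks
// that the year column never decreases, i.e. the concatenation is totally sorted.
public class SortCheck {
    public static void main(String[] args) throws Exception {
        String[] parts = {"d:/mr/out/part-r-00000", "d:/mr/out/part-r-00001", "d:/mr/out/part-r-00002"};
        int last = Integer.MIN_VALUE;
        for (String p : parts) {
            try (BufferedReader br = new BufferedReader(new FileReader(p))) {
                String line;
                while ((line = br.readLine()) != null) {
                    int year = Integer.parseInt(line.split("\t")[0]);
                    if (year < last) {
                        System.out.println("Out of order in " + p + ": " + year);
                        return;
                    }
                    last = year;
                }
            }
        }
        System.out.println("Output is totally sorted.");
    }
}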

Part 3 (total order sort with a Hadoop sampler)

Prepare the mapper, reducer, and test (driver) classes. InputSampler draws sample keys from the input, computes split points, and writes them to a partition file that TotalOrderPartitioner then uses to route keys to reducers, so no hand-written partitioner is needed.

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * MaxTempMapper: the input is a sequence file of (year, temperature) pairs,
 * so the mapper just passes each pair through unchanged.
 */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
        context.write(key, value);
    }
}

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * MaxTempReducer: keeps the maximum temperature for each year.
 */
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}

package com.MaxTemp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

/**
 * MaxTempApp: total order sort driver using InputSampler and TotalOrderPartitioner.
 */
public class MaxTempApp {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);

        // Set the job's properties.
        job.setJobName("MaxTempApp");                             // job name
        job.setJarByClass(MaxTempApp.class);                      // class used to locate the jar

        job.setInputFormatClass(SequenceFileInputFormat.class);   // input format
        args = new String[]{"file:///d:/mr/seq/1.seq", "file:///d:/mr/out"};
        // Add the input path.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Set the output path.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTempMapper.class);                  // mapper class
        job.setReducerClass(MaxTempReducer.class);                // reducer class
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Use the total order partitioner.
        job.setPartitionerClass(TotalOrderPartitioner.class);
        // Create a random sampler:
        //   freq: probability that each key is selected
        //   numSamples: total number of samples to draw
        //   maxSplitsSampled: maximum number of input splits to sample
        InputSampler.Sampler<IntWritable, IntWritable> sampler =
                new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 6000, 3);
        // Number of reduce tasks (must be set before the partition file is written).
        job.setNumReduceTasks(3);
        // Write the sampled split points into the partition file.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("D:/mr/par.lst"));
        InputSampler.writePartitionFile(job, sampler);
        job.waitForCompletion(true);
    }

}
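
The method list at the top also mentions split sampling. Besides RandomSampler, InputSampler provides SplitSampler and IntervalSampler; either of the following fragments could replace the RandomSampler line in MaxTempApp above (the parameter values are just examples, not from the original post).

// SplitSampler: takes the first numSamples records, drawn from at most maxSplitsSampled input splits.
InputSampler.Sampler<IntWritable, IntWritable> splitSampler =
        new InputSampler.SplitSampler<IntWritable, IntWritable>(6000, 3);

// IntervalSampler: takes records at a fixed frequency (freq) from at most maxSplitsSampled input splits.
InputSampler.Sampler<IntWritable, IntWritable> intervalSampler =
        new InputSampler.IntervalSampler<IntWritable, IntWritable>(0.1, 3);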

Prepare the seq (sequence) file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.Test;

import java.io.IOException;
import java.util.Random;

public class Sequence {
    @Test
    public void save() throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("D:/mr/seq/1.seq");
        // Write (year, temperature) pairs as IntWritable key/value records.
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, IntWritable.class);
        Random random = new Random();
        for (int x = 0; x < 6000; x++) {
            int year = 1970 + random.nextInt(100);
            int temp = -37 + random.nextInt(100);
            writer.append(new IntWritable(year), new IntWritable(temp));
        }
        writer.close();
    }
}

Run the test class to generate the sequence file.

Then run the driver in IDEA or Eclipse.
IDEA needs the Hadoop dependencies or the relevant jars imported. To see what the sampler produced, you can inspect the partition file as sketched below.
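
As described at the top of the post, the partition file is a sequence file whose keys are the sampled split points and whose values are null (NullWritable). A minimal inspection sketch, assuming the D:/mr/par.lst path used by the driver above (the class name PartitionFileDump is just for illustration); with three reducers there should be two split keys.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;

// Reads back the partition file written by InputSampler.writePartitionFile.
// With 3 reduce tasks it should contain 2 split-point keys and NullWritable values.
public class PartitionFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("D:/mr/par.lst"), conf);
        IntWritable key = new IntWritable();
        NullWritable value = NullWritable.get();
        while (reader.next(key, value)) {
            System.out.println("split point: " + key.get());
        }
        reader.close();
    }
}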
