Using Hadoop's Partitioner and Shuffle-Phase Secondary Sort
1. Create an entity class
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class SecondSortData implements WritableComparable<SecondSortData> {
    private String first;
    private Integer second;

    public SecondSortData() {
    }

    public SecondSortData(String first, Integer second) {
        this.first = first;
        this.second = second;
    }

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    public Integer getSecond() {
        return second;
    }

    public void setSecond(Integer second) {
        this.second = second;
    }

    @Override
    public int compareTo(SecondSortData o) {
        // 1. first column ascending
        int result = this.getFirst().compareTo(o.getFirst());
        if (result == 0) {
            // 2. if the first fields are equal, compare the second fields: second column descending
            result = o.getSecond().compareTo(this.getSecond());
        }
        return result;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(first);
        dataOutput.writeInt(second);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.first = dataInput.readUTF();
        this.second = dataInput.readInt();
    }
}
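To see the ordering compareTo produces outside of Hadoop, here is a minimal local check (the SortCheck class name and sample values are illustrative; it only assumes the SecondSortData class above is on the classpath):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SortCheck {
    public static void main(String[] args) {
        List<SecondSortData> list = new ArrayList<>();
        list.add(new SecondSortData("hadoop", 3));
        list.add(new SecondSortData("spark", 5));
        list.add(new SecondSortData("hadoop", 10));
        Collections.sort(list); // uses compareTo: first ascending, second descending
        for (SecondSortData d : list) {
            System.out.println(d.getFirst() + " " + d.getSecond());
        }
        // Output:
        // hadoop 10
        // hadoop 3
        // spark 5
    }
}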
2. Write the job itself (create a class; here it is named SecondSort):
Note: be sure to import the correct packages.
import mapreduce.model.SecondSortData;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
2.1 To run locally, either set the HADOOP_HOME environment variable (requires restarting the IDE or machine), or set it temporarily in code; on Windows the directory it points to must contain bin\winutils.exe:
// for running locally
static {
    System.setProperty("hadoop.home.dir", "D:\\soft\\hadoop\\hadoop-2.9.2");
}
2.2 Create a static inner class that extends Mapper and override map (choose the generic types to match the problem):
public static class MyMapper extends Mapper<LongWritable, Text, SecondSortData, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split on runs of whitespace, e.g. "hadoop 10" -> ["hadoop", "10"]
        String[] lineArr = value.toString().split("\\s+");
        context.write(new SecondSortData(lineArr[0], Integer.parseInt(lineArr[1])),
                new IntWritable(Integer.parseInt(lineArr[1])));
    }
}
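For example, given the input lines

hadoop 10
spark 5
hadoop 3

the mapper emits the pairs (SecondSortData("hadoop", 10), 10), (SecondSortData("spark", 5), 5), and (SecondSortData("hadoop", 3), 3).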
2.3 Partitioner: decides which reducer each map output record is sent to. For load balancing, it should spread the data as evenly as possible across the reducers.
// custom partitioner
public static class MyPartitioner extends Partitioner<SecondSortData, IntWritable> {
    // partition by the first field of the map output key;
    // mask the sign bit so a negative hashCode cannot yield a negative partition number
    @Override
    public int getPartition(SecondSortData secondSortData, IntWritable intWritable, int numPartitions) {
        return (secondSortData.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
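A quick sanity check (a minimal sketch; it only assumes the classes above, no cluster needed): keys that share the same first field always land in the same partition.

MyPartitioner p = new MyPartitioner();
int p1 = p.getPartition(new SecondSortData("hadoop", 10), new IntWritable(10), 2);
int p2 = p.getPartition(new SecondSortData("hadoop", 3), new IntWritable(3), 2);
System.out.println(p1 == p2); // true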
2.4 Grouping comparison before the reduce aggregation
// grouping comparison before the reduce aggregation
public static class MyGroupComparator extends WritableComparator {
    protected MyGroupComparator() {
        super(SecondSortData.class, true);
    }

    // group by the first field of the map output key only;
    // calling super.compare(a, b) here would fall through to compareTo and
    // compare both fields, so every (first, second) pair would form its own group
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        SecondSortData s1 = (SecondSortData) a;
        SecondSortData s2 = (SecondSortData) b;
        return s1.getFirst().compareTo(s2.getFirst());
    }
}
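Because the sort order is (first ascending, second descending) while grouping looks only at first, each reduce call receives all values for one first key, already sorted in descending order of second. For the sample input above, the reducer is called once for "hadoop" with the values 10, 3 and once for "spark" with the value 5.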
2.5 Create a static inner class that extends Reducer and override reduce (choose the generic types to match the problem):
public static class MyReducer extends Reducer<SecondSortData, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(SecondSortData key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // values arrive in descending order of the second field (see compareTo)
        for (IntWritable value : values) {
            context.write(new Text(key.getFirst()), value);
        }
    }
}
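For the sample input above, the final output is

hadoop 10
hadoop 3
spark 5

written to the part files of the two reducers configured below.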
2.6 Driver method:
// main driver method
public static void main(String[] args) throws Exception {
    // initialize a job
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "second_sort");
    // needed when running the jar on the cluster (uncomment for cluster runs)
    //job.setJarByClass(SecondSort.class);
    // input file
    FileInputFormat.addInputPath(job, new Path(args[0]));
    // parallel map computation
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(SecondSortData.class);
    job.setMapOutputValueClass(IntWritable.class);
    // shuffle steps (handled internally)
    job.setPartitionerClass(MyPartitioner.class);
    job.setGroupingComparatorClass(MyGroupComparator.class);
    // reduce computation
    // set the number of reducers
    job.setNumReduceTasks(2);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // output directory
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // if the output directory already exists, delete it
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(new Path(args[1]))) {
        fs.delete(new Path(args[1]), true);
    }
    // submit the job (main entry point)
    boolean b = job.waitForCompletion(true);
    System.out.println(b ? 1 : 0);
}
Running locally:
1. Run the main method once first so the IDE creates a run configuration you can edit.
2. In that run configuration, set the input file path and the output path as program arguments.
3. Click Run.
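For example, the program arguments might look like this (hypothetical local paths; adjust to your machine):

D:\data\word.txt D:\data\out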
Packaging a jar and running it on the Hadoop cluster
1. Build the jar
1.1 Add this line (present in the code above, commented out; uncomment it for cluster runs):
job.setJarByClass(xxx.class);
2. Upload it to a suitable location on the Linux system
2.1 Here I used rz to upload the required XXXX.jar and the test file word.txt
3. Run
3.1 Execute the command, using the fully qualified name of the SecondSort class as the main class:
bin/yarn jar xxx.jar SecondSort file:xxxx/word.txt file:xxx/out
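As a concrete illustration only (hypothetical jar name, package, and paths; substitute your own):

bin/yarn jar second-sort.jar mapreduce.SecondSort /user/hadoop/word.txt /user/hadoop/out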