思维导图
hadoop排序特点
实例:
- 输入
- 编写类
- 结果
排序概述
Hadoop排序的特点
- Map Task和Reduce Task均会对数据(按照key)进行排序。
- 默认排序是按照字典顺序,且实现该排序的方法是快速排序。
什么时候对数据进行排序?
对于Map Task,它会将处理的结果暂时放到一个缓冲区中,当缓冲区使用率达到一定阈值后,再对缓冲区中的数据进行一次排序,并将这些有序数据写到磁盘上,而当数据处理完毕后,它会对磁盘上所有文件进行一次合并,以将这些文件合并成一个大的有序文件。
对于Reduce Task,它从每个Map Task上远程拷贝相应的数据文件,如果文件大小超过一定阈值,则放到磁盘上,否则放到内存中。如果磁盘上文件数目达到一定阈值,则进行一次合并以生成一个更大文件;如果内存中文件大小或者数目超过一定阈值,则进行一次合并后将数据写到磁盘上。当所有数据拷贝完毕后,Reduce Task统一对内存和磁盘上的所有数据进行一次合并。
Person类实例
Person类
package com.dev1.debug;
public class Person implements Comparable<Person> { // implements the comparison interface
    private String name;
    private int age;
    private int yanzhi; // "yanzhi" (颜值) = appearance score

    /**
     * Creates a person.
     *
     * @param name   the person's name
     * @param age    the person's age
     * @param yanzhi the person's appearance score
     */
    public Person(String name, int age, int yanzhi) {
        this.name = name;
        this.age = age;
        this.yanzhi = yanzhi;
    }

    @Override
    public String toString() {
        return "Person{" +
                "name='" + name + '\'' +
                ", age=" + age +
                ", yanzhi=" + yanzhi +
                '}';
    }

    /**
     * Compares by age in ascending order (a positive result means
     * {@code this} sorts after {@code o}).
     *
     * Uses {@link Integer#compare(int, int)} instead of the manual
     * three-way if/else — same sign contract, less code, and it avoids
     * the overflow trap of subtraction-based comparisons.
     */
    @Override
    public int compareTo(Person o) {
        return Integer.compare(this.age, o.age);
    }
}
二次排序
package com.dev1.debug;
public class Person implements Comparable<Person> { // implements the comparison interface
    private String name;
    private int age;
    private int yanzhi; // "yanzhi" (颜值) = appearance score

    /**
     * Creates a person.
     *
     * @param name   the person's name
     * @param age    the person's age
     * @param yanzhi the person's appearance score
     */
    public Person(String name, int age, int yanzhi) {
        this.name = name;
        this.age = age;
        this.yanzhi = yanzhi;
    }

    @Override
    public String toString() {
        return "Person{" +
                "name='" + name + '\'' +
                ", age=" + age +
                ", yanzhi=" + yanzhi +
                '}';
    }

    /**
     * Secondary sort: ascending by age first, and for equal ages,
     * ascending by appearance score.
     *
     * Uses {@link Integer#compare(int, int)} for both levels instead of
     * nested if/else chains — same sign contract, and no overflow risk.
     */
    @Override
    public int compareTo(Person o) {
        int byAge = Integer.compare(this.age, o.age);
        if (byAge != 0) {
            return byAge;
        }
        return Integer.compare(this.yanzhi, o.yanzhi);
    }
}
PersonDemo类
package com.dev1.debug;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
public class PersonDemo {
    /**
     * Demo driver: prints three Person objects in insertion order, then
     * sorts them with Collections.sort (which applies Person.compareTo)
     * and prints them again to show the effect of the natural ordering.
     */
    public static void main(String[] args) {
        List<Person> people = new ArrayList<Person>();
        people.add(new Person("张三", 20, 80));
        people.add(new Person("李四", 25, 95));
        people.add(new Person("王五", 25, 90));

        // Before sorting: insertion order.
        for (Person person : people) {
            System.out.println(person);
        }
        System.out.println("--------------------------");

        // Sort via the natural ordering defined on Person.
        Collections.sort(people);

        // After sorting.
        for (Person person : people) {
            System.out.println(person);
        }
        System.out.println("--------------------------");
    }
}
排序实例
输入
13736230513 2481 24681 27162
13846544121 264 0 264
13956435636 132 1512 1644
13966251146 240 0 240
18271575951 1527 2106 3633
FlowBean类
package com.dev1.sort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Writable key holding per-phone traffic totals. Implementing
 * WritableComparable makes the MapReduce shuffle sort records by this
 * bean's compareTo (descending total flow).
 */
public class FlowBean implements WritableComparable<FlowBean> {
    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic
    private long sumFlow;  // upFlow + downFlow

    /** No-arg constructor required by Hadoop to deserialize via reflection. */
    public FlowBean() {
    }

    public FlowBean(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /** Output format: "upFlow downFlow sumFlow", tab-separated. */
    @Override
    public String toString() {
        return upFlow +
                "\t" + downFlow +
                "\t" + sumFlow;
    }

    /**
     * Orders beans by total flow in DESCENDING order (largest first).
     *
     * BUG FIX: the original {@code (int) (o.getSumFlow() - this.sumFlow)}
     * casts a long difference to int; for large totals the difference can
     * overflow int and flip or zero the sign, producing a wrong sort
     * order. {@link Long#compare(long, long)} is overflow-safe.
     */
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.getSumFlow(), this.sumFlow);
    }

    /** Serialization order must mirror readFields exactly. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    /** Deserialization order must mirror write exactly. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }
}
FlowCountSortMapper类
package com.dev1.sort;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Maps each input line to (FlowBean, phone) so that the shuffle phase
 * sorts records by FlowBean's ordering (descending total flow).
 */
public class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
    // Reused across map() calls: Hadoop serializes the pair on
    // context.write(), so mutating the same instances each call is safe.
    FlowBean bean = new FlowBean();
    Text v = new Text();

    /**
     * Parses one tab-separated line ("phone \t upFlow \t downFlow \t sumFlow")
     * and emits the flow bean as the key with the phone number as the value.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the raw line into its tab-separated fields.
        String[] fields = value.toString().split("\t");

        long up = Long.parseLong(fields[1]);
        long down = Long.parseLong(fields[2]);

        // Populate the reusable output key/value.
        bean.setUpFlow(up);
        bean.setDownFlow(down);
        bean.setSumFlow(up + down);
        v.set(fields[0]);

        // Emit (flow, phone): the framework sorts by the FlowBean key.
        context.write(bean, v);
    }
}
FlowCountSortReducer类
package com.dev1.sort;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Inverts each sorted (flow, phone) pair back to (phone, flow) so the
 * final output reads "phone  upFlow downFlow sumFlow" per line.
 */
public class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
    /**
     * Several phones can share an identical FlowBean key (equal totals);
     * each one is written out on its own line.
     */
    @Override
    protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        for (Text phone : values) {
            context.write(phone, key);
        }
    }
}
FlowCountSortDriver类
package com.dev1.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowCountSortDriver {
    /**
     * Configures and submits the flow-sort MapReduce job.
     *
     * args[0] is the input directory, args[1] the output directory.
     * Exits with status 0 on success, 1 on failure.
     */
    public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
        // Adjust to your local paths when running outside the cluster:
        // args = new String[]{"e:/output1","e:/output2"};

        // 1. Build a job instance from a fresh configuration.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Locate the jar containing this driver class.
        job.setJarByClass(FlowCountSortDriver.class);

        // 3. Wire up the mapper and reducer for this job.
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        // 4. Map-output types: the FlowBean key drives the shuffle sort.
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        // 5. Final output types (phone -> flow).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // 6. Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Optional: partition by phone prefix with a matching task count.
        // job.setPartitionerClass(ProvincePartitioner.class);
        // job.setNumReduceTasks(5);

        // 7. Submit the job (with its jar and configuration) and wait.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
ProvincePartitioner类
package com.dev1.sort;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Routes each record to a reducer partition by the first three digits of
 * the phone number carried in the value.
 */
public class ProvincePartitioner extends Partitioner<FlowBean, Text> {
    /** Partition for numbers matching no known prefix. */
    private static final int DEFAULT_PARTITION = 4;

    /**
     * Prefixes 136..139 map to partitions 0..3; everything else falls
     * into partition 4.
     *
     * ROBUSTNESS FIX: the original called substring(0, 3) unconditionally,
     * which throws StringIndexOutOfBoundsException on values shorter than
     * three characters; such records now go to the default partition.
     */
    @Override
    public int getPartition(FlowBean key, Text value, int numPartitions) {
        String phone = value.toString();
        if (phone.length() < 3) {
            return DEFAULT_PARTITION; // guard against malformed input
        }
        // Dispatch on the three-digit prefix.
        switch (phone.substring(0, 3)) {
            case "136":
                return 0;
            case "137":
                return 1;
            case "138":
                return 2;
            case "139":
                return 3;
            default:
                return DEFAULT_PARTITION;
        }
    }
}
输出
13736230513 2481 24681 27162
18271575951 1527 2106 3633
13956435636 132 1512 1644
13846544121 264 0 264
13966251146 240 0 240