1. Sorting
Built-in Writable types such as Text and IntWritable already carry default comparison rules.
Hadoop exposes these rules as comparator classes, so the sort order can be redefined.
How: extend the Comparator inner class of Text (or IntWritable, etc.), override its compare method, and register the new comparator on the job.
To re-sort the WordCount output in reverse lexicographic order:
import org.apache.hadoop.io.Text;

/*
 * Override Text's raw comparison so that
 * results sort in reverse lexicographic order.
 */
public class ReverseWordComparator extends Text.Comparator {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Negate the default comparison to invert the order.
        return -super.compare(b1, s1, l1, b2, s2, l2);
    }
}
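A custom comparator only takes effect once it is registered on the job. A minimal sketch of the driver-side setup (the rest of the job wiring is assumed to be the usual WordCount setup):

Job job = Job.getInstance();
// ... mapper/reducer/output-path setup as usual ...
// Replace the default Text comparator used for the shuffle sort:
job.setSortComparatorClass(ReverseWordComparator.class);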
To sort the WordCount output by string length, comparing by the sum of character values when lengths are equal:
import org.apache.hadoop.io.Text;

/*
 * Override Text's raw comparison:
 * sort by string length first;
 * if lengths are equal, compare by the sum of character values
 * (so e.g. "abc" and "cba" compare as equal).
 */
public class LengthComparator extends Text.Comparator {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Text is serialized as a vint length followed by the bytes;
        // skip the leading length byte (valid for strings under 128 bytes).
        String str1 = new String(b1, s1 + 1, l1 - 1);
        String str2 = new String(b2, s2 + 1, l2 - 1);
        if (str1.length() == str2.length()) {
            return getSize(str1) - getSize(str2);
        } else {
            return str1.length() - str2.length();
        }
    }

    // Sum of the character values in the string.
    public int getSize(String s) {
        char[] chars = s.toCharArray();
        int sum = 0;
        for (char ch : chars) {
            sum += ch;
        }
        return sum;
    }
}
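As before, this comparator does nothing until it is registered on the job:

job.setSortComparatorClass(LengthComparator.class);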
2. Partitioning
Partitioning lets you route different records into different result files, as the business logic requires.
The partition rule is pluggable; the default is hash partitioning, computed from the key (see the sketch after the list below).
When partitioning by business logic, watch out for load balancing across partitions.
How: write a class that extends Partitioner and overrides getPartition, then set the partition rule with job.setPartitionerClass() and the reducer count with job.setNumReduceTasks().
Relationship between the number of reducers and the number of partitions:
Ideally, the number of reducers equals the number of partitions.
With only one reducer the job still runs (all records go to a single output), but partitioning is then pointless.
The number of reducers may exceed the number of partitions, but some reducers will receive no data and produce empty output files.
Apart from the single-reducer case, the number of reducers must not be smaller than the number of partitions, or map tasks fail with an illegal-partition error.
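For reference, the default hash partitioning boils down to the following; this mirrors Hadoop's built-in HashPartitioner (the class name here is just illustrative):

import org.apache.hadoop.mapreduce.Partitioner;

// Equivalent of the default HashPartitioner: mask off the sign bit,
// then take the key's hash modulo the number of reduce tasks.
public class DefaultLikePartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}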
① Put phone numbers that share the same prefix into the same file:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/*
 * Put phone numbers that share the same prefix into the same file.
 * When partitioning by custom business logic, watch out for load balancing.
 * (Flow is the custom Writable value type from the earlier traffic-statistics example.)
 */
public class PhonePartitioner extends Partitioner<Text, Flow> {
    @Override
    public int getPartition(Text text, Flow flow, int numPartitions) {
        String phoneNum = text.toString();
        String firstSecondNum = phoneNum.substring(0, 2); // first two digits
        if ("13".equals(firstSecondNum)) {
            return 0;
        } else if ("15".equals(firstSecondNum)) {
            return 1;
        } else if ("18".equals(firstSecondNum)) {
            return 2;
        } else {
            return 3;
        }
    }
}
Then configure it on the job:
job.setNumReduceTasks(4); // four partitions, hence four output files
job.setPartitionerClass(PhonePartitioner.class);
② Split the WordCount output into four files by first letter:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        String word = text.toString();
        // Guard against empty words so charAt(0) cannot throw.
        char first;
        if (word.length() == 0) {
            first = ' ';
        } else {
            first = word.charAt(0);
        }
        if ((first >= 'a' && first < 'g') || (first >= 'A' && first < 'G')) {
            return 0;
        } else if ((first >= 'g' && first < 'o') || (first >= 'G' && first < 'O')) {
            return 1;
        } else if ((first >= 'o' && first < 'u') || (first >= 'O' && first < 'U')) {
            return 2;
        } else {
            return 3;
        }
    }
}
Then configure it on the job:
job.setNumReduceTasks(4);
job.setPartitionerClass(WordPartitioner.class);
3. Counters
Counters come in two kinds. Built-in counters (for example "Map input records" and "Reduce input records") are collected automatically and printed with the job output.
Custom counters can be defined in two ways:
① By group and name: Counter counter = context.getCounter(groupName, counterName);
② Via an enum:
public enum MC {
    ERROR
}
Counter error = context.getCounter(MC.ERROR);
Common calls: counter.increment(1); bumps a counter (increment takes a long delta), and
job.getCounters().getGroup(groupName).iterator(); walks a counter group after the job completes.
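A minimal sketch of using such a counter inside a Mapper; the class name, record format, and field count here are assumptions for illustration:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CountingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    public enum MC { ERROR } // enum-based counter

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(" ");
        if (fields.length != 2) {
            // Count malformed lines instead of failing the task.
            context.getCounter(MC.ERROR).increment(1);
            return;
        }
        context.write(new Text(fields[0]), new IntWritable(Integer.parseInt(fields[1])));
    }
}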
4. Combiner
A Combiner is written exactly like a reducer, but it runs in a different place: on the map side, over each map task's output. It is effectively an extra, local reduce pass, which improves performance by shrinking the data the map phase sends to the reduce phase.
Also, because the combiner's input is the map output and its output becomes the reducer's input, the combiner's input and output key/value types must be identical (both equal to the map output types)!
How: write a class that extends Reducer and overrides reduce, then register it with job.setCombinerClass().
Example: sum the temperatures for each month
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class ForSumTemp {
    // Map "month temperature" lines to <month, temperature>.
    public static class ForMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text oKey = new Text();
        private IntWritable oValue = new IntWritable();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] strs = line.split(" ");
            oKey.set(strs[0]);
            oValue.set(Integer.parseInt(strs[1]));
            context.write(oKey, oValue);
        }
    }

    // Runs on the map side; pre-sums the temperatures per month.
    public static class ForCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable oValue = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            oValue.set(sum);
            context.write(key, oValue);
        }
    }

    // Final per-month sum over the combiner's partial sums.
    public static class ForReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setCombinerClass(ForCombiner.class);
        // Delete the local output directory if it already exists.
        FileSystem fileSystem = FileSystem.get(new URI("file:///"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileInputFormat.addInputPath(job, new Path("E:\\forTestData\\forCombiner"));
        FileOutputFormat.setOutputPath(job, path);
        job.waitForCompletion(true);
    }
}
Comparing the job counters with and without the Combiner shows its effect (the original screenshots are omitted here): with the Combiner enabled, the reducer's input record count drops, because the map outputs were already merged once before the shuffle.
Example: compute the average temperature for each month
An average of averages is not a correct average, so the combiner cannot emit per-month means directly. Instead, define an entity class that carries the running temperature sum together with the day count; such pairs can be merged safely at any stage.
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Custom value type carrying a partial temperature sum and the number of days it covers.
public class SumAndDay implements Writable {
    private double sumTemp;
    private int days;

    // Writables need a no-arg constructor for deserialization.
    public SumAndDay() {
    }

    public SumAndDay(double sumTemp, int days) {
        this.sumTemp = sumTemp;
        this.days = days;
    }

    public double getSumTemp() {
        return sumTemp;
    }

    public void setSumTemp(double sumTemp) {
        this.sumTemp = sumTemp;
    }

    public int getDays() {
        return days;
    }

    public void setDays(int days) {
        this.days = days;
    }

    // Fields must be written and read back in the same order.
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeDouble(sumTemp);
        dataOutput.writeInt(days);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        sumTemp = dataInput.readDouble();
        days = dataInput.readInt();
    }
}
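As a quick sanity check (not part of the original job), the Writable contract can be verified with an in-memory round trip; the values here are arbitrary, and SumAndDay is assumed to be on the classpath:

import java.io.*;

public class SumAndDayRoundTrip {
    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        new SumAndDay(61.5, 2).write(new DataOutputStream(bos));

        SumAndDay back = new SumAndDay();
        back.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        // Prints "61.5 over 2 days" if write() and readFields() agree.
        System.out.println(back.getSumTemp() + " over " + back.getDays() + " days");
    }
}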
Computing the average temperature:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
public class ForAverTemp {
    // Map "month temperature" lines to <month, (sum=temperature, days=1)>.
    public static class ForMapper extends Mapper<LongWritable, Text, Text, SumAndDay> {
        private Text oKey = new Text();
        private SumAndDay oValue = new SumAndDay();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] strs = line.split(" ");
            oKey.set(strs[0]);
            oValue.setDays(1);
            oValue.setSumTemp(Integer.parseInt(strs[1]));
            context.write(oKey, oValue);
        }
    }

    // Map-side pre-aggregation: merge (sum, days) pairs. This is safe because
    // sums and counts combine associatively, unlike averages.
    public static class ForCombiner extends Reducer<Text, SumAndDay, Text, SumAndDay> {
        private SumAndDay oValue = new SumAndDay();
        @Override
        protected void reduce(Text key, Iterable<SumAndDay> values, Context context) throws IOException, InterruptedException {
            int day = 0;
            double sum = 0; // double, since getSumTemp() returns double
            for (SumAndDay sad : values) {
                day += sad.getDays();
                sum += sad.getSumTemp();
            }
            oValue.setDays(day);
            oValue.setSumTemp(sum);
            context.write(key, oValue);
        }
    }

    // Final step: divide the total by the day count to get the average.
    public static class ForReducer extends Reducer<Text, SumAndDay, Text, DoubleWritable> {
        @Override
        protected void reduce(Text key, Iterable<SumAndDay> values, Context context) throws IOException, InterruptedException {
            int day = 0;
            double sum = 0;
            for (SumAndDay sad : values) {
                day += sad.getDays();
                sum += sad.getSumTemp();
            }
            // Round to two decimal places.
            DecimalFormat df = new DecimalFormat("#.00");
            double averTemp = Double.parseDouble(df.format(sum / day));
            context.write(key, new DoubleWritable(averTemp));
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(SumAndDay.class);
        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        job.setCombinerClass(ForCombiner.class);
        // Delete the local output directory if it already exists.
        FileSystem fileSystem = FileSystem.get(new URI("file:///"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileInputFormat.addInputPath(job, new Path("E:\\forTestData\\forCombiner"));
        FileOutputFormat.setOutputPath(job, path);
        job.waitForCompletion(true);
    }
}