Sometimes the processed data needs to be stored in separate categories, which calls for a custom Partitioner.
How a MapReduce job runs:
1. Hadoop reads the input file and passes each record to the map method;
2. the map method emits key-value (KV) pairs, which are handed to the partitioner;
3. the partitioner assigns each pair to a Reduce task;
4. each reduce method processes the KV pairs it receives;
5. Hadoop writes the output of each Reduce task to a separate file.
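As a concrete illustration (the three records below are invented sample data, laid out in the tab-separated ID / phone / flow format the Mapper below expects), the Partitioner in this example routes each phone number by the parity of its last digit, so with two reduce tasks the summed results land in two files:

1	13726230501	2481    -> last digit 1 -> partition 1 -> part-r-00001
2	13726230502	1116    -> last digit 2 -> partition 0 -> part-r-00000
3	13726230501	528     -> last digit 1 -> partition 1 -> part-r-00001 (the Reducer sums this with the first record to 3009)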
Main class (driver)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * The main method expects two arguments: 1. the input path; 2. the output path.
 * Created by hadoop on 17-2-18.
 */
public class JobSubmitter {
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.out.println("Invalid arguments: <input path> <output path> required");
            return;
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JobSubmitter.class);
        job.setMapperClass(ProvinceFlowCountMapper.class);
        job.setReducerClass(ProvinceFlowCountReducer.class);
        // When the map output types match the final (reduce) output types,
        // the following two lines can be omitted.
        // job.setMapOutputKeyClass(Text.class);
        // job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Register the custom partitioner, replacing the default HashPartitioner.
        job.setPartitionerClass(ProvincePartitioner.class);
        // The number of reduce tasks must match the number of partitions the
        // partitioner produces (ProvincePartitioner returns 0 or 1).
        // With more reduce tasks than partitions, the extra tasks write empty files.
        // With fewer reduce tasks than partitions, the job fails. The only exception
        // is a single reduce task: it receives all the data, so no partitioning takes place.
        job.setNumReduceTasks(2);
        // Taking the paths from the command line keeps the job flexible.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
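A typical way to submit the job (the jar name and HDFS paths below are placeholders, not from the original post) is to package the three classes into a jar and pass the two paths the driver expects:

hadoop jar flowcount.jar JobSubmitter /flow/input /flow/output

Because setNumReduceTasks(2) is used, the output directory will contain one file per reduce task: part-r-00000 and part-r-00001.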
Mapper class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Counts mobile traffic per phone number.
 * Input record layout (tab-separated): ID(int)  phone number(varchar)  flow(int)
 * Created by hadoop on 17-2-18.
 */
public class ProvinceFlowCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reuse these Writable objects across map() calls to cut down on garbage collection.
    private Text phone = new Text();
    private IntWritable flow = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
        phone.set(fields[1]);
        flow.set(Integer.parseInt(fields[2]));
        // Allocating a new object on every call, as below, is less efficient:
        // IntWritable flow = new IntWritable(Integer.parseInt(fields[2]));
        context.write(phone, flow);
    }
}
Reducer class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Sums the flow values collected for each phone number.
 * Created by hadoop on 17-2-18.
 */
public class ProvinceFlowCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int flowSum = 0;
        for (IntWritable value : values) {
            flowSum += value.get();
        }
        context.write(key, new IntWritable(flowSum));
    }
}
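Since the reduce logic is a plain sum, the same class could also be registered as a combiner in the driver to shrink the data shuffled between map and reduce. This is an optional optimization that is not part of the original post:

job.setCombinerClass(ProvinceFlowCountReducer.class);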
Partitioner class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Assigns a partition to each KV pair emitted by the map function; called once per pair.
 * Created by hadoop on 17-2-18.
 */
public class ProvincePartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        String phone = text.toString();
        // Route by the parity of the phone number's last digit: returns 0 or 1.
        String lastDigit = phone.substring(phone.length() - 1);
        return Integer.parseInt(lastDigit) % 2;
    }
}
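The parity-of-last-digit rule above is only a stand-in for a real routing rule. If the goal implied by the class name is to partition by province, one common approach is to look up the phone-number prefix in a static table. The sketch below is my own illustration, not part of the original code, and the prefix-to-partition mapping is invented:

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PrefixProvincePartitioner extends Partitioner<Text, IntWritable> {
    // Hypothetical prefix-to-partition table; a real one would be loaded from a lookup file.
    private static final Map<String, Integer> PROVINCE_MAP = new HashMap<>();
    static {
        PROVINCE_MAP.put("137", 0);
        PROVINCE_MAP.put("138", 1);
        PROVINCE_MAP.put("139", 2);
    }

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String prefix = key.toString().substring(0, 3);
        // Unknown prefixes fall back to a catch-all partition 3.
        Integer partition = PROVINCE_MAP.get(prefix);
        return partition == null ? 3 : partition;
    }
}

As with the original class, the number set by job.setNumReduceTasks() in the driver has to match the number of partitions returned here (4 in this sketch).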