Partitioning
Business scenario for partitioning
When computing statistics, we rarely want only a single grand total; we usually also aggregate along different dimensions, much like the GROUP BY clause in SQL, which first groups the data and then aggregates within each group.
The Partitioner facility in MapReduce serves the same purpose: the data is first split into partitions, and the statistics are then computed within each partition.
In MapReduce, partition numbers start at 0. Each partition corresponds to one ReduceTask (which is essentially a thread), and each ReduceTask writes one result file named part-r-0000x.
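For example, a job that runs with 3 partitions leaves three result files in its output directory (a hypothetical listing; the names follow the part-r-0000x pattern):
part-r-00000
part-r-00001
part-r-00002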
Characteristics of partitioning
1. In MapReduce, partition numbers start at 0 and increase consecutively.
2. In MapReduce, each partition requires its own ReduceTask (a thread spawned for the Reduce phase), and each ReduceTask produces one result file.
3. If no partitioner is specified, the default partitioner class is HashPartitioner: it takes the key's hashCode, converts it to a non-negative integer, and then takes the remainder modulo the configured number of ReduceTasks.
The default numReduceTasks is 1, so the remainder is always 0: every record lands in the same partition, and only one result file is produced.
// The default partitioner, from Hadoop's org.apache.hadoop.mapred.lib package:
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

public class HashPartitioner<K2, V2> implements Partitioner<K2, V2> {

    public void configure(JobConf job) {}

    /** Use {@link Object#hashCode()} to partition. */
    public int getPartition(K2 key, V2 value, int numReduceTasks) {
        // The & with Integer.MAX_VALUE clears the sign bit, so the hashCode
        // becomes a non-negative integer before the remainder is taken
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}
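To see the formula in action, here is a small standalone sketch (not part of any job; the key strings are arbitrary examples) that prints which partition each key would be assigned with 3 ReduceTasks:

public class HashPartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        for (String key : new String[]{"zhang", "wang", "li"}) {
            // Same computation as HashPartitioner.getPartition
            int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println(key + " -> partition " + partition);
        }
    }
}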
Custom partitioner class
1) Extend the Partitioner class and override the partitioning rule (see the skeleton after this list).
The value returned by getPartition determines which partition a record is sent to.
2) Register the partitioner class in the Driver.
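A minimal skeleton of the two steps (MyPartitioner and the Text value type here are placeholders, not from the original project; the CityPartitioner example below fills in a real rule):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Step 1: extend Partitioner and return a partition number in [0, numPartitions)
public class MyPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        return 0; // replace with the real partitioning rule
    }
}

// Step 2, inside the Driver's main: register the class and set a matching
// ReduceTask count, e.g.
//   job.setPartitionerClass(MyPartitioner.class);
//   job.setNumReduceTasks(3);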
Test case
Requirement: total the traffic usage not only per person, but also per city.
Analysis: the per-city totals cannot be computed in one undivided pass; the data must first be split into partitions by city, with each partition handled by its own ReduceTask.
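For reference, a few hypothetical lines of flow.txt in the space-delimited format the Mapper below expects (phone city name flow):

13877779999 bj zhang 2481
13766668888 sh wang 1341
13600001111 bj li 1024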
Code before partitioning is introduced
package flow;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Custom Writable carrying one traffic record: phone, city, name, and flow
public class Flow implements Writable {

    private String phone = "";
    private String city = "";
    private String name = "";
    private int flow;

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getFlow() {
        return flow;
    }

    public void setFlow(int flow) {
        this.flow = flow;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // Serialize the fields, writing the phone number first
        dataOutput.writeUTF(this.phone);
        dataOutput.writeUTF(this.city);
        dataOutput.writeUTF(this.name);
        dataOutput.writeInt(this.flow);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Deserialize the fields in exactly the same order they were written
        this.phone = dataInput.readUTF();
        this.city = dataInput.readUTF();
        this.name = dataInput.readUTF();
        this.flow = dataInput.readInt();
    }
}
package flow;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SerialFlowMapper extends Mapper<LongWritable, Text, Text, Flow> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is space-delimited: phone city name flow
        String[] s = value.toString().split(" ");
        Flow f = new Flow();
        f.setPhone(s[0]);
        f.setCity(s[1]);
        f.setName(s[2]);
        f.setFlow(Integer.parseInt(s[3]));
        // Key by person name so the Reducer can total flow per person
        context.write(new Text(f.getName()), f);
    }
}
package flow;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SerialFlowReducer extends Reducer<Text, Flow, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<Flow> values, Context context) throws IOException, InterruptedException {
        // Sum the flow of every record that shares this key (person name)
        int flow = 0;
        for (Flow f : values) {
            flow += f.getFlow();
        }
        context.write(key, new IntWritable(flow));
    }
}
package flow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SerialFlowDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(SerialFlowDriver.class);
        job.setMapperClass(SerialFlowMapper.class);
        job.setReducerClass(SerialFlowReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/txt/flow.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop01:9000/result/flow"));
        job.waitForCompletion(true);
    }
}
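Since this Driver never calls setNumReduceTasks, the job runs with the default single ReduceTask, so all per-person totals end up in one file, part-r-00000.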
Introducing the custom partitioner
Goal: make the job total traffic usage per city
package flow;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Generic parameters match the Mapper's output key/value types
public class CityPartitioner extends Partitioner<Text, Flow> {

    @Override
    public int getPartition(Text text, Flow flow, int numPartitions) {
        String city = flow.getCity();
        if ("bj".equals(city))
            return 0;
        if ("sh".equals(city))
            return 1;
        // Every other city falls into the third partition
        return 2;
    }
}
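Note that getPartition must return a value in [0, numPartitions): records from bj go to partition 0, records from sh to partition 1, and records from any other city to partition 2, so the Driver has to run exactly 3 ReduceTasks.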
Modify the Driver:
Set the partitioner class
Set the number of ReduceTasks
package cn.tedu.partflow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class PartFlowDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PartFlowDriver.class);
        job.setMapperClass(PartFlowMapper.class);
        job.setReducerClass(PartFlowReducer.class);
        // Register the custom partitioner
        job.setPartitionerClass(CityPartitioner.class);
        // Each partition is served by one ReduceTask:
        // CityPartitioner defines 3 partitions, so 3 ReduceTasks are needed
        job.setNumReduceTasks(3);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Flow.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job,
                new Path("hdfs://hadoop01:9000/txt/flow.txt"));
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://hadoop01:9000/result/partflow"));
        job.waitForCompletion(true);
    }
}
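With 3 ReduceTasks, the output directory will contain three result files: part-r-00000 with the bj totals, part-r-00001 with the sh totals, and part-r-00002 with the totals for all other cities.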
Totaling student scores by month
Input directory: score1
Key-value pair emitted by Map: student name, Score object
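A few hypothetical lines from the score1 directory, in the space-delimited format the Mapper below parses (month name score):

1 zhang 89
2 zhang 73
3 wang 91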
Encapsulate a model class
The Score object
package partiScore;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Custom Writable carrying one score record: month, student name, and score
public class Score implements Writable {

    private int month;
    private String name = "";
    private int score;

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getScore() {
        return score;
    }

    public void setScore(int score) {
        this.score = score;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        // Serialize the fields
        dataOutput.writeInt(this.month);
        dataOutput.writeUTF(this.name);
        dataOutput.writeInt(this.score);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Deserialize the fields in the same order they were written
        this.month = dataInput.readInt();
        this.name = dataInput.readUTF();
        this.score = dataInput.readInt();
    }
}
Mapper
package partiScore;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SumMapper extends Mapper<LongWritable, Text, Text, Score> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is space-delimited: month name score, e.g. "1 zhang 89"
        String[] s = value.toString().split(" ");
        Score score = new Score();
        score.setMonth(Integer.parseInt(s[0]));
        score.setName(s[1]);
        score.setScore(Integer.parseInt(s[2]));
        // Key by student name so the Reducer totals scores per student
        context.write(new Text(score.getName()), score);
    }
}
Reducer
package partiScore;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SumReducer extends Reducer<Text, Score, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<Score> values, Context context) throws IOException, InterruptedException {
        // Sum this student's scores within the current partition (month)
        int sum = 0;
        for (Score s : values) {
            sum += s.getScore();
        }
        context.write(key, new IntWritable(sum));
    }
}
Partitioner
package partiScore;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class MonthPartitioner extends Partitioner<Text, Score> {

    @Override
    public int getPartition(Text text, Score score, int numPartitions) {
        // Months 1..3 map to partitions 0..2
        return score.getMonth() - 1;
    }
}
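Because getPartition returns month - 1, this partitioner assumes the data covers only months 1 through 3, matching the 3 ReduceTasks configured in the Driver; a month outside that range would produce an illegal partition number and fail the job.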
Driver
package partiScore;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SumDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(SumDriver.class);
        job.setMapperClass(SumMapper.class);
        job.setReducerClass(SumReducer.class);
        job.setPartitionerClass(MonthPartitioner.class);
        // One ReduceTask per month partition (months 1..3 -> partitions 0..2)
        job.setNumReduceTasks(3);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Score.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/txt/score1/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop01:9000/result/partMonthScore"));
        job.waitForCompletion(true);
    }
}
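Each of the three result files, part-r-00000 through part-r-00002, then holds the per-student score totals for one month.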