1. Write three classes: Mapper, Reducer, and Driver (three versions of the program)
(1) wordCount_FlowBean
Mapper, Reducer, FlowBean;
(2) wordCount_FlowBean&partition
Mapper, Reducer, FlowBean, Partitioner;
(3) wordCount_FlowBean&partition&comparable
Mapper, Reducer, FlowBean, Partitioner; additionally, override the compareTo method in FlowBean;
2. The Mapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    private Text outK = new Text();
    private FlowBean outV = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Read one line, e.g.:
        // 1 13736230513 192.196.100.1 www.atguigu.com 2481 24681 200
        String line = value.toString();
        //2. Split on tabs
        String[] words = line.split("\t");
        //3. Extract the fields we need: phone number, upstream traffic, downstream traffic
        String phone = words[1];
        String up = words[words.length - 3];
        String down = words[words.length - 2];
        //4. Populate the output key and value
        outK.set(phone);
        outV.setUpFlow(Long.parseLong(up));
        outV.setDownFlow(Long.parseLong(down));
        outV.setSumFlow();
        //5. Write out
        context.write(outK, outV);
    }
}
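The upstream/downstream fields are indexed from the end of the array rather than the front, most likely because some records in the sample dataset omit the domain column, so counting from the end keeps the indices stable. A minimal standalone sketch of the extraction logic (the class name is hypothetical; the sample line is the tab-separated record from the comment above):

public class FieldIndexDemo {
    public static void main(String[] args) {
        String line = "1\t13736230513\t192.196.100.1\twww.atguigu.com\t2481\t24681\t200";
        String[] words = line.split("\t");
        System.out.println(words[1]);                 // 13736230513 (phone)
        System.out.println(words[words.length - 3]);  // 2481 (upstream)
        System.out.println(words[words.length - 2]);  // 24681 (downstream)
    }
}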
3. The Reducer
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    private FlowBean outV = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        //1. Iterate over the values and accumulate the totals.
        // Note: Hadoop reuses the FlowBean instance behind this iterator, so we only
        // read primitive values out of it; never hold a reference to `value` itself.
        long totalUp = 0;
        long totalDown = 0;
        for (FlowBean value : values) {
            totalUp += value.getUpFlow();
            totalDown += value.getDownFlow();
        }
        //2. Populate the output value
        outV.setUpFlow(totalUp);
        outV.setDownFlow(totalDown);
        outV.setSumFlow();
        //3. Write out
        context.write(key, outV);
    }
}
4. The Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //1. Get the Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        //2. Set the jar
        job.setJarByClass(FlowDriver.class);
        //3. Wire up the Mapper and Reducer
        job.setMapperClass(FlowMapper.class);
        job.setReducerClass(FlowReducer.class);
        //4. Set the Mapper's output key and value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        //5. Set the final output key and value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        //6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //7. Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
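To run the packaged job (the jar name and paths here are hypothetical), submit it with the hadoop command; the two arguments become args[0] and args[1] above, and the output directory must not already exist, or the job fails with an "output directory already exists" error:

hadoop jar flow.jar FlowDriver /input/phone_data.txt /output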
5. FlowBean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
 * 1. Define a class that implements the Writable interface
 * 2. Override the serialization and deserialization methods
 * 3. Provide a no-arg constructor
 * 4. Override toString
 */
public class FlowBean implements Writable {
    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic
    private long sumFlow;  // total traffic

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    public void setSumFlow() {
        this.sumFlow = this.upFlow + this.downFlow;
    }

    // No-arg constructor, required so Hadoop can instantiate the bean during deserialization
    public FlowBean() {
    }

    /*
     * Serialization and deserialization must handle the fields in exactly the same order.
     */
    @Override
    public void write(DataOutput out) throws IOException { // serialize
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException { // deserialize
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }
}
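The toString method determines how each record is rendered in the reducer's output file. For the single sample record from the Mapper comment, the output line would be (with only one record for the key, the totals are just that record's own values; 2481 + 24681 = 27162):

13736230513	2481	24681	27162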
6. The Partitioner
(1)Partitioner
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        // text is the phone number; partition by its 3-digit prefix
        String phone = text.toString();
        String prePhone = phone.substring(0, 3);
        int partition;
        if ("136".equals(prePhone)) {
            partition = 0;
        } else if ("137".equals(prePhone)) {
            partition = 1;
        } else if ("138".equals(prePhone)) {
            partition = 2;
        } else if ("139".equals(prePhone)) {
            partition = 3;
        } else {
            partition = 4;
        }
        return partition;
    }
}
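For contrast, the default partitioner is HashPartitioner, which spreads keys across ReduceTasks by hash code; its core logic is a one-liner:

// Default HashPartitioner logic, for comparison:
// return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;

The custom class above replaces that hash with a fixed prefix-to-partition mapping, so records for each phone prefix land in a predictable output file.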
(2) Add the following to the driver:
job.setPartitionerClass(ProvincePartitioner.class);
job.setNumReduceTasks(n); // n = number of partitions
(3) Partitioning rules (illustrated in the sketch below):
a. If the number of ReduceTasks > the number of partitions getPartition produces, the extra ReduceTasks simply emit empty output files part-r-000xx;
b. If 1 < number of ReduceTasks < number of partitions, some partition data has no ReduceTask to receive it, and the job throws an Exception;
c. If the number of ReduceTasks = 1, then no matter how many partitions the MapTask side produces, everything goes to that single ReduceTask, and only one result file, part-r-00000, is created;
d. Partition numbers must start at 0 and increase consecutively;
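A sketch of how these rules play out with the 5-way ProvincePartitioner above (file names assume the default output naming):

job.setNumReduceTasks(5);   // one file per partition: part-r-00000 .. part-r-00004
// job.setNumReduceTasks(8);   // rule a: three extra empty files part-r-00005 .. part-r-00007
// job.setNumReduceTasks(3);   // rule b: partitions 3 and 4 have no ReduceTask -> exception
// job.setNumReduceTasks(1);   // rule c: partitioner bypassed, single file part-r-00000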
7. Comparable
Have FlowBean implement WritableComparable and override the compareTo method:
@Override
public int compareTo(FlowBean o) {
    // Sort by total traffic in descending order; break ties by upstream traffic, also descending
    if (this.sumFlow > o.sumFlow) {
        return -1;
    } else if (this.sumFlow < o.sumFlow) {
        return 1;
    } else {
        if (this.upFlow > o.upFlow) {
            return -1;
        } else if (this.upFlow < o.upFlow) {
            return 1;
        } else {
            return 0;
        }
    }
}
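Since MapReduce sorts on map output keys, in this variant FlowBean would serve as the Mapper's output key rather than the value. The declaration change itself is small, because WritableComparable extends Writable (a sketch; everything else stays as in section 5):

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {
    // fields, getters/setters, write(), readFields(), and toString()
    // unchanged from section 5, plus the compareTo method above
}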
Based on the Atguigu (尚硅谷) Hadoop video course. Original video: http://www.atguigu.com/