Hadoop serialization and deserialization
-
Java serialization (Serializable) is a heavyweight serialization framework: a serialized object carries a lot of extra information (checksums, stream headers, the class inheritance hierarchy, and so on), which makes it inefficient to transmit over the network. Hadoop therefore uses its own serialization mechanism (Writable), which is compact and efficient: instead of shipping an object's whole parent/child class structure the way Java serialization does, it transmits only the field values that are actually needed, greatly reducing network overhead.
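As a rough illustration of the overhead difference, the standalone sketch below (the class names are made up for this example) serializes a single int both ways and prints the byte counts; Java serialization emits a stream header and class metadata, while Writable emits only the raw bytes of the value.
import org.apache.hadoop.io.IntWritable;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;

public class SerializationSizeDemo {
    // a plain Java bean holding a single int (hypothetical, for comparison only)
    static class JavaBean implements Serializable {
        int value = 42;
    }

    public static void main(String[] args) throws IOException {
        // Java serialization: stream header, class descriptor, field metadata, then the value
        ByteArrayOutputStream javaBytes = new ByteArrayOutputStream();
        try (ObjectOutputStream oos = new ObjectOutputStream(javaBytes)) {
            oos.writeObject(new JavaBean());
        }

        // Writable: only the raw 4-byte int is written
        ByteArrayOutputStream writableBytes = new ByteArrayOutputStream();
        try (DataOutputStream dos = new DataOutputStream(writableBytes)) {
            new IntWritable(42).write(dos);
        }

        System.out.println("Java serialization: " + javaBytes.size() + " bytes"); // typically dozens of bytes
        System.out.println("Writable:           " + writableBytes.size() + " bytes"); // 4 bytes
    }
}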
-
Writable is Hadoop's serialization format; Hadoop defines a Writable interface for it, and a class supports serialization simply by implementing that interface.
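For reference, org.apache.hadoop.io.Writable declares exactly the two methods implemented in the demo below:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public interface Writable {
    // serialize this object's fields to the output stream
    void write(DataOutput out) throws IOException;
    // populate this object's fields from the input stream
    void readFields(DataInput in) throws IOException;
}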
-
Writable also has a sub-interface, WritableComparable, which supports both serialization and key comparison; by implementing WritableComparable in a custom key class we can control how records are sorted (see the sketch after the checklist below).
To pass a bean object between tasks inside the Hadoop framework, the object must be made serializable:
(1) it must implement the Writable interface;
(2) deserialization instantiates the class reflectively through its no-argument constructor, so the class must provide one.
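The following minimal sketch illustrates both requirements plus the sorting hook from WritableComparable; the PhoneKey class and its field are hypothetical and are not part of the demo that follows.
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class PhoneKey implements WritableComparable<PhoneKey> {
    private String phone;

    // (2) no-arg constructor, called reflectively during deserialization
    public PhoneKey() {
    }

    // (1) serialize and deserialize the fields in the same order
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phone);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        phone = in.readUTF();
    }

    // keys are ordered by this method during the shuffle/sort phase
    @Override
    public int compareTo(PhoneKey other) {
        return phone.compareTo(other.phone);
    }
}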
DEMO
- Requirement: given per-user mobile internet usage detail records, compute for each phone number the total upstream packet count, total downstream packet count, total upstream traffic, and total downstream traffic. Each input line is a tab-separated record; the fields used here are the phone number (column 2) and the four packet/traffic columns (columns 7 to 10), as in the sample line shown in the mapper.
Code implementation
- Define the JavaBean:
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// serializable/deserializable bean for one traffic record
public class FlowBean implements Writable {
    // upstream packet count
    private Integer upPackNum;
    // downstream packet count
    private Integer downPackNum;
    // total upstream traffic
    private Integer upPayLoad;
    // total downstream traffic
    private Integer downPayLoad;

    // no-arg constructor, needed for deserialization (invoked via reflection)
    public FlowBean() {
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // when serializing, use the write method that matches each field's type,
        // and remember the order in which the fields are written
        out.writeInt(upPackNum);
        out.writeInt(downPackNum);
        out.writeInt(upPayLoad);
        out.writeInt(downPayLoad);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // the deserialization order must match the serialization order,
        // and the read method must match each field's type
        this.upPackNum = in.readInt();
        this.downPackNum = in.readInt();
        this.upPayLoad = in.readInt();
        this.downPayLoad = in.readInt();
    }

    public Integer getUpPackNum() {
        return upPackNum;
    }

    public Integer getDownPackNum() {
        return downPackNum;
    }

    public Integer getUpPayLoad() {
        return upPayLoad;
    }

    public Integer getDownPayLoad() {
        return downPayLoad;
    }

    public void setUpPackNum(Integer upPackNum) {
        this.upPackNum = upPackNum;
    }

    public void setDownPackNum(Integer downPackNum) {
        this.downPackNum = downPackNum;
    }

    public void setUpPayLoad(Integer upPayLoad) {
        this.upPayLoad = upPayLoad;
    }

    public void setDownPayLoad(Integer downPayLoad) {
        this.downPayLoad = downPayLoad;
    }

    @Override
    public String toString() {
        return "FlowBean{" +
                "upPackNum=" + upPackNum +
                ", downPackNum=" + downPackNum +
                ", upPayLoad=" + upPayLoad +
                ", downPayLoad=" + downPayLoad +
                '}';
    }
}
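A quick way to sanity-check the bean, assuming it compiles as above, is a standalone round trip through an in-memory buffer; this test class is illustrative and is not part of the job.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean();
        original.setUpPackNum(24);
        original.setDownPackNum(27);
        original.setUpPayLoad(2481);
        original.setDownPayLoad(24681);

        // serialize into an in-memory buffer
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // deserialize into a fresh instance created via the no-arg constructor
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        // prints: FlowBean{upPackNum=24, downPackNum=27, upPayLoad=2481, downPayLoad=24681}
        System.out.println(copy);
    }
}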
- Define the Mapper class:
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    private FlowBean flowBean;
    private Text text;

    // setup() runs exactly once per map task, before any map() call;
    // initialization work belongs here
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        flowBean = new FlowBean();
        text = new Text();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // sample input line (tab-separated):
        // 1363157985066  13726230503  00-FD-07-A4-72-B8:CMCC  120.196.100.82  i02.c.aliimg.com
        //   游戏娱乐  24  27  2481  24681  200
        String[] split = value.toString().split("\t");
        String phoneNum = split[1];
        // upstream packet count
        String upPackNum = split[6];
        // downstream packet count
        String downPackNum = split[7];
        // total upstream traffic
        String upPayLoad = split[8];
        // total downstream traffic
        String downPayLoad = split[9];
        // the same Text and FlowBean instances are reused across map() calls;
        // the framework serializes the value at context.write(), so reuse is safe
        // and avoids per-record allocations
        text.set(phoneNum);
        flowBean.setUpPackNum(Integer.parseInt(upPackNum));
        flowBean.setDownPackNum(Integer.parseInt(downPackNum));
        flowBean.setUpPayLoad(Integer.parseInt(upPayLoad));
        flowBean.setDownPayLoad(Integer.parseInt(downPayLoad));
        context.write(text, flowBean);
    }
}
- Define the Reducer class:
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class FlowReducer extends Reducer<Text, FlowBean, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        // running totals for one phone number
        int upPackNum = 0;    // upstream packet count
        int downPackNum = 0;  // downstream packet count
        int upPayLoad = 0;    // total upstream traffic
        int downPayLoad = 0;  // total downstream traffic
        // note: Hadoop reuses the same FlowBean instance while iterating over values,
        // so read the fields immediately rather than keeping references to the bean
        for (FlowBean value : values) {
            upPackNum += value.getUpPackNum();
            downPackNum += value.getDownPackNum();
            upPayLoad += value.getUpPayLoad();
            downPayLoad += value.getDownPayLoad();
        }
        context.write(key, new Text(upPackNum + "\t" + downPackNum + "\t" + upPayLoad + "\t" + downPayLoad));
    }
}
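For the sample line shown in the mapper, and assuming it were the only record for that phone number, the reducer would emit the tab-separated line 13726230503 24 27 2481 24681.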
- Define the main method (driver):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class FlowMain extends Configured implements Tool {
    /**
     * Takes two arguments, the input and output directories, e.g.:
     * C:\Users\admin\Desktop\高级06\Hadoop\MapReduce&YARN\MR第一次\2、hadoop的序列化\数据\input
     * C:\Users\admin\Desktop\高级06\Hadoop\MapReduce&YARN\MR第一次\2、hadoop的序列化\output
     * @param args input path and output path
     * @return 0 on success, 1 on failure
     * @throws Exception
     */
    @Override
    public int run(String[] args) throws Exception {
        // get the job object
        Job job = Job.getInstance(super.getConf(), FlowMain.class.getSimpleName());
        // required when the program is packaged as a jar and run on a cluster
        job.setJarByClass(FlowMain.class);
        job.setInputFormatClass(TextInputFormat.class);
        // alternative: hard-code a local path for testing
        //TextInputFormat.addInputPath(job, new Path("file:///C:\\Users\\admin\\Desktop\\高级05\\MR第一次\\2、hadoop的序列化\\数据\\input"));
        TextInputFormat.addInputPath(job, new Path(args[0]));
        job.setMapperClass(FlowMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setReducerClass(FlowReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        //TextOutputFormat.setOutputPath(job, new Path("file:///C:\\Users\\admin\\Desktop\\高级05\\MR第一次\\2、hadoop的序列化\\数据\\out"));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new FlowMain(), args);
        System.exit(run);
    }
}
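Assuming the classes are packaged into a jar (the jar name here is hypothetical), the job can be launched with, for example, `hadoop jar flow.jar FlowMain <input> <output>`; because the driver goes through ToolRunner, generic options such as `-D key=value` are parsed automatically and end up in the Configuration returned by getConf().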