Background:
Serialization:
Serialization is the process of converting an object's state (its attributes and variables) into a form that can be stored or transmitted.
Deserialization:
The reverse process, in which a byte sequence or XML representation is converted back into an exactly equivalent object, is called deserialization.
The basic serialization types often cannot cover every need; for example, to pass a custom bean object around inside the Hadoop framework, that object must implement the Writable serialization interface.
Tool:
The Writable serialization interface
1. Build the serializable bean
package speak;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class SpeakBean implements Writable {
    // fields
    private Long selfDuration;
    private Long thirdPartDuration;
    private String deviceid;
    private Long sumDuration;

    // no-arg constructor: required, since Hadoop instantiates the bean by reflection during deserialization
    public SpeakBean() {
    }

    // parameterized constructor
    public SpeakBean(Long selfDuration, Long thirdPartDuration, String deviceid) {
        this.selfDuration = selfDuration;
        this.thirdPartDuration = thirdPartDuration;
        this.deviceid = deviceid;
        this.sumDuration = this.selfDuration + this.thirdPartDuration;
    }

    public Long getSelfDuration() {
        return selfDuration;
    }

    public void setSelfDuration(Long selfDuration) {
        this.selfDuration = selfDuration;
    }

    public Long getThirdPartDuration() {
        return thirdPartDuration;
    }

    public void setThirdPartDuration(Long thirdPartDuration) {
        this.thirdPartDuration = thirdPartDuration;
    }

    public String getDeviceid() {
        return deviceid;
    }

    public void setDeviceid(String deviceid) {
        this.deviceid = deviceid;
    }

    public Long getSumDuration() {
        return sumDuration;
    }

    public void setSumDuration(Long sumDuration) {
        this.sumDuration = sumDuration;
    }

    // serialization: write the fields in a fixed order
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(selfDuration);
        out.writeLong(thirdPartDuration);
        out.writeUTF(deviceid);
        out.writeLong(sumDuration);
    }

    // deserialization: read the fields in exactly the order they were written
    @Override
    public void readFields(DataInput in) throws IOException {
        this.selfDuration = in.readLong();
        this.thirdPartDuration = in.readLong();
        this.deviceid = in.readUTF();
        this.sumDuration = in.readLong();
    }

    // toString is required: it defines the tab-separated text written to the output file,
    // which later jobs split to extract the fields
    @Override
    public String toString() {
        return selfDuration +
                "\t" + thirdPartDuration +
                "\t" + deviceid + "\t" + sumDuration;
    }
}
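Because write and readFields must mirror each other field for field, a quick local round trip through DataOutputStream/DataInputStream is an easy sanity check. A minimal sketch (class name and field values here are illustrative only):

package speak;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class SpeakBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        SpeakBean original = new SpeakBean(100L, 200L, "device-001");

        // serialize into an in-memory buffer
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // deserialize the same bytes into a fresh bean
        SpeakBean copy = new SpeakBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        // both lines should print: 100	200	device-001	300
        System.out.println(original);
        System.out.println(copy);
    }
}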
2. Build the Mapper class
package speak;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// the first kv pair in the generics is the map input key/value types; the second pair is the map output types
public class speakmapper extends Mapper<LongWritable, Text, Text, SpeakBean> {
    Text device_id = new Text();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, SpeakBean>.Context context) throws IOException, InterruptedException {
        // watch the data types: the field indices must line up with the input record layout
        String line = value.toString();
        String[] fields = line.split("\t");
        String selfDuration = fields[fields.length - 3];
        String thirdPartDuration = fields[fields.length - 2];
        String deviceid = fields[1];
        SpeakBean bean = new SpeakBean(Long.parseLong(selfDuration), Long.parseLong(thirdPartDuration), deviceid);
        device_id.set(deviceid);
        context.write(device_id, bean);
    }
}
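The indices above (fields[1] for deviceid, and the third- and second-to-last fields for the two durations) assume a tab-separated input record roughly of this shape; the placeholder column names are assumptions, only these three positions matter to the Mapper:

<field0>	<deviceid>	...	<selfDuration>	<thirdPartDuration>	<lastField>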
3. Build the Reducer class
package speak;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SpeakReducer extends Reducer<Text, SpeakBean, Text, SpeakBean> {
    @Override
    protected void reduce(Text key, Iterable<SpeakBean> values, Reducer<Text, SpeakBean, Text, SpeakBean>.Context context) throws IOException, InterruptedException {
        // the accumulators must be local so they reset for every key;
        // as instance fields they would carry sums over from previously processed keys
        long self_duration = 0L;
        long third_part_duration = 0L;
        for (SpeakBean bean : values) {
            self_duration += bean.getSelfDuration();
            third_part_duration += bean.getThirdPartDuration();
        }
        SpeakBean bean = new SpeakBean(self_duration, third_part_duration, key.toString());
        // write the aggregated result
        context.write(key, bean);
    }
}
Note that the Reducer's input kv types must match the Mapper's output kv types.
4. Build the Driver class
package speak;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SpeakDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "SpeakDriver");
        // set the driver class so Hadoop can locate the jar
        job.setJarByClass(SpeakDriver.class);
        // set the Mapper and Reducer classes
        job.setMapperClass(speakmapper.class);
        job.setReducerClass(SpeakReducer.class);
        // set the kv types output by the Mapper and by the job (Reducer)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(SpeakBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SpeakBean.class);
        // read the input from a local file
        FileInputFormat.setInputPaths(job, new Path("E:/bigdata/data/speak.data"));
        // write the results to the output directory (it must not already exist)
        FileOutputFormat.setOutputPath(job, new Path("E:/output1/"));
        boolean flag = job.waitForCompletion(true);
        System.exit(flag ? 0 : 1);
    }
}
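The paths above are hard-coded for a local Windows run. For a cluster submission you would typically take them from the command line instead; a minimal sketch of the changed lines (the argument order is an assumption):

// hypothetical variant, e.g. hadoop jar speak.jar speak.SpeakDriver <input> <output>
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));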
Extension:
Sorting the result data
Have the serialized bean implement the WritableComparable interface and override compareTo, so the framework can sort by it.
This implementation orders from largest to smallest (descending);
reverse the return values for the opposite (ascending) order.
@Override
public int compareTo(SpeakBean o) {
    if (this.sumDuration > o.sumDuration) {
        return -1;
    } else if (this.sumDuration < o.sumDuration) {
        return 1;
    } else {
        return 0;
    }
}
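Since the sort job uses SpeakBean itself as the map output key, the class declaration changes from Writable to WritableComparable (Hadoop sorts map output keys via compareTo during the shuffle). A sketch of the changed declaration, with the package name taken from the SortReducer below:

package Sort;

import org.apache.hadoop.io.WritableComparable;

public class SpeakBean implements WritableComparable<SpeakBean> {
    // same fields, constructors, getters/setters, write(), readFields() and toString() as above,
    // plus the compareTo() shown here; an equivalent descending one-liner would be:
    // return Long.compare(o.sumDuration, this.sumDuration);
}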
The Reducer class changes accordingly:
because compareTo groups beans with equal sumDuration under a single key during the sort, records with equal totals would otherwise collapse into one output line,
so we iterate over the values to emit every record.
package Sort;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SortReducer extends Reducer<SpeakBean, NullWritable, SpeakBean, NullWritable> {
    @Override
    protected void reduce(SpeakBean key, Iterable<NullWritable> values, Reducer<SpeakBean, NullWritable, SpeakBean, NullWritable>.Context context) throws IOException, InterruptedException {
        // beans that compare equal arrive as one key with several NullWritable values;
        // writing the key once per value keeps every original record in the output
        for (NullWritable value : values) {
            context.write(key, value);
        }
    }
}
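The sort job's Mapper is not shown above. Assuming it reads the first job's text output (the Text key deviceid followed by SpeakBean.toString(), all tab-separated, which is exactly the splitting the toString comment anticipates), a hedged sketch could look like this:

package Sort;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SortMapper extends Mapper<LongWritable, Text, SpeakBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, SpeakBean, NullWritable>.Context context) throws IOException, InterruptedException {
        // assumed line layout from the first job:
        // deviceid \t selfDuration \t thirdPartDuration \t deviceid \t sumDuration
        String[] fields = value.toString().split("\t");
        SpeakBean bean = new SpeakBean(Long.parseLong(fields[1]), Long.parseLong(fields[2]), fields[3]);
        // the bean is the key, so the shuffle sorts records by compareTo (descending sumDuration)
        context.write(bean, NullWritable.get());
    }
}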