mapreduce案例 序列化bean对象 并获取文本文件中某些字段信息并打印在hdfs相关目录
1.文本数据源已上传至百度云盘 参考资料/mapreduce中 【注意：此文本文件有问题，需要自己手工整理几条记录作为数据源。原因：部分行里含有多个连续 tab 作为分隔符，直接按单个 tab 切分会得到空字段】
2.创建maven 项目 参考 mapreduce 案例
3. 创建如下几个类:
FlowBean.
FlowCountMapper
FlowCountReducer
FlowCountRunner
代码如下:
1.FlowBean.
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Custom Hadoop writable that carries a record's upstream flow, downstream
 * flow and their sum.
 *
 * Implements {@code WritableComparable<FlowBean>} so it can be used both as a
 * serialized MR value and (ordered by total flow) as a sort key.
 *
 * NOTE: Hadoop instantiates this class reflectively during deserialization,
 * so the no-arg constructor must be kept, and {@link #readFields(DataInput)}
 * must read fields in exactly the order {@link #write(DataOutput)} wrote them.
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic
    private long sumFlow;  // upFlow + downFlow

    /** Required by Hadoop's reflective deserialization — do not remove. */
    public FlowBean() {
    }

    public FlowBean(long upFlow, long downFlow, long sumFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = sumFlow;
    }

    /** Convenience constructor that derives the sum from the two parts. */
    public FlowBean(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    /**
     * Resets all three fields; lets the mapper/reducer reuse one instance
     * instead of allocating a bean per record.
     */
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /** Tab-separated "up down sum" — this is what lands in the output file. */
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    /** Serialization: field order here defines the wire format. */
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    /** Deserialization: must mirror {@link #write(DataOutput)} exactly. */
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    /**
     * Orders beans by total flow, descending. The original implementation
     * always returned 0, which would make every bean compare as equal if
     * this class were ever used as a (sorted) MR key.
     */
    public int compareTo(FlowBean o) {
        return Long.compare(o.sumFlow, this.sumFlow);
    }
}
2.FlowCountMapper
/**
 * Mapper: parses one tab-separated line of the phone-traffic file and emits
 * (phone number, FlowBean(upFlow, downFlow)).
 *
 * Expected layout per line: the phone number is field index 1, and the
 * up/down flow are the 3rd- and 2nd-from-last fields.
 */
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    // Reused across map() calls to avoid allocating per record.
    private final Text k = new Text();
    private final FlowBean v = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // The delimiter is a single tab, not spaces.
        String[] fields = line.split("\t");
        // The source file is known to contain blank/malformed lines (see the
        // data-prep note); skip them instead of failing the whole task with
        // an ArrayIndexOutOfBoundsException.
        if (fields.length < 4) {
            return;
        }
        String phoneNum = fields[1];
        long upFlow = Long.parseLong(fields[fields.length - 3]);
        long downFlow = Long.parseLong(fields[fields.length - 2]);
        k.set(phoneNum);
        v.set(upFlow, downFlow);
        context.write(k, v);
    }
}
3.FlowCountReducer
public class FlowCountReducer extends Reducer<Text,FlowBean,Text,FlowBean> {
FlowBean v= new FlowBean();
@Override
protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
long upSumFlow = 0;
long downSumFlow =0;
for (FlowBean value : values) {
upSumFlow += value.getUpFlow();
downSumFlow += value.getDownFlow();
}
v.set(upSumFlow,downSumFlow);
context.write(key,v);
}
4.FlowCountRunner
public class FlowCountRunner {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf= new Configuration();
//指定mr采用本地模式运行
conf.set(“mapreduce.framework.name”,“local”);
//使用Job构建本次mr程序
Job job = Job.getInstance(conf);
//指定本次mr程序运行的主类
job.setJarByClass(FlowCountRunner.class);
//指定本地mr程序mapper reducer
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
//指定本次mr程序map阶段的输出类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
//指定本次mr程序reduce阶段的输出类型 也就是mr程序最终输出
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//指定本次mr程序处理的数据目录 输出的结果目录
FileInputFormat.setInputPaths(job,new Path("hdfs://note1:9000/flowsum/input"));
FileOutputFormat.setOutputPath(job,new Path("hdfs://note1:9000/flowsum/output"));
// FileInputFormat.setInputPaths(job,new Path(“d:\flowsum\input”));
// FileOutputFormat.setOutputPath(job,new Path(“d:\flowsum\output”));
//提交本次mr的job
// job.submit();
boolean b = job.waitForCompletion(true);//提交任务 并且追踪打印job的执行情况
System.exit(b? 0:-1);
}
}
代码如上!
在Hadoop集群上运行
核心步骤: 启动hadoop集群 、项目打包、放在linux 系统中 、将文本数据源放在linux系统中 并上传至hdfs对应的目录下 最后执行 hadoop jar 包名 主类全名 运行作业
核心命令:
hadoop fs -put /root/wenben/phone.dat /flowsum/input
hadoop fs -cat /flowsum/input/phone.dat
hadoop fs -rm -r /flowsum/output
hadoop jar 包名 主类全名   【注意：是 hadoop jar，不是 hadoop -jar】
后面还会介绍 mapreduce 的一个组件 combiner，其本身也是继承 Reducer 类实现的。
combiner 是局部合并，作用在每一个 maptask 的输出上，可以理解为一个 maptask 对应一次 combiner 合并；reduce 则是将所有 maptask（经 combiner 局部合并后）的结果做全局汇总。
为什么有的时候 不用combiner ?
用combiner 在数据量大的时候,时间可以缩短一半以上,但使用combiner组件之前 你要考虑combiner 是否会影响 reduce 的最终结果【也就是你的业务逻辑适不适合用combiner】。如果不影响 就可以用。比如:取数据的中间数 这个就不行。会导致最终的结果不对。
combiner是局部汇总
reduce是全局汇总