mapreduce案例 序列化bean对象 并获取文本文件中某些字段信息并打印在hdfs相关目录
1.文本数据源已上传至百度云盘 参考资料/mapreduce中 【注意：此文本文件有问题，需要自己手工整理几条记录作为数据源。原因：部分行里含有多个连续 tab 作为分隔符，直接按单个 tab 切分会得到空字段】
2.创建maven 项目 参考 mapreduce 案例
3. 创建如下几个类:
FlowBean.
FlowCountMapper
FlowCountReducer
FlowCountRunner
代码如下:
1.FlowBean.
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Custom Hadoop writable that carries a record's upstream flow, downstream
 * flow and their sum.
 *
 * Implements {@code WritableComparable<FlowBean>} so it can be used both as a
 * serialized MR value and (ordered by total flow) as a sort key.
 *
 * NOTE: Hadoop instantiates this class reflectively during deserialization,
 * so the no-arg constructor must be kept, and {@link #readFields(DataInput)}
 * must read fields in exactly the order {@link #write(DataOutput)} wrote them.
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic
    private long sumFlow;  // upFlow + downFlow

    /** Required by Hadoop's reflective deserialization — do not remove. */
    public FlowBean() {
    }

    public FlowBean(long upFlow, long downFlow, long sumFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = sumFlow;
    }

    /** Convenience constructor that derives the sum from the two parts. */
    public FlowBean(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    /**
     * Resets all three fields; lets the mapper/reducer reuse one instance
     * instead of allocating a bean per record.
     */
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /** Tab-separated "up down sum" — this is what lands in the output file. */
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    /** Serialization: field order here defines the wire format. */
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    /** Deserialization: must mirror {@link #write(DataOutput)} exactly. */
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    /**
     * Orders beans by total flow, descending. The original implementation
     * always returned 0, which would make every bean compare as equal if
     * this class were ever used as a (sorted) MR key.
     */
    public int compareTo(FlowBean o) {
        return Long.compare(o.sumFlow, this.sumFlow);
    }
}
2.FlowCountMapper
/**
 * Mapper: parses one tab-separated line of the phone-traffic file and emits
 * (phone number, FlowBean(upFlow, downFlow)).
 *
 * Expected layout per line: the phone number is field index 1, and the
 * up/down flow are the 3rd- and 2nd-from-last fields.
 */
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    // Reused across map() calls to avoid allocating per record.
    private final Text k = new Text();
    private final FlowBean v = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // The delimiter is a single tab, not spaces.
        String[] fields = line.split("\t");
        // The source file is known to contain blank/malformed lines (see the
        // data-prep note); skip them instead of failing the whole task with
        // an ArrayIndexOutOfBoundsException.
        if (fields.length < 4) {
            return;
        }
        String phoneNum = fields[1];
        long upFlow = Long.parseLong(fields[fields.length - 3]);
        long downFlow = Long.parseLong(fields[fields.length - 2]);
        k.set(phoneNum);
        v.set(upFlow, downFlow);
        context.write(k, v);
    }
}
3.FlowCountReducer
public class FlowCountReducer extends Reducer<Text,FlowBean,Text,FlowBean> {
FlowBean v= new FlowBean();
@Override
protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
long upSumFlow = 0;
long downSumFlow =0;
for (FlowBean value : values) {
upSumFlow += value.getUpFlow();
downSumFlow += value.getDownFlow();
}
v.set(upSumFlow,downSumFlow);
context.write(key,v);
}
4.FlowCountRunner
public class FlowCountRunner {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf= new Configuration();
//指定mr采用本地模式运行
conf.set(“mapreduce.framework.name”,“local”);
//使用Job构建本次mr程序
Job job = Job.getInstance(conf);
//指定本次mr程序运行的主类
job.setJarByClass(FlowCountRunner.class);
//指定本地mr程序mapper reducer
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
//指定本次mr程序map阶段的输出类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
//指定本次mr程序reduce阶段的输出类型 也就是mr程序最终输出
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//指定本次mr程序处理的数据目录 输出的结果目录
FileInputFormat.setInputPaths(job,new Path("hdfs://note1:9000/flowsum/input"));
FileOutputFormat.setOutputPath(job,new Path("hdfs://note1:9000/flowsum/output"));
// FileInputFormat.setInputPaths(job,new Path(“d:\flowsum\input”));
// FileOutputFormat.setOutputPath(job,new Path(“d:\flowsum\output”));
//提交本次mr的job
// job.submit();
boolean b = job.waitForCompletion(true);//提交任务 并且追踪打印job的执行情况
System.exit(b? 0:-1);
}
}
代码如上!
在Hadoop集群上运行
核心步骤: 启动hadoop集群 、项目打包、放在linux 系统中 、将文本数据源放在linux系统中 并上传至hdfs对应的目录下 最后执行 hadoop jar 包名 主类全名 运行作业
核心命令:
hadoop fs -put /root/wenben/phone.dat /flowsum/input
hadoop fs -cat /flowsum/input/phone.dat
hadoop fs -rm -r /flowsum/output
hadoop jar 包名 主类全名   【注意：是 hadoop jar，不是 hadoop -jar】
后面还会介绍 mapreduce 的一个组件 combiner，其本身也是继承 Reducer 类实现的。
combiner 是局部合并，作用在每一个 maptask 的输出上，可以理解为一个 maptask 对应一次 combiner 合并；reduce 则是将所有 maptask（经 combiner 局部合并后）的结果做全局汇总。
为什么有的时候 不用combiner ?
用combiner 在数据量大的时候,时间可以缩短一半以上,但使用combiner组件之前 你要考虑combiner 是否会影响 reduce 的最终结果【也就是你的业务逻辑适不适合用combiner】。如果不影响 就可以用。比如:取数据的中间数 这个就不行。会导致最终的结果不对。
combiner是局部汇总
reduce是全局汇总