Step 1: The requirement and the logic
Count each mobile user's upstream traffic, downstream traffic, and the total of the two. Sample data is shown in the sample file below.
Implementation approach:
map:
Input: the key is the byte offset of the line within the file; the value is the full text of that line, which the mapper then splits into fields.
Output: the key is the phone number; the value is a single object that bundles the up flow, down flow, and total.
The phone number is a Text; the value is a custom type, and note that it must implement Hadoop serialization (the Writable interface).
Example: key: 13705469875, value: < upFlow:200, dFlow:1100, sumFlow:1300 >
reduce:
Input: a key identifying one phone number, together with the collection of bean objects that belong to that phone number.
For example: key: 13705469875
value: < upFlow:200, dFlow:1100, sumFlow:1300 >,
< upFlow:788, dFlow:1200, sumFlow:1988 >
Iterate over the bean collection, accumulate each field, and emit a new bean object, for example:
< upFlow:200+788, dFlow:1100+1200, sumFlow:1300+1988 >
Step 2: The code
The sample file (columns are tab-separated)
phone number    up flow    down flow
13705469875 200 1100
13821690521 1000 1000
13665412354 330 660
13388765498 1200 1500
15758923125 500 886
13122512121 1140 778
13665412354 255 333
13705469875 788 1200
15758923125 12 60
FlowBean
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class FlowBean implements Writable {
private long upFlow;
private long dFlow;
private long sumFlow;
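// Hadoop creates Writable instances by reflection when deserializing, so a no-argument constructor is required.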
public FlowBean(){
}
public FlowBean(long upFlow,long dFlow){
this.upFlow = upFlow;
this.dFlow = dFlow;
this.sumFlow = upFlow + dFlow;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getdFlow() {
return dFlow;
}
public void setdFlow(long dFlow) {
this.dFlow = dFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
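// Serialize the fields to the output stream; the order here must match readFields below.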
@Override
public void write(DataOutput out) throws IOException{
out.writeLong(upFlow);
out.writeLong(dFlow);
out.writeLong(sumFlow);
}
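// Deserialize the fields in exactly the same order they were written.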
@Override
public void readFields(DataInput in) throws IOException {
upFlow = in.readLong();
dFlow = in.readLong();
sumFlow = in.readLong();
}
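// toString controls how the value is rendered by TextOutputFormat in the final output files.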
@Override
public String toString(){
return this.upFlow + "-" + this.dFlow + "-" + this.sumFlow;
}
}
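Before wiring the bean into a job, a quick local round trip through plain Java streams (this check is my own addition; the class name FlowBeanRoundTripTest is made up and not part of the original post) confirms that write and readFields agree on field order:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class FlowBeanRoundTripTest {
    public static void main(String[] args) throws IOException {
        // Serialize a bean the same way Hadoop would during the shuffle.
        FlowBean original = new FlowBean(200, 1100);
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // Deserialize into a fresh instance created with the no-arg constructor.
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        // Both lines should print 200-1100-1300.
        System.out.println(original);
        System.out.println(copy);
    }
}
FlowCount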
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
public class FlowCount {
// Mapper: split each input line and assign the fields to the phone number and the up/down flow
static class FlowCountMapper extends Mapper<Object,LongWritable,Text,FlowBean>{
protected void map(Object key,Text value,Mapper<Object,LongWritable,Text,FlowBean> mp,Context context)
throws IOException,InterruptedException{
String line = value.toString();
String[] fields = line.split("\t");
// Pull out the phone number and the up/down flow
String phoneNo = fields[0];
Long upflow = Long.parseLong(fields[1]);
Long dflow = Long.parseLong(fields[2]);
context.write(new Text(phoneNo), new FlowBean(upflow,dflow));
}
}
static class FlowCountReduce extends Reducer<Text,FlowBean ,Text, FlowBean>{
protected void reduce(Text key,Iterable<FlowBean> values, Reducer<Text,FlowBean ,Text, FlowBean> rd,Context context)
throws IOException,InterruptedException{
long sum_upFlow = 0;
long sum_dFlow = 0;
for(FlowBean bean : values){
sum_upFlow += bean.getUpFlow();
sum_dFlow += bean.getdFlow();
}
context.write(key, new FlowBean(sum_upFlow, sum_dFlow));
}
}
public static void main(String[] args) {
Configuration conf = new Configuration();
Job job = null;
String[] ars = null;
String[] otherArgs = null;
try {
conf.set("mapred.job.tracker", "192.168.122.11:9001");
ars=new String[]{"/usr/local/hadoop/flowcount","/usr/local/hadoop/outputFlowCount"};
otherArgs = new GenericOptionsParser(conf, ars).getRemainingArgs();
if (otherArgs.length < 2){
System.err.println("Usage: flowcount <in> <out>");
System.exit(2);
}
job = new Job(conf, "flow count");
job.setJarByClass(FlowCount.class);
// Set the mapper and reducer classes
job.setMapperClass(FlowCountMapper.class);
// No combiner is needed here
// job.setCombinerClass(Reduce.class);
job.setReducerClass(FlowCountReduce.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)? 0 : 1);
} catch (IllegalStateException e) {
e.printStackTrace();
} catch (IllegalArgumentException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
Step 3: The problem
The code above is the buggy version (only the FlowCount class; FlowBean is fine).
The first mistake I made was that the map and reduce methods in my custom Mapper and Reducer classes did not carry the @Override annotation.
The other problem was that my Mapper's generic parameters did not match the parameter types of its map method (the first two in both cases), so the job threw an exception at runtime:
java.lang.Exception: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:489)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:549)
Caused by: java.io.IOException: Type mismatch in key from map: expected org.apache.hadoop.io.Text, received org.apache.hadoop.io.LongWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1072)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:715)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at org.apache.hadoop.mapreduce.Mapper.map(Mapper.java:125)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:270)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
at java.util.concurrent.FutureTask.run(FutureTask.java:166)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:722)
My first attempt at a fix was to change the data types declared for the output: since the error message said a Text was expected but a LongWritable was received,
I adjusted the declared types accordingly. The exception did go away, but the final result was not what I wanted:
0 13705469875 200 1100
22 13821690521 1000 1000
45 13665412354 330 660
66 13388765498 1200 1500
89 15758923125 500 886
110 13122512121 1140 778
132 13665412354 255 333
153 13705469875 788 1200
175 15758923125 12 60
What I expected was the computed, aggregated result, not the input copied straight through like this.
So the root cause was clearly not my output types. I then looked up some explanations of this error online:
some said it comes from a mismatch between the types the map emits and the types the reduce receives (one post I found said as much),
while others said it comes from mixing the old and new MapReduce APIs (another post made that claim).
Following the second poster's advice I went to add @Override, and it was only then that I discovered the annotation would not even compile: my map and reduce methods were not overriding anything, because their parameter types did not match the parent classes. That is exactly the signature problem described above, and it also explains the strange output: since my map never overrode Mapper.map, Hadoop ran the default identity map, which simply writes the (offset, line) pair straight through.
Below is the corrected code.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;
public class FlowCount {
// Mapper: split each input line and assign the fields to the phone number and the up/down flow (the generic parameters now match the map method signature)
static class FlowCountMapper extends Mapper<LongWritable,Text,Text,FlowBean>{
@Override
protected void map(LongWritable key,Text value,Context context)
throws IOException,InterruptedException{
String line = value.toString();
String[] fields = line.split("\t");
// Pull out the phone number and the up/down flow
String phoneNo = fields[0];
Long upflow = Long.parseLong(fields[1]);
Long dflow = Long.parseLong(fields[2]);
context.write(new Text(phoneNo), new FlowBean(upflow,dflow));
}
}
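// Reducer: sum the up flow and down flow of all records for one phone number and emit a new FlowBean (its constructor recomputes the total).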
static class FlowCountReduce extends Reducer<Text,FlowBean ,Text, FlowBean>{
@Override
protected void reduce(Text key,Iterable<FlowBean> values, Context context)
throws IOException,InterruptedException{
long sum_upFlow = 0;
long sum_dFlow = 0;
for(FlowBean bean : values){
sum_upFlow += bean.getUpFlow();
sum_dFlow += bean.getdFlow();
}
context.write(key, new FlowBean(sum_upFlow, sum_dFlow));
}
}
public static void main(String[] args) {
Configuration conf = new Configuration();
Job job = null;
String[] ars = null;
String[] otherArgs = null;
try {
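// Note: mapred.job.tracker is the old MR1 JobTracker address property; judging by the LocalJobRunner frames in the stack trace above, this run actually executed locally, so the setting may have had no effect.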
conf.set("mapred.job.tracker", "192.168.122.11:9001");
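// The input and output paths are hard-coded here instead of being taken from the command line; adjust them to your own environment.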
ars=new String[]{"/usr/local/hadoop/flowcount","/usr/local/hadoop/outputFlowCount"};
otherArgs = new GenericOptionsParser(conf, ars).getRemainingArgs();
if (otherArgs.length < 2){
System.err.println("Usage: flowcount <in> <out>");
System.exit(2);
}
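// Note: newer Hadoop versions deprecate new Job(conf, name) in favor of Job.getInstance(conf, name).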
job = new Job(conf, "flow count");
job.setJarByClass(FlowCount.class);
// Set the mapper and reducer classes
job.setMapperClass(FlowCountMapper.class);
// No combiner is needed here
// job.setCombinerClass(Reduce.class);
job.setReducerClass(FlowCountReduce.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)? 0 : 1);
} catch (IllegalStateException e) {
e.printStackTrace();
} catch (IllegalArgumentException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
The final run produces the following result:
13122512121 1140-778-1918
13388765498 1200-1500-2700
13665412354 585-993-1578
13705469875 988-2300-3288
13821690521 1000-1000-2000
15758923125 512-946-1458
Step 4: Conclusions
First, pay strict attention to keeping the generic parameters of Mapper and Reducer consistent with the parameter types of the map and reduce methods.
Second, always adding @Override is the reliable choice: it turns a wrong signature into a compile-time error instead of a silent fallback at runtime. I hope this is a useful reference.
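To make the @Override point concrete, here is a small, Hadoop-free sketch of the trap (the names Base, Subclass and OverrideTrapDemo are my own and not from the original code): the subclass method takes an extra parameter, so it overloads instead of overrides, and the parent's default implementation keeps running, in the same way Hadoop's built-in identity map kept running in my buggy job.
// A minimal, self-contained illustration of the override-vs-overload trap.
class Base {
    protected void handle(long key, String value) {
        System.out.println("default handle: " + key + " -> " + value);
    }
}
class Subclass extends Base {
    // The extra parameter makes this a NEW method, not an override of Base.handle.
    // Putting @Override here would be rejected by the compiler, exposing the mistake early.
    protected void handle(long key, String value, Object extra) {
        System.out.println("custom handle: " + value);
    }
}
public class OverrideTrapDemo {
    public static void main(String[] args) {
        // This call resolves to the inherited two-argument method, so the default behavior runs,
        // just as Hadoop fell back to Mapper's identity map when my map signature was wrong.
        new Subclass().handle(0L, "13705469875\t200\t1100");
    }
}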