1. 虚拟这样一个数据场景
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:fail;ERRORTYPE:lost DB connection
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:fail;ERRORTYPE:lost DB connection
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:success;
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:sucess
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:fail;ERRORTYPE:connection fail
2.总共有10个这样的文件存在HDFS上
[leo@hadoop adt]$ hadoop fs -ls /user/leo/ADT/in
Warning: $HADOOP_HOME is deprecated.
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:fail;ERRORTYPE:lost DB connection
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:fail;ERRORTYPE:lost DB connection
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:success;
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:sucess
FRESHDATE:201306;REPORTNAME:Cagpmini Global;RESULT:fail;ERRORTYPE:connection fail
2.总共有10个这样的文件存在HDFS上
[leo@hadoop adt]$ hadoop fs -ls /user/leo/ADT/in
Warning: $HADOOP_HOME is deprecated.
Found 10 items
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 02:57 /user/leo/ADT/in/schedule.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule10.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule2.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule3.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule4.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule5.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule6.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule7.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule8.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule9.txt
3. 模拟这样一个需求:按月统计出每张报表成功的次数和失败的次数,结果格式如下
Cagpmini Global(报表名称) 201307(月份) 23(成功次数) 45(失败次数)
设计思路:
1.map的输出的key为报表名称+月份,value为(成功的次数,失败的次数)
2.reduce 循环统计成功和失败的次数
Mapper:
[leo@hadoop ADT]$ cat ADTMapper.java
package ADT;
import java.io.IOException;
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 02:57 /user/leo/ADT/in/schedule.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule10.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule2.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule3.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule4.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:01 /user/leo/ADT/in/schedule5.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule6.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule7.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule8.txt
-rw-r--r-- 1 leo supergroup 4605 2013-08-24 03:02 /user/leo/ADT/in/schedule9.txt
3. 模拟这样一个需求:按月统计出每张报表成功的次数和失败的次数,结果格式如下
Cagpmini Global(报表名称) 201307(月份) 23(成功次数) 45(失败次数)
设计思路:
1.map的输出的key为报表名称+月份,value为(成功的次数,失败的次数)
2.reduce 循环统计成功和失败的次数
Mapper:
[leo@hadoop ADT]$ cat ADTMapper.java
package ADT;
import java.io.IOException;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
public class ADTMapper extends Mapper<Object, Text, Text, Text> {
    /**
     * Parses one schedule log line of the form
     * {@code FRESHDATE:yyyyMM;REPORTNAME:name;RESULT:status[;ERRORTYPE:...]}
     * and emits the pair ("reportName month", "success"|"fail").
     *
     * @param key     input offset (unused)
     * @param value   one raw log line
     * @param context MapReduce output collector
     * @throws IOException          on write failure
     * @throws InterruptedException if the task is interrupted
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String lower = line.toLowerCase();

        // Month value starts after "freshdate" (9 chars) plus the ':' separator.
        int monthStart = lower.indexOf("freshdate") + 10;
        int monthEnd = lower.indexOf(';', monthStart);
        // Report name starts after "reportname" (10 chars) plus the ':' separator.
        int reportStart = lower.indexOf("reportname") + 11;
        int reportEnd = lower.indexOf(';', reportStart);

        String month = line.substring(monthStart, monthEnd);
        String report = line.substring(reportStart, reportEnd);

        // The raw data contains both "success" and the misspelled "sucess";
        // treat either as a successful run, everything else as a failure.
        Text status = new Text("fail");
        if (lower.indexOf("success") > -1 || lower.indexOf("sucess") > -1) {
            status.set("success");
        }
        // Emit exactly one record per input line, keyed by report name + month.
        context.write(new Text(report + " " + month), status);
    }
}
Reducer:
[leo@hadoop ADT]$ cat ADTReducer.java
package ADT;
import java.io.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
Reducer:
[leo@hadoop ADT]$ cat ADTReducer.java
package ADT;
import java.io.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
public class ADTReducer extends Reducer<Text,Text,Text,Text>{
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{
int success=0;
int fail=0;
Text result=new Text();
for(Text value:values){
if(value.toString().toLowerCase().indexOf("success")>-1)
success+=1;
else
fail+=1;
}
result.set(success+" "+fail);
context.write(key, result);
}
int success=0;
int fail=0;
Text result=new Text();
for(Text value:values){
if(value.toString().toLowerCase().indexOf("success")>-1)
success+=1;
else
fail+=1;
}
result.set(success+" "+fail);
context.write(key, result);
}
}
Driver
[leo@hadoop ADT]$ cat ADTDriver.java
package ADT;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.GenericOptionsParser;
import java.util.Date;
public class ADTDriver {
    /**
     * Configures and submits the report-statistics job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when arguments are missing.
        if (args.length < 2) {
            System.err.println("Usage: ADTDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = new Job(conf, "adt");
        job.setJarByClass(ADTDriver.class);
        job.setMapperClass(ADTMapper.class);
        job.setReducerClass(ADTReducer.class);
        // Mapper and reducer both emit Text keys and Text values.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Exit 0 on success, 1 on failure, matching shell conventions.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
查看执行结果
[leo@hadoop ADT]$ hadoop fs -cat /user/leo/ADT/out/part-r-00000
Warning: $HADOOP_HOME is deprecated.
Driver
[leo@hadoop ADT]$ cat ADTDriver.java
package ADT;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;
import org.apache.hadoop.util.GenericOptionsParser;
import java.util.Date;
public class ADTDriver {
    /**
     * Configures and submits the report-statistics job.
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when arguments are missing.
        if (args.length < 2) {
            System.err.println("Usage: ADTDriver <input path> <output path>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = new Job(conf, "adt");
        job.setJarByClass(ADTDriver.class);
        job.setMapperClass(ADTMapper.class);
        job.setReducerClass(ADTReducer.class);
        // Mapper and reducer both emit Text keys and Text values.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Exit 0 on success, 1 on failure, matching shell conventions.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
查看执行结果
[leo@hadoop ADT]$ hadoop fs -cat /user/leo/ADT/out/part-r-00000
Warning: $HADOOP_HOME is deprecated.
Cagpmini Global 201306 40 100
Cagpmini Global 201307 20 50
Kronos ES Monthly 201306 40 100
Kronos ES Monthly 201307 20 50
P AND G 201306 40 100
P AND G 201307 20 50
[leo@hadoop ADT]$
以上结果Cagpmini 在6月success的次数为40
通过本地验证一下:
[leo@hadoop adt]$ sed -n '/Cagpmini/p' schedule.txt |sed -n "/sucess\|success/p" | sed -n "/201306/p" | wc
4 8 236
[leo@hadoop adt]$
由于 HDFS 上有 10 个这样的文件,单个文件中匹配 4 次,4 × 10 = 40,所以数据是吻合的
Cagpmini Global 201307 20 50
Kronos ES Monthly 201306 40 100
Kronos ES Monthly 201307 20 50
P AND G 201306 40 100
P AND G 201307 20 50
[leo@hadoop ADT]$
以上结果Cagpmini 在6月success的次数为40
通过本地验证一下:
[leo@hadoop adt]$ sed -n '/Cagpmini/p' schedule.txt |sed -n "/sucess\|success/p" | sed -n "/201306/p" | wc
4 8 236
[leo@hadoop adt]$
由于 HDFS 上有 10 个这样的文件,单个文件中匹配 4 次,4 × 10 = 40,所以数据是吻合的