package com.analyzer.search_task;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
public class SearchBrand extends Base{
public static class FileMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Reused output objects — avoids one allocation per emitted record
    // (standard Hadoop mapper idiom; the framework serializes on write()).
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    /**
     * Splits each input line on '|'. The first field containing '$' is taken
     * as the brand key (e.g. "samsung$..."); every field containing '@' is
     * emitted as (that field, firstField + "|" + brandKey).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Lines shorter than 2 chars cannot carry a usable record.
        if (line.length() < 2) {
            return;
        }
        String[] fields = line.split("\\|");
        if (fields.length < 1) {
            return;
        }
        // Locate the brand-key field; "null" is the sentinel when absent
        // (kept as a literal to preserve the original output format).
        String brandKey = "null";
        for (String field : fields) {
            if (field != null && field.contains("$")) {
                brandKey = field.trim();
                break;
            }
        }
        // Emit one record per '@'-bearing field.
        for (String field : fields) {
            if (field != null && field.contains("@")) {
                outKey.set(field.trim());
                outValue.set(fields[0] + "|" + brandKey);
                context.write(outKey, outValue);
            }
        }
    }
}
public static class ActionReducer extends Reducer<Text, Text, NullWritable, Text> {

    /**
     * Multi-named outputs: "key" for records that parsed into at least two
     * '|'-fields and are not brand info; "unkey" for everything else.
     * Must be closed in cleanup() or nothing is flushed to the output files.
     */
    private MultipleOutputs<NullWritable, Text> mos;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        mos = new MultipleOutputs<NullWritable, Text>(context);
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // close() is mandatory — MultipleOutputs buffers; skipping it loses output.
        if (mos != null) {
            mos.close();
            mos = null;
        }
    }

    /**
     * For each value: blank or brand-info strings go to "unkey"; otherwise the
     * string is split on '|' and, when it has a second field, the record
     * key + "|" + secondField goes to "key". Counters track total/succeed/fail.
     */
    @Override
    protected void reduce(Text key, Iterable<Text> iter, Context context)
            throws IOException, InterruptedException {
        if (iter == null) {
            return;
        }
        context.getCounter(CounterRecorder.TOTAL).increment(1);
        for (Text item : iter) {
            if (item == null) {
                continue;
            }
            String str = item.toString();
            if (StringUtils.isBlank(str) || isBrand(str)) {
                writeUnmatched(context, key, str);
                continue;
            }
            String[] array = str.split("\\|", -1);
            if (array.length > 1) { // drop the keyword part, keep field [1]
                mos.write("key", NullWritable.get(), new Text(key.toString() + "|" + array[1]));
                context.getCounter(CounterRecorder.SUCCEED).increment(1);
            } else {
                writeUnmatched(context, key, str);
            }
        }
    }

    /** Routes an unusable record to the "unkey" output and bumps the FAILE counter. */
    private void writeUnmatched(Context context, Text key, String str)
            throws IOException, InterruptedException {
        context.getCounter(CounterRecorder.FAILE).increment(1);
        mos.write("unkey", NullWritable.get(), new Text(key.toString() + "--" + str));
    }

    /**
     * Returns true when the string is brand information.
     *
     * @param str e.g. "sm|samsung$..." — field[1] starting with field[0]
     *            (with "sm" normalized to "samsung") marks a brand record.
     * @return whether the record is brand info
     */
    private boolean isBrand(String str) {
        String[] datas = str.split("\\|");
        if (datas.length > 1) {
            String prefix = "sm".equals(datas[0]) ? "samsung" : datas[0];
            if (datas[1] != null && datas[1].startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }
}
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf);
    job.setJobName("SearSbandTask_T");
    job.setJarByClass(SearchBrand.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(FileMapper.class);
    job.setReducerClass(ActionReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    // -------- named multi-outputs --------
    // Records matched by key.
    MultipleOutputs.addNamedOutput(job, "key", TextOutputFormat.class, NullWritable.class, Text.class);
    // Everything that could not be matched.
    MultipleOutputs.addNamedOutput(job, "unkey", TextOutputFormat.class, NullWritable.class, Text.class);
    // Suppress the empty default part-r-00000 files.
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // -------- input / output paths --------
    Path inPath = new Path(args[0]);
    Path outPath = new Path(args[1]);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outPath)) {
        // BUGFIX: was fs.deleteOnExit(outPath) + fs.close(). deleteOnExit only
        // deletes when the FileSystem closes, and closing here shuts the
        // JVM-cached FileSystem instance the job's output committer still
        // needs ("Filesystem closed" failures). Delete immediately instead,
        // and leave the cached instance open.
        fs.delete(outPath, true);
    }
    FileInputFormat.addInputPath(job, inPath);
    FileOutputFormat.setOutputPath(job, outPath);

    // -------- run and report counters --------
    boolean isCompletion = job.waitForCompletion(true);
    if (isCompletion) {
        System.out.println("Total num:" + job.getCounters().findCounter(CounterRecorder.TOTAL).getValue());
        System.out.println("key num:" + job.getCounters().findCounter(CounterRecorder.SUCCEED).getValue());
        System.out.println("unkey num:" + job.getCounters().findCounter(CounterRecorder.FAILE).getValue());
    }
    return isCompletion ? 0 : 1;
}
/**
 * Launches the SearchBrand job through ToolRunner.
 *
 * @param con  Hadoop configuration to run with
 * @param args input path at [0], output path at [1]
 * @return 0 on success, non-zero on failure
 */
public static int startTask(Configuration con, String[] args) throws Exception {
    SearchBrand task = new SearchBrand();
    return ToolRunner.run(con, task, args);
}
/**
 * Entry point. Uses the command-line paths when provided; otherwise falls
 * back to the original hard-coded HDFS defaults. Exits with the job status.
 */
public static void main(String[] args) throws Exception {
    // BUGFIX: the original discarded args entirely and ignored the job's
    // exit code. Keep the old defaults for backward compatibility.
    if (args == null || args.length < 2) {
        args = new String[]{"/user/out_temp/UA-r-00000", "/user/sband"};
    }
    Configuration con = new Configuration();
    System.exit(startTask(con, args));
}
}
// Hadoop MultipleOutputs example (article footer from the original source,
// last recommended 2024-04-18 17:15:09 — commented out so the file compiles).