MultipleOutputs---多文件输出

package com.bj58.search.experience.searchcommunityname;

import com.bj58.search.qa.contract.agent.IQAService;
import com.bj58.search.qa.contract.entity.*;
import com.bj58.spat.scf.client.SCFInit;
import com.bj58.spat.scf.client.proxy.builder.ProxyFactory;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Pattern;

/**
 * @Author:jieping
 * @Description:租房下用户搜索query中小区词的提取
 * @Date: Create in 21:45 2019/9/9
 */
public class imeikeywordlocalcn extends Configured implements Tool {

    private static IQAService service;

    /**
     * Mapper that keys each rental-search log line by {@code keyword + "_" + cityId}.
     *
     * <p>Only files whose path contains {@code zf_imeisearch} are processed. Each line must
     * split into exactly seven tab-separated fields after trimming; field 2 is used as the
     * keyword and field 5 as the city id. The emitted value is the trimmed input line.
     */
    public static class imeikeywordlocalcnMapper extends Mapper<LongWritable, Text, Text, Text> {

        /** Counter group used for every counter reported by this mapper. */
        private static final String COUNTER_GROUP = "##imeikeywordlocalcnMapper";

        /** Number of tab-separated fields a valid input line must have. */
        private static final int EXPECTED_FIELD_COUNT = 7;

        /** Compiled once instead of on every {@link #isNumeric(String)} call. */
        private static final Pattern NUMERIC_PATTERN = Pattern.compile("^[-\\+]?[\\d]*$");

        /** Reused output objects; {@code context.write} serializes them immediately. */
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        /** Whether the split handled by this task comes from a {@code zf_imeisearch} path. */
        private boolean searchLogSplit;

        /**
         * Tests whether {@code str} is an optionally signed run of digits.
         *
         * <p>Because the digit part is {@code \d*}, the empty string and a lone sign also match.
         *
         * @param str the string to test; must not be {@code null}
         * @return {@code true} if the whole string matches the pattern
         */
        public boolean isNumeric(String str) {
            return NUMERIC_PATTERN.matcher(str).matches();
        }

        /**
         * Resolves the input path once per task; the split does not change between records.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            InputSplit inputSplit = context.getInputSplit();
            Path splitPath = ((FileSplit) inputSplit).getPath();
            searchLogSplit = splitPath.toString().contains("zf_imeisearch");
        }

        /**
         * Emits {@code (keyword_cityId, trimmed line)} for every well-formed search-log record.
         *
         * @param key     the key supplied by the input format (unused)
         * @param value   one raw log line
         * @param context task context used for output and counters
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            if (!searchLogSplit) {
                return;
            }
            String line = value.toString().trim();
            String[] fields = line.split("\t");
            if (fields.length != EXPECTED_FIELD_COUNT) {
                context.getCounter(COUNTER_GROUP, "输入文件的字段不是7个").increment(1);
                return;
            }
            String keyword = fields[2];
            String ds_cityid = fields[5];
            if (keyword.isEmpty() || ds_cityid.isEmpty()) {
                context.getCounter(COUNTER_GROUP, "keyword and ds_cityid key is null").increment(1);
                return;
            }
            outKey.set(keyword + "_" + ds_cityid);
            outValue.set(line);
            context.write(outKey, outValue);
        }
    }

    /**
     * Reducer that sends each {@code keyword_cityId} key to the QA service and, when community
     * tags are returned, writes every grouped log record to two named outputs:
     * {@code imeikeywordlocalcn1} (keyed by the community names) and
     * {@code imeikeywordlocalcncitycnameid} (keyed by {@code cityId_communityId_name} triples).
     *
     * <p>Calls to the service are throttled by {@link #wait2RunService()}.
     */
    public static class imeikeywordlocalcnReducer extends Reducer<Text, Text, Text, Text> {

        /** Counter group used for every counter reported by this reducer. */
        private static final String COUNTER_GROUP = "##imeikeywordlocalcnReducer";

        /** Length of one throttling window, in milliseconds. */
        private static final long DEVTIME = 400; // ms
        /** Service-call budget of this reducer, per minute (recomputed in setup). */
        private long scfQps = 200;     // per minute
        /** Number of service calls allowed per DEVTIME window (recomputed in setup). */
        private long scfTimes = 10;    // calls per window
        /** Fraction of the per-reducer budget that is actually used. */
        private static final double ratio = 0.75;
        private static boolean bFirst = true;
        private static long starttime = 0;
        private static long startId = 0;
        private static long id = 0;
        private MultipleOutputs<Text,Text> mos;
        private static String date;

        /**
         * Creates the multiple-output writer, initializes the QA service proxy and derives the
         * throttling parameters from the number of reduce tasks of the job.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            mos = new MultipleOutputs<Text, Text>(context);
            date = context.getConfiguration().get("date");
            // NOTE(review): the SCF key is a credential hard-coded in source; consider loading it
            // from the job configuration or a key file instead.
            SCFInit.initScfKeyByValue("mCXt7Cx0XYCcdvveK9+kiJZBpNhMGJnz");
            service = ProxyFactory.create(IQAService.class, "tcp://" + "qaservice" + "/QAService");

            int allQps = 60000;
            // Use the job's real reducer count so the budget stays correct if it is changed.
            int reduceNum = Math.max(1, context.getNumReduceTasks());
            // Keep the per-window call count an integer where possible to avoid wasted budget;
            // a larger reducer count makes the rounding error larger.
            scfQps = (long) (allQps / reduceNum * ratio);
            long callsPerWindow = scfQps / 60 * DEVTIME / 1000;
            scfTimes = callsPerWindow > 0 ? callsPerWindow : 1;
        }

        /**
         * Analyzes one {@code keyword_cityId} key and writes the grouped records when community
         * tags are found.
         *
         * @param key     composite key produced by the mapper
         * @param values  trimmed log lines sharing the key
         * @param context task context used for counters
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            context.getCounter(COUNTER_GROUP, "reduce input num is").increment(1);

            // Split at the LAST underscore so keywords that contain '_' are not discarded.
            // NOTE(review): assumes the city id itself never contains '_' - confirm.
            String compositeKey = key.toString();
            int separator = compositeKey.lastIndexOf('_');
            if (separator <= 0 || separator == compositeKey.length() - 1) {
                context.getCounter(COUNTER_GROUP, "reduce of key's length not two num is").increment(1);
                return;
            }
            String keyword = compositeKey.substring(0, separator);
            String cityID = compositeKey.substring(separator + 1);

            wait2RunService();
            QARequest request = new QARequest();
            request.setText(keyword);
            request.setCityId(cityID);
            request.setType(LocalTypeEnum.local58);
            QAResult result = null;
            try {
                result = service.queryAnalysis(request);
            } catch (Exception e) {
                // Best effort: a failed lookup is counted and treated like an empty result.
                context.getCounter(COUNTER_GROUP, "service.queryAnalysis is not right").increment(1);
                e.printStackTrace();
            }

            Map<TagTypeEnum, List<TagElement>> tagMap = (result == null) ? null : result.getTagMap();
            if (tagMap == null || tagMap.isEmpty()) {
                context.getCounter(COUNTER_GROUP, "qaservice result is null").increment(1);
                return;
            }

            List<TagElement> communities = tagMap.get(TagTypeEnum.community);
            if (communities != null) {
                List<String> cnames = new ArrayList<String>();
                List<String> cityIDcIdName = new ArrayList<String>(); // cityId_communityId_communityName
                for (TagElement element : communities) {
                    if (element == null) {
                        context.getCounter(COUNTER_GROUP, "TagElement小区元素为空").increment(1);
                        System.out.print("当前key为:" + key +",小区城市id:" + cityID);
                        continue;
                    }
                    String communityName = element.getText();
                    if (communityName == null) {
                        context.getCounter(COUNTER_GROUP, "小区名为空").increment(1);
                        System.out.print("当前key为:" + key +",小区城市id:" + cityID);
                        continue;
                    }
                    cnames.add(communityName);
                    // "-999" marks a community without an id; a missing value map or a null
                    // entry is treated the same way instead of throwing.
                    String communityID = "-999";
                    Object communityValue = (element.getValues() == null)
                            ? null
                            : element.getValues().get("community58Id");
                    if (communityValue != null) {
                        communityID = communityValue.toString();
                    }
                    cityIDcIdName.add(cityID + "_" + communityID + "_" + communityName);
                }

                // NOTE(review): if every element was skipped above, both keys are empty strings
                // and the records are still written - confirm whether that is intended.
                Text namesKey = new Text(joinTrimmed(cnames));
                Text idsKey = new Text(joinTrimmed(cityIDcIdName));

                // Each value is a line the mapper already validated to have seven fields.
                for (Text v : values) {
                    String[] logstrArray = v.toString().trim().split("\t");
                    String imei = logstrArray[1].trim();
                    String keywordField = logstrArray[2].trim();
                    String discateid = logstrArray[3].trim();
                    String disareapath = logstrArray[6].trim();
                    Text record = new Text(imei + "_" + keywordField + "_" + discateid + "_" + disareapath);
                    // Written under the "imeikeywordlocalcn1" sub-folder.
                    mos.write("imeikeywordlocalcn1", namesKey, record, "imeikeywordlocalcn1" + "/");
                    // Written under the "imeikeywordlocalcncitycnameid" sub-folder.
                    mos.write("imeikeywordlocalcncitycnameid", idsKey, record, "imeikeywordlocalcncitycnameid" + "/");
                }
            }
            context.getCounter(COUNTER_GROUP, "qaservice result is not null for keyword and ds_cityid's key count").increment(1);
        }

        /**
         * Closes the multiple-output writer so buffered records are flushed.
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();
        }

        /**
         * Throttles service calls: once {@code scfTimes} calls have been made in the current
         * window, sleeps for the rest of the {@code DEVTIME} window before starting a new one.
         */
        public void wait2RunService() {
            if (bFirst) {
                starttime = System.currentTimeMillis();
                bFirst = false;
            }
            if (id - startId >= scfTimes) {
                long remaining = DEVTIME - (System.currentTimeMillis() - starttime);
                if (remaining > 0) {
                    try {
                        Thread.sleep(remaining);
                    } catch (InterruptedException e) {
                        // Preserve the interrupt so the framework can still stop the task.
                        Thread.currentThread().interrupt();
                    }
                }
                starttime = System.currentTimeMillis();
                startId = id;
            }
            id++;
        }

        /**
         * Joins the trimmed elements with {@code |}, by position, so repeated values keep
         * their separators.
         */
        private static String joinTrimmed(List<String> parts) {
            StringBuilder joined = new StringBuilder();
            for (int i = 0; i < parts.size(); i++) {
                if (i > 0) {
                    joined.append("|");
                }
                joined.append(parts.get(i).trim());
            }
            return joined.toString();
        }
    }

    /**
     * Parses the command line, configures the MapReduce job and runs it to completion.
     *
     * <p>Options: {@code -i} input paths separated by {@code ;}, {@code -o} output path,
     * {@code -d} date. The date is appended as a sub-directory to every input path and to the
     * output path; an existing output directory for that date is deleted before the job runs.
     *
     * @param args command-line arguments
     * @return 0 on success or when help was requested, 1 on invalid arguments, missing input
     *         or job failure
     * @throws Exception if file-system access or job submission fails
     */
    public int run(String[] args) throws Exception {
        Options opts = new Options();
        opts.addOption("h", "help", false, "Print this help message")
                .addOption("i", "input", true, "input path")
                .addOption("o", "output", true, "output path")
                .addOption("d","date",true,"date");
        String inputStr;
        String outputStr;
        String date;
        try {
            CommandLine cmd = new GnuParser().parse(opts, args);
            if (cmd.hasOption("help")) {
                new HelpFormatter().printHelp("Usage: cmd [OPTIONS]", opts);
                return 0;
            }
            inputStr = cmd.getOptionValue("i");
            outputStr = cmd.getOptionValue("o");
            date = cmd.getOptionValue("d");
        } catch (Exception e) {
            // An unparsable command line is a failure, not a successful run.
            System.err.println("Failed to parse command line: " + e.getMessage());
            new HelpFormatter().printHelp("Usage: cmd [OPTIONS]", opts);
            return 1;
        }
        if (inputStr == null || inputStr.trim().isEmpty()
                || outputStr == null || outputStr.trim().isEmpty()
                || date == null || date.trim().isEmpty()) {
            System.err.println("Options -i, -o and -d are all required");
            new HelpFormatter().printHelp("Usage: cmd [OPTIONS]", opts);
            return 1;
        }

        Configuration conf = this.getConf();
        conf.set("mapreduce.map.output.compress", "false");
        conf.set("mapreduce.output.fileoutputformat.compress", "false");
        conf.set("date", date);
        Path outputPath = new Path(outputStr + "/" + date);
        FileSystem fs = FileSystem.get(conf);
        Job job = Job.getInstance(conf);

        // Register only the dated input directories that actually exist.
        int inputCount = 0;
        for (String s : inputStr.split(";")) {
            if (s.isEmpty()) {
                continue;
            }
            Path inputPath = new Path(s + "/" + date);
            if (fs.exists(inputPath)) {
                System.out.println("input path:" + s + "/" + date);
                FileInputFormat.addInputPath(job, inputPath);
                inputCount++;
            }
        }
        if (inputCount == 0) {
            // Bail out before touching the previous output when there is nothing to process.
            System.err.println("No existing input path for date " + date + " in: " + inputStr);
            return 1;
        }

        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        System.out.println("output path:" + outputPath);
        job.setJarByClass(imeikeywordlocalcn.class);
        job.setJobName("imeikeywordlocalcnDocData");
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        System.out.println("imeikeywordlocalcnMapper is beginning################");
        job.setMapperClass(imeikeywordlocalcn.imeikeywordlocalcnMapper.class);
        System.out.println("imeikeywordlocalcnReducer is beginning################");
        job.setReducerClass(imeikeywordlocalcn.imeikeywordlocalcnReducer.class);
        MultipleOutputs.addNamedOutput(job, "imeikeywordlocalcn1", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "imeikeywordlocalcncitycnameid", TextOutputFormat.class, Text.class, Text.class);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setInputFormatClass(TextInputFormat.class);
        // All records are written through MultipleOutputs, so create the default output lazily
        // to avoid empty part-r-* files in the output directory.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        job.setNumReduceTasks(10);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point.
     *
     * <p>After Hadoop generic options are removed, expects three positional arguments:
     * input paths (separated by {@code ;}), output path and date. The process exit code is the
     * value returned by the tool, so a failed job is visible to the caller.
     *
     * @param args raw command-line arguments
     * @throws Exception if the tool fails with an exception
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 3) {
            System.err.println("Usage: imeikeywordlocalcn <input paths separated by ';'> <output path> <date>");
            System.exit(2);
        }
        String inputPath = otherArgs[0];
        String outpath = otherArgs[1];
        String date = otherArgs[2];
        conf.set("mapreduce.job.queuename", "root.offline.hdp_teu_search.normal");
        // A timeout of 0 disables the task timeout; the reducer deliberately sleeps to throttle
        // its service calls.
        conf.set("mapreduce.task.timeout", "0");
        conf.set("RunMode", "Online");
        String[] toolArgs = {
                "-i", inputPath,
                "-o", outpath,
                "-d", date};
        int exitCode = ToolRunner.run(conf, new imeikeywordlocalcn(), toolArgs);
        System.exit(exitCode);
    }
}

说明:

1.实际会在输出路径下创建两个文件夹,分别是imeikeywordlocalcn1和imeikeywordlocalcncitycnameid

2.使用多文件输出时,一定要在 Reducer 的 cleanup() 中调用 mos.close() 关闭 MultipleOutputs,否则缓冲区中的数据不会被完整刷新,导致每次执行输出的文件大小不一致,属于非正常的 MR 流程

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值