package com.bj58.search.experience.searchcommunityname;
import com.bj58.search.qa.contract.agent.IQAService;
import com.bj58.search.qa.contract.entity.*;
import com.bj58.spat.scf.client.SCFInit;
import com.bj58.spat.scf.client.proxy.builder.ProxyFactory;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Pattern;
/**
* @Author:jieping
* @Description:租房下用户搜索query中小区词的提取
* @Date: Create in 21:45 2019/9/9
*/
public class imeikeywordlocalcn extends Configured implements Tool {
private static IQAService service;
/**
 * Mapper that groups search-log records by {@code keyword_cityId}.
 *
 * <p>Only records read from a file whose path contains {@code zf_imeisearch} are processed.
 * Each such record is trimmed and split on tabs and must have exactly 7 fields; field 2 is
 * used as the keyword and field 5 as the city id. When both are non-empty the trimmed record
 * is emitted under the key {@code keyword + "_" + cityId}; otherwise a counter is incremented
 * and the record is dropped.
 */
public static class imeikeywordlocalcnMapper extends Mapper<LongWritable, Text, Text, Text> {
    /** Counter group for all counters reported by this mapper. */
    private static final String COUNTER_GROUP = "##imeikeywordlocalcnMapper";
    /** Number of tab-separated fields a valid input record must have. */
    private static final int EXPECTED_FIELD_COUNT = 7;
    /** Compiled once; the original code recompiled this regex on every call. */
    private static final Pattern NUMERIC_PATTERN = Pattern.compile("^[-\\+]?[\\d]*$");

    /**
     * Tests whether {@code str} is an optional sign followed by zero or more digits.
     * Note that, per the pattern, the empty string and a lone sign also match.
     *
     * @param str the string to test; must not be null
     * @return {@code true} if the whole string matches the pattern
     */
    public boolean isNumeric(String str) {
        return NUMERIC_PATTERN.matcher(str).matches();
    }

    /**
     * Emits {@code (keyword_cityId, trimmedRecord)} for every well-formed record.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one raw log line
     * @param context task context used for output and counters
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Path sourceFile = ((FileSplit) context.getInputSplit()).getPath();
        if (!sourceFile.toString().contains("zf_imeisearch")) {
            // Records from any other input are ignored.
            return;
        }

        String record = value.toString().trim();
        String[] fields = record.split("\t");
        if (fields.length != EXPECTED_FIELD_COUNT) {
            context.getCounter(COUNTER_GROUP, "输入文件的字段不是7个").increment(1);
            return;
        }

        String keyword = fields[2];
        String dsCityId = fields[5];
        if (keyword.equals("") || dsCityId.equals("")) {
            context.getCounter(COUNTER_GROUP, "keyword and ds_cityid key is null").increment(1);
            return;
        }
        context.write(new Text(keyword + "_" + dsCityId), new Text(record));
    }
}
public static class imeikeywordlocalcnReducer extends Reducer<Text, Text, Text, Text> {
private static final long DEVTIME = 400; // ms
private long scfQps = 200; // per minute
private long scfTimes = 10; // ms
private static final double ratio = 0.75;
private static boolean bFirst = true;
private static long starttime = 0;
private static long startId = 0;
private static long id = 0;
private MultipleOutputs<Text,Text> mos;
private static String date;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
mos = new MultipleOutputs<Text, Text>(context);
date = context.getConfiguration().get("date");
// String os=System.getProperties().getProperty("os.name");
// String userDir="";
// String scfkeyPath="";
// System.out.println("system style is " + os);
// if(os.equals("Linux"))
// {
// userDir = JarToolUtil.getJarPath();
// System.out.println("userDir:" + userDir);
// scfkeyPath = userDir + "/scfkey.key";
// } else {
// userDir = System.getProperty("user.dir");
// scfkeyPath = userDir + "\\src\\resources\\config\\scfkey.key";
// }
SCFInit.initScfKeyByValue("mCXt7Cx0XYCcdvveK9+kiJZBpNhMGJnz");
service = ProxyFactory.create(IQAService.class, "tcp://" + "qaservice" + "/QAService");
Integer allQps = 60000;
Integer reduceNum = 10;
//为了防止误差,尽可能将一个DEVTIME内的数设置成整数,防止出现浪费
//reduceNum的个数尽可能的小,值越大则会导致误差越大
scfQps = (long)(allQps / reduceNum * ratio);
scfTimes = (scfQps / 60 * DEVTIME / 1000 > 0) ? scfQps / 60 * DEVTIME / 1000 : 1;
}
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
String [] kk = key.toString().split("_");
// System.out.println(key.toString());
context.getCounter("##imeikeywordlocalcnReducer", "reduce input num is").increment(1);
if(kk.length != 2) {
context.getCounter("##imeikeywordlocalcnReducer", "reduce of key's length not two num is").increment(1);
return;
}
wait2RunService();
QARequest request = new QARequest();
request.setText(kk[0]);
request.setCityId(kk[1]);
request.setType(LocalTypeEnum.local58);
QAResult result = null;
try {
result = service.queryAnalysis(request);
} catch (Exception e) {
// System.out.println("keyword:" + kk[0] + "localid:" + kk[1]);
context.getCounter("##imeikeywordlocalcnReducer", "service.queryAnalysis is not right").increment(1);
e.printStackTrace();
}
if((result != null) && (result.getTagMap().size() > 0) ){
List<String> cnames = new ArrayList<String>();
List<String> cityIDcIdName = new ArrayList<String>();//城市id_小区id_小区名
// Map<String, String> cIdName = new HashMap<String, String>();
Map<TagTypeEnum, List<TagElement>> tagMap = result.getTagMap();
if(tagMap.containsKey(TagTypeEnum.community)){
for (TagElement element : tagMap.get(TagTypeEnum.community)) {
if(element == null){
context.getCounter("##imeikeywordlocalcnReducer", "TagElement小区元素为空").increment(1);
System.out.print("当前key为:" + key +",小区城市id:" + kk[1]);
continue;
}
if(element.getText() == null){
context.getCounter("##imeikeywordlocalcnReducer", "小区名为空").increment(1);
System.out.print("当前key为:" + key +",小区城市id:" + kk[1]);
continue;
}
cnames.add(element.getText());
// String cityID = element.getValues().get("city58Id").toString();
// System.out.print("当前key为:" + key +",小区城市id:" + cityID);
String cityID = kk[1];
String communityID = "-999";
boolean flag = element.getValues().containsKey("community58Id");
if(flag){
communityID = element.getValues().get("community58Id").toString();
}
// System.out.println(",小区id:" + communityID);
cityIDcIdName.add(cityID + "_" + communityID + "_" + element.getText());
// System.out.print(element.getText()+" ");
// System.out.print("当前query识别出小区的长度为:"+cnames.size());
}
StringBuffer bu = new StringBuffer();
StringBuffer bu2 = new StringBuffer();
for (String name : cnames) {
if(name.equals(cnames.get(cnames.size()-1))){
bu.append(name.trim());
}else {
bu.append(name.trim()).append("|");
}
}
for (String name : cityIDcIdName) {
if(name.equals(cityIDcIdName.get(cityIDcIdName.size()-1))){
bu2.append(name.trim());
}else {
bu2.append(name.trim()).append("|");
}
}
//打印输出数据
for (Text v : values) {
String [] logstrArray = v.toString().trim().split("\t");
String imei = logstrArray[1].trim();
String keywoqd = logstrArray[2].trim();
String discateid = logstrArray[3].trim();
String disareapath = logstrArray[6].trim();
StringBuffer output = new StringBuffer();
output.append(imei).append("_").append(keywoqd).append("_").append(discateid).append("_").append(disareapath);
// context.write(new Text(bu.toString()), new Text(output.toString()));
mos.write("imeikeywordlocalcn1", new Text(bu.toString()), new Text(output.toString()), "imeikeywordlocalcn1"+"/");//以imeikeywordlocalcn1命名的文件夹1
mos.write("imeikeywordlocalcncitycnameid", new Text(bu2.toString()), new Text(output.toString()),"imeikeywordlocalcncitycnameid"+"/");//以imeikeywordlocalcn1命名的文件夹2
// System.out.println(bu.toString() + "\t" + output.toString());
}
}
} else {
// System.out.print("当前query使用qaservice服务没有任何识别结果!!!");
// System.out.println("keyword:" + kk[0] +"\t"+ "localid:" + kk[1] +"\t"+ "result:" + result.toString());
context.getCounter("##imeikeywordlocalcnReducer", "qaservice result is null").increment(1);
return;
}
context.getCounter("##imeikeywordlocalcnReducer", "qaservice result is not null for keyword and ds_cityid's key count").increment(1);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
//关闭多文件输出对象,刷新缓存数据
mos.close();
}
public void wait2RunService() {
if (bFirst) {
starttime = System.currentTimeMillis();
bFirst = false;
}
if (id - startId >= scfTimes) {
long curtime = System.currentTimeMillis();
if (curtime - starttime < DEVTIME) {
try {
Thread.sleep(DEVTIME + starttime - curtime);
} catch (Exception e) {
e.printStackTrace();
}
}
starttime = System.currentTimeMillis();
startId = id;
}
id++;
}
}
/**
 * Configures and runs the MapReduce job.
 *
 * <p>Recognised options: {@code -i} input roots separated by {@code ;}, {@code -o} output
 * root, {@code -d} date, {@code -h} help. The date is appended as a sub-directory to every
 * input root and to the output root; an existing output directory is deleted first, and
 * input directories that do not exist are skipped.
 *
 * @param args command-line options as described above
 * @return 0 when help was printed or the job succeeded, 1 when the command line could not be
 *         parsed or the job failed
 * @throws IllegalArgumentException if {@code -i}, {@code -o} or {@code -d} is missing
 * @throws Exception if file-system access or job submission fails
 */
@Override
public int run(String[] args) throws Exception {
    Options opts = new Options();
    opts.addOption("h", "help", false, "Print this help message")
            .addOption("i", "input", true, "input path")
            .addOption("o", "output", true, "output path")
            .addOption("d", "date", true, "date");

    CommandLine cmd;
    try {
        cmd = new GnuParser().parse(opts, args);
    } catch (Exception e) {
        // An unparsable command line is a failure; it used to be reported as success (0).
        e.printStackTrace();
        return 1;
    }
    if (cmd.hasOption("help")) {
        new HelpFormatter().printHelp("Usage: cmd [OPTIONS]", opts);
        return 0;
    }

    String inputStr = cmd.getOptionValue("i");
    String outputStr = cmd.getOptionValue("o");
    String date = cmd.getOptionValue("d");
    // Validate before touching the file system: a missing option would otherwise surface
    // later as a NullPointerException or as an output path containing the text "null",
    // in some cases after the output directory has already been deleted.
    if (inputStr == null || outputStr == null || date == null) {
        throw new IllegalArgumentException("Options -i (input), -o (output) and -d (date) are all required");
    }

    Configuration conf = this.getConf();
    conf.set("mapreduce.map.output.compress", "false");
    conf.set("mapreduce.output.fileoutputformat.compress", "false");
    conf.set("date", date);

    Path outputPath = new Path(outputStr + "/" + date);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }

    Job job = Job.getInstance(conf);
    for (String inputRoot : inputStr.split(";")) {
        Path datedInput = new Path(inputRoot + "/" + date);
        if (fs.exists(datedInput)) {
            System.out.println("input path:" + inputRoot + "/" + date);
            FileInputFormat.addInputPath(job, datedInput);
        }
    }
    System.out.println("output path:" + outputPath);

    job.setJarByClass(imeikeywordlocalcn.class);
    job.setJobName("imeikeywordlocalcnDocData");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    System.out.println("imeikeywordlocalcnMapper is beginning################");
    job.setMapperClass(imeikeywordlocalcn.imeikeywordlocalcnMapper.class);
    System.out.println("imeikeywordlocalcnReducer is beginning################");
    job.setReducerClass(imeikeywordlocalcn.imeikeywordlocalcnReducer.class);

    MultipleOutputs.addNamedOutput(job, "imeikeywordlocalcn1", TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addNamedOutput(job, "imeikeywordlocalcncitycnameid", TextOutputFormat.class, Text.class, Text.class);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(10);

    return job.waitForCompletion(true) ? 0 : 1;
}
/**
 * Command-line entry point.
 *
 * <p>After Hadoop's generic options are removed, three positional arguments are expected:
 * input path(s), output path and date. They are forwarded to {@link #run(String[])} as
 * {@code -i}, {@code -o} and {@code -d}. The process exits with the job's exit code so that
 * schedulers and shell scripts can detect a failed run.
 *
 * @param args generic Hadoop options followed by {@code <input> <output> <date>}
 * @throws Exception if the job cannot be configured or submitted
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 3) {
        System.err.println("Usage: imeikeywordlocalcn [generic options] <input> <output> <date>");
        System.exit(2);
    }
    String inputPath = otherArgs[0];
    String outpath = otherArgs[1];
    String date = otherArgs[2];

    conf.set("mapreduce.job.queuename", "root.offline.hdp_teu_search.normal");
    conf.set("mapreduce.task.timeout", "0");
    conf.set("RunMode", "Online");

    String[] toolArgs = {
        "-i", inputPath,
        "-o", outpath,
        "-d", date};
    // Propagate the exit code; it was previously discarded, so failures still exited with 0.
    System.exit(ToolRunner.run(conf, new imeikeywordlocalcn(), toolArgs));
}
}
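/*
 * Notes:
 * 1. The job actually creates two sub-directories under the output path:
 *    imeikeywordlocalcn1 and imeikeywordlocalcncitycnameid.
 * 2. When using multiple outputs, the MultipleOutputs object must be closed in cleanup();
 *    otherwise the output file sizes differ from run to run, which is not normal
 *    MapReduce behavior.
 */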