import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.ql.exec.UDF;
/**
 * Hive UDF that condenses a URL into a grouping key of the form
 * {@code "<source> <term>"}.
 *
 * <p>{@code <source>} is derived from the {@code utm_source} query parameter and
 * {@code <term>} is the raw (not URL-decoded) value of the {@code utm_term}
 * query parameter, or the empty string when that parameter is absent or empty.
 */
public class GroupByUrl extends UDF {

    /**
     * Captures the value of {@code utm_term}: everything after the '=' up to,
     * but not including, the next '&amp;' or the end of the input. Compiled once
     * because {@link #evaluate(String)} runs for every row.
     */
    private static final Pattern UTM_TERM = Pattern.compile("utm_term=([^&]*)");

    /**
     * Builds the grouping key for one URL.
     *
     * @param url the URL to inspect; may be {@code null}
     * @return {@code "<source> <term>"}, or {@code null} when {@code url} is {@code null}
     */
    public String evaluate(String url) {
        // A NULL column value yields NULL instead of failing with a NullPointerException.
        if (url == null) {
            return null;
        }
        return classifySource(url) + " " + extractTerm(url);
    }

    /**
     * Maps the URL to a traffic-source label: one of the known sources,
     * {@code "self"} when the URL carries no {@code utm_source} at all, or
     * {@code "none"} when {@code utm_source} is present but not a known source.
     */
    private static String classifySource(String url) {
        if (url.contains("utm_source=baidu")) {
            return "baidu";
        }
        if (url.contains("utm_source=360")) {
            return "360";
        }
        if (url.contains("utm_source=sogou")) {
            return "sogou";
        }
        if (url.contains("utm_source=bdpzpc")) {
            return "bdpzpc";
        }
        if (!url.contains("utm_source")) {
            return "self";
        }
        return "none";
    }

    /**
     * Returns the value of the first {@code utm_term} parameter, or {@code ""}
     * when the parameter is missing or has an empty value.
     */
    private static String extractTerm(String url) {
        Matcher matcher = UTM_TERM.matcher(url);
        // Reading the capture group directly (rather than splitting the match on '=')
        // keeps values that contain '=' intact and cannot index past the end of an
        // array when the value is empty.
        return matcher.find() ? matcher.group(1) : "";
    }
}
统计关键词
最新推荐文章于 2023-07-05 15:25:13 发布