UDF:
第一个udf ---- 去除引号
add jar /opt/datas/hiveudf2.jar ;
create temporary function my_removequotes as "com.beifeng.senior.hive.udf.RemoveQuotesUDF" ;
insert overwrite table default.bf_log_comm select my_removequotes(remote_addr), my_removequotes(time_local), my_removequotes(request), my_removequotes(http_referer) from default.bf_log_src ;
select * from bf_log_comm limit 5 ;
--------------------
/** * 自定义UDF函数 * 传入一个json形式的字符串,获取指定字段,返回该字段的值 */
----------------
/** * 将两个字段拼接起来(使用指定的分隔符) */
-------------------
/** * random_prefix() 添加随机数前缀 * @author Administrator * */
-----------------
/** * 去除随机前缀 * @author Administrator * */
---------------------
第二个 UDF ---- 处理日期时间字段 SimpleDateFormat
31/Aug/2015:00:04:37 +0800 ----》 20150831000437
---------------------------
第三个UDF ------ 求年龄
package udf;
import com.google.common.base.Strings;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.util.Calendar;
public class birthday2Age extends UDF {
    /**
     * Computes the age in full years from a birth date string.
     *
     * @param birth birth date formatted "yyyy-MM-dd" (e.g. "1980-08-10")
     * @return age in years, or -1 when the input is null/empty or malformed
     */
    public int evaluate(String birth) {
        // 1. Reject null/empty input up front.
        if (Strings.isNullOrEmpty(birth)) {
            return -1;
        }
        // 2. Split the birth date into year, month and day.
        String[] birthdays = birth.split("-");
        if (birthdays.length != 3) {
            // Malformed date string — return the same sentinel as null/empty
            // instead of crashing the query with ArrayIndexOutOfBoundsException.
            return -1;
        }
        int birthYear;
        int birthMonth;
        int birthDay;
        try {
            birthYear = Integer.parseInt(birthdays[0]);
            birthMonth = Integer.parseInt(birthdays[1]);
            birthDay = Integer.parseInt(birthdays[2]);
        } catch (NumberFormatException e) {
            // Non-numeric component — treat like any other invalid input.
            return -1;
        }
        // 3. Current date.
        Calendar calendar = Calendar.getInstance();
        int nowYear = calendar.get(Calendar.YEAR);
        int nowMonth = calendar.get(Calendar.MONTH) + 1; // Calendar months are 0-based
        int nowDay = calendar.get(Calendar.DAY_OF_MONTH);
        // 4. Raw year difference, minus one if this year's birthday
        //    has not arrived yet.
        int age = nowYear - birthYear;
        if (nowMonth < birthMonth || (nowMonth == birthMonth && nowDay < birthDay)) {
            age -= 1;
        }
        return age;
    }

    public static void main(String[] args) {
        System.out.println(new birthday2Age().evaluate("1980-08-10"));
    }
}
------------------------
第四个UDF ---------- 解析log
package udf;
import com.google.common.base.Strings;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @Description
* @Author cqh <caoqingghai@1000phone.com>
* @Version V1.0
* @Since 1.0
* @Date 2019/4/22 10:22
*/
public class LogParser extends UDF {
    // Compiled once and reused — Pattern is immutable and thread-safe.
    // Groups: 1=client IP, 2=timestamp, 3=HTTP method, 4=request URI,
    //         5="HTTP" literal, 6=status code, 7=first word of the trailing field.
    private static final Pattern LOG_PATTERN = Pattern.compile(
            "^([0-9.]+\\d+) - - \\[(.+ \\+\\d+)\\].+(GET|POST) (.+) (HTTP)\\S+ (\\d+).+\\\"(\\w+).+$");

    /**
     * Parses one combined-format access-log line into tab-separated fields.
     * The timestamp group ("31/Jan/2012:00:02:32 +0800") is reformatted as
     * "yyyyMMdd HHmmss"; every extracted field is followed by a tab.
     *
     * @param log one raw log line
     * @return tab-joined fields (empty string when the line does not match),
     *         or null when the input is null/empty
     * @throws Exception if the matched timestamp cannot be parsed
     */
    public String evaluate(String log) throws Exception {
        if (Strings.isNullOrEmpty(log)) {
            return null;
        }
        // Accumulates the tab-separated output fields.
        StringBuilder sb = new StringBuilder();
        Matcher matcher = LOG_PATTERN.matcher(log);
        if (matcher.find()) {
            int count = matcher.groupCount();
            for (int i = 1; i <= count; i++) {
                if (i == 2) {
                    // Convert "31/Jan/2012:00:02:32 +0800" -> "20120131 000232".
                    // Must be HH (0-23): the original "hhmmss" used the 12-hour
                    // clock, turning midnight into "12xxxx".
                    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd HHmmss");
                    Date d = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH)
                            .parse(matcher.group(i));
                    sb.append(sdf.format(d)).append("\t");
                } else {
                    sb.append(matcher.group(i)).append("\t");
                }
            }
        }
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        System.out.println(new LogParser().evaluate("220.181.108.151 - - [31/Jan/2012:00:02:32 +0800] \\\"GET /home.php?mod=space&uid=158&do=album&view=me&from=space HTTP/1.1\\\" 200 8784 \\\"-\\\" \\\"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)\\\""));
    }
}
-------------------------
第五个UDF ------- 公共字段
分割line,取出json对象(判断)
遍历传入的key,从json对象取出值
sb拼接返回各key对应的值 + 事件json + 服务时间
package com.atguigu.udf;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;
public class BaseFieldUDF extends UDF {
    /**
     * Extracts the common ("cm") fields from a log line of the form
     * "serverTime|json". For each comma-separated key in jsonkeysString the
     * value is looked up in the json's "cm" object; a missing key yields an
     * empty column. The "et" event payload and the server time are appended
     * last. All columns are tab-separated.
     *
     * @param line           raw log line: epoch-millis '|' json payload
     * @param jsonkeysString comma-separated keys to pull out of "cm"
     * @return tab-joined values, or "" when the line is not well-formed
     */
    public String evaluate(String line, String jsonkeysString) {
        StringBuilder result = new StringBuilder();
        // Keys requested from the common object, e.g. mid uid vc vn l sr os ar md.
        String[] keys = jsonkeysString.split(",");
        // Split into server time (index 0) and json payload (index 1).
        String[] parts = line.split("\\|");
        // Validity check: exactly two parts and a non-blank payload.
        if (parts.length != 2 || StringUtils.isBlank(parts[1])) {
            return "";
        }
        try {
            JSONObject payload = new JSONObject(parts[1]);
            // The "cm" object holds the common fields.
            JSONObject common = payload.getJSONObject("cm");
            // One output column per requested key; blank column when absent.
            for (String rawKey : keys) {
                String key = rawKey.trim();
                if (common.has(key)) {
                    result.append(common.getString(key)).append("\t");
                } else {
                    result.append("\t");
                }
            }
            result.append(payload.getString("et")).append("\t");
            result.append(parts[0]).append("\t");
        } catch (JSONException e) {
            e.printStackTrace();
        }
        return result.toString();
    }

    public static void main(String[] args) {
        String line = "1541217850324|{\"cm\":{\"mid\":\"m7856\",\"uid\":\"u8739\",\"ln\":\"-74.8\",\"sv\":\"V2.2.2\",\"os\":\"8.1.3\",\"g\":\"P7XC9126@gmail.com\",\"nw\":\"3G\",\"l\":\"es\",\"vc\":\"6\",\"hw\":\"640*960\",\"ar\":\"MX\",\"t\":\"1541204134250\",\"la\":\"-31.7\",\"md\":\"huawei-17\",\"vn\":\"1.1.2\",\"sr\":\"O\",\"ba\":\"Huawei\"},\"ap\":\"weather\",\"et\":[{\"ett\":\"1541146624055\",\"en\":\"display\",\"kv\":{\"goodsid\":\"n4195\",\"copyright\":\"ESPN\",\"content_provider\":\"CNN\",\"extend2\":\"5\",\"action\":\"2\",\"extend1\":\"2\",\"place\":\"3\",\"showtype\":\"2\",\"category\":\"72\",\"newstype\":\"5\"}},{\"ett\":\"1541213331817\",\"en\":\"loading\",\"kv\":{\"extend2\":\"\",\"loading_time\":\"15\",\"action\":\"3\",\"extend1\":\"\",\"type1\":\"\",\"type\":\"3\",\"loading_way\":\"1\"}},{\"ett\":\"1541126195645\",\"en\":\"ad\",\"kv\":{\"entry\":\"3\",\"show_style\":\"0\",\"action\":\"2\",\"detail\":\"325\",\"source\":\"4\",\"behavior\":\"2\",\"content\":\"1\",\"newstype\":\"5\"}},{\"ett\":\"1541202678812\",\"en\":\"notification\",\"kv\":{\"ap_time\":\"1541184614380\",\"action\":\"3\",\"type\":\"4\",\"content\":\"\"}},{\"ett\":\"1541194686688\",\"en\":\"active_background\",\"kv\":{\"active_source\":\"3\"}}]}";
        String x = new BaseFieldUDF().evaluate(line, "mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,nw,ln,la,t");
        System.out.println(x);
    }
}
------------------------------------------------------
UDAF:
字符串拼接
------------------------------------------------------
UDTF:
第一个UDTF ------ 事件数据
初始化返回值和类型
遍历json数组,依次输出 事件名称、事件数据
public class EventJsonUDTF extends GenericUDTF {
    /**
     * Declares the output schema: two string columns,
     * event_name and event_json.
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("event_name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Takes the "et" json array of one input record and forwards one row
     * (event name, whole event json) per array element.
     */
    @Override
    public void process(Object[] objects) throws HiveException {
        String input = objects[0].toString();
        // Blank input produces no rows — this record is filtered out.
        if (StringUtils.isBlank(input)) {
            return;
        }
        try {
            // One element per event (e.g. ad / display / loading ...).
            // Note: the constructor throws on invalid input, it never
            // returns null, so no null-check is needed here.
            JSONArray ja = new JSONArray(input);
            for (int i = 0; i < ja.length(); i++) {
                String[] result = new String[2];
                try {
                    result[0] = ja.getJSONObject(i).getString("en"); // event name
                    result[1] = ja.getString(i);                     // whole event json
                } catch (JSONException e) {
                    // Skip malformed array elements, keep processing the rest.
                    continue;
                }
                forward(result);
            }
        } catch (JSONException e) {
            // The whole payload is not a json array — emit nothing for it.
            e.printStackTrace();
        }
    }

    /**
     * Called once when there are no more records; nothing to clean up here.
     */
    @Override
    public void close() throws HiveException {
    }
}
第二个UDTF ------
类似一行转换成多行 lateral view功能,
数据样式解析:21::Ge(1995)::Action|Comedy|Drama
package com.ghgj.hive.udf;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
* 一行转换成多行 lateral view
*/
public class MovieTypeUDTF extends GenericUDTF {
    // Inspector for the single string argument, captured in initialize().
    private PrimitiveObjectInspector stringOI = null;

    /**
     * Initialization:
     * 1. validate the argument count
     * 2. validate the argument type (must be a primitive string)
     *
     * Input sample: 21::Get Shorty (1995)::Action|Comedy|Drama
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs)
            throws UDFArgumentException {
        if (argOIs.length != 1) {
            throw new UDFArgumentException("myudtf need a string parameter");
        }
        // Check the category BEFORE casting, joined with || so that either
        // failure rejects the argument. (The original used &&, which let
        // non-string primitives through and cast non-primitive inspectors,
        // raising ClassCastException instead of UDFArgumentException.)
        if (argOIs[0].getCategory() != ObjectInspector.Category.PRIMITIVE ||
                ((PrimitiveObjectInspector) argOIs[0]).getPrimitiveCategory()
                        != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentException("myudtf need a string parameter");
        }
        stringOI = (PrimitiveObjectInspector) argOIs[0];
        // Output column names.
        List<String> fieldNames = new ArrayList<String>();
        // Output column types (all strings).
        List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("movieid");
        fieldNames.add("movietitle");
        fieldNames.add("movietype");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    /**
     * Splits one record "id::title::typeA|typeB|..." into one output row
     * (id, title, type) per movie type — a lateral-view style explode.
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // e.g. 21::Get Shorty (1995)::Action|Comedy|Drama
        String record = stringOI.getPrimitiveJavaObject(args[0]).toString();
        String[] fields = record.split("::");
        String[] movieTypes = fields[2].split("\\|");
        for (String movieType : movieTypes) {
            forward(new String[]{fields[0], fields[1], movieType});
        }
    }

    @Override
    public void close() throws HiveException {
        // No state to release.
    }
}