hive实现自定义公式解析
import org.apache.hadoop.hive.ql.exec.UDF;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Hive UDF that resolves and evaluates a custom arithmetic formula.
 *
 * evaluate(rule, replace_rule, index_value):
 *   1. substitutes nested sub-formulas into the original rule,
 *   2. replaces index codes (e.g. TPLIFE_001) with their numeric values,
 *   3. evaluates the resulting pure-arithmetic expression via {@link #process(String)}.
 *
 * By business convention a division by zero (missing index) yields 0 instead of an error.
 */
public class UDFCalculate extends UDF {

    // Pre-compiled: matches a leading number (integer or decimal, e.g. "12", "3.5").
    // Compiling it once avoids per-row regex compilation inside Hive.
    private static final Pattern NUMBER = Pattern.compile("^(\\d+\\.?\\d*)");

    // Cached JS engine: building a ScriptEngineManager on every calculate() call is
    // expensive and this UDF may run once per row.
    // NOTE(review): assumes single-threaded use per UDF instance, which matches
    // Hive's execution model — confirm if ever reused across threads.
    private static final ScriptEngine JS_ENGINE =
            new ScriptEngineManager().getEngineByName("js");

    /**
     * Entry point looked up by Hive.
     *
     * @param rule         original formula containing index codes such as TPLIFE_001
     * @param replace_rule flattened pairs "codeA,ruleA,codeB,ruleB,..."; empty string means
     *                     no substitution is needed
     * @param index_value  comma-separated values, in the same order as the codes appear
     *                     in the (substituted) formula
     * @return the computed result as a string
     */
    public String evaluate(String rule, String replace_rule, String index_value) {
        // Substitute nested sub-formulas into the original rule first.
        if (replace_rule != null && !"".equals(replace_rule)) {
            String[] split = replace_rule.split(",");
            // i + 1 bound guard: an odd number of elements (malformed replace_rule)
            // must not throw ArrayIndexOutOfBoundsException.
            for (int i = 0; i + 1 < split.length; i += 2) {
                rule = rule.replace(split[i], "(" + split[i + 1] + ")");
            }
        }
        // Replace index codes with their values.
        // Index code pattern: uppercase letters, optional digits, '_', digits.
        String[] sign = rule.split("[A-Z]+\\d*_\\d*");
        String[] value = index_value.split(",");
        StringBuilder res = new StringBuilder(); // builder instead of String += in a loop
        for (int i = 0; i < value.length; i++) {
            res.append(sign[i]).append(value[i]);
        }
        // One more operator segment than values means a trailing segment remains.
        if (sign.length > value.length) {
            res.append(sign[sign.length - 1]);
        }
        return process(res.toString());
    }

    /**
     * Evaluates a pure numeric expression with + - * / and parentheses using a stack:
     * '*' and '/' are settled eagerly, parentheses are settled on ')', and the final
     * flat +/- chain is handed to the JS engine in one shot.
     * Division by zero (NaN/Infinity from JS, or a ScriptException) yields 0.
     */
    public String process(String formula) {
        Stack<String> stack = new Stack<String>();
        int length = formula.length();
        String now_str;        // token currently being processed
        String next_str;       // next token (a number) when settling * or /
        String fore_str;       // value popped from the stack top
        String expression;     // assembled sub-expression handed to calculate()
        Object res = null;     // result of the last calculate() call
        while (length != 0) {
            // Operator or parenthesis first: consume exactly one character.
            // Original class [+-/*)(] contained the range "+-/" which accidentally
            // matched ',' and '.' as well; the escaped form below matches only
            // the five intended characters.
            if (formula.matches("^[+\\-*/()].*")) {
                now_str = formula.substring(0, 1);
                formula = formula.substring(1);
                // '+', '-', '(' are pushed and settled later.
                if ("+".equals(now_str) || "-".equals(now_str) || "(".equals(now_str)) {
                    stack.push(now_str);
                }
                // '*' and '/' bind tighter: try to settle immediately.
                if ("*".equals(now_str) || "/".equals(now_str)) {
                    Matcher matcher1 = NUMBER.matcher(formula);
                    if (matcher1.find()) { // next token is a number: settle now
                        next_str = matcher1.group(1);
                        fore_str = stack.pop();
                        expression = fore_str + now_str + next_str;
                        try {
                            res = calculate(expression);
                            // JS turns x/0 into Infinity (or NaN): normalize to 0.
                            if (!(res instanceof Integer)
                                    && (Double.isNaN((Double) res) || Double.isInfinite((Double) res))) {
                                res = 0;
                            }
                        } catch (ScriptException e) {
                            // BUGFIX: the original only printed the stack trace here,
                            // leaving res null/stale and causing an NPE at res.toString().
                            // Per the documented convention, an evaluation failure is 0.
                            res = 0;
                        }
                        stack.push(res.toString());
                        // Drop the consumed number from the formula.
                        formula = formula.substring(next_str.length());
                    } else if (formula.startsWith("(")) {
                        // '*' or '/' followed by '(': the parenthesis must be computed
                        // first, so park the operator on the stack.
                        stack.push(now_str);
                    }
                }
                if (")".equals(now_str)) { // closing parenthesis: settle one group
                    fore_str = stack.pop();
                    expression = now_str;
                    while (!"(".equals(fore_str)) { // pop until the matching '('
                        expression = fore_str + expression;
                        fore_str = stack.pop();
                    }
                    expression = fore_str + expression; // re-attach the '('
                    try {
                        res = calculate(expression);
                    } catch (Exception e) {
                        // Inside the parentheses only + and - remain ('*' and '/'
                        // were settled eagerly above); treat failure as 0.
                        res = 0;
                    }
                    // If the token before '(' was '*' or '/', settle it now —
                    // covers num1/(num2+num3+num4); failures again yield 0.
                    if (!stack.empty()) { // guard: '(' may have been the very first token
                        fore_str = stack.pop();
                        if ("*".equals(fore_str) || "/".equals(fore_str)) {
                            expression = stack.pop() + fore_str + res.toString();
                            try {
                                res = calculate(expression);
                                if (!(res instanceof Integer)
                                        && (Double.isNaN((Double) res) || Double.isInfinite((Double) res))) {
                                    res = 0;
                                }
                            } catch (ScriptException e) {
                                // BUGFIX: same NPE hazard as above — normalize to 0.
                                res = 0;
                            }
                        } else { // '+' or '-': not ours to settle, push it back
                            stack.push(fore_str);
                        }
                    }
                    stack.push(res.toString()); // push the group's result
                }
            } else { // does not start with an operator: expect a number
                Matcher matcher1 = NUMBER.matcher(formula);
                if (matcher1.find()) {
                    now_str = matcher1.group(1); // the captured leading number
                    stack.push(now_str);
                    formula = formula.substring(now_str.length());
                } else {
                    // Defensive: skip an unrecognizable character (e.g. a stray ','
                    // or '.') so the loop cannot spin forever on malformed input.
                    formula = formula.substring(1);
                }
            }
            length = formula.length();
        }
        // The stack now holds numbers joined only by + and -: flatten and evaluate once.
        StringBuilder sb = new StringBuilder();
        while (!stack.empty()) {
            sb.insert(0, stack.pop());
        }
        try {
            res = calculate(sb.toString());
        } catch (ScriptException e) {
            // Should be unreachable for well-formed input; fall back to 0 instead
            // of risking an NPE below.
            res = 0;
        }
        return res.toString();
    }

    /**
     * Evaluates an arithmetic expression with the cached JS engine.
     *
     * @return Integer or Double depending on the result (Nashorn returns Integer
     *         for integral results)
     * @throws ScriptException if the expression is not valid JavaScript
     */
    public Object calculate(String str) throws ScriptException {
        return JS_ENGINE.eval(str);
    }
}
maven 依赖:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>02_hive_自定义公式解析</artifactId>
<groupId>cn.ruanwenfu</groupId>
<version>1.0-SNAPSHOT</version>
<relativePath>../02_hive_自定义公式解析/pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hive</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
<version>1.2.0</version>
</dependency>
</dependencies>
</project>
操作步骤:
计算公式:
(TPLIFE_001+TPLIFE_002*TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)*0.75
create table rules
(
use_company string,
rule string,
index_code string
);
规则配置表rules
适用公司use_company 规则rule 待计算指标index_code
insert into rules values(‘TPPRTY’,’(TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75’,‘TPLIFE_100’);
insert into rules
select 'TPLIFE','(TPLIFE_001+TPLIFE_002*TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)*0.75','TPLIFE_100'
union all
select ‘TPLIFE’,‘TPLIFE_050+TPLIFE_051’,‘TPLIFE_001’
union all
select ‘TPLIFE’,‘TPLIFE_052+TPLIFE_053’,‘TPLIFE_050’
union all
select 'TPLIFE','TPLIFE_030*TPLIFE_031','TPLIFE_006'
union all
select 'TPPRTY','(TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75','TPPRTY_100'
增加一条数据:
insert into rules
values(‘TPLIFE’,‘TPPRTY_030*TPPRTY_031’,‘TPLIFE_006’)
REGEXP_REPLACE
语法: regexp_replace(string A, string B, string C)
操作类型: strings
返回值: string
说明: 将字符串A中的符合java正则表达式B的部分替换为C。
hive> select regexp_replace(‘h234ney’, ‘\d+’, ‘o’);
OK
honey
\d表示数字,+号表示1或n,第一个\表示转义
*整理正则规则表
select regexp_replace('(1+2)*3/4-5*0.75/(9+10)', '[+*/-]', ',');
select regexp_replace(regexp_replace('(1+2)*3/4-5*0.75/(9+10)', '[+*/-]', ','),'[()]','');
1、对公式进行正则替换
select
use_company,index_code,
regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’) as index_combination
from rules;
TPLIFE_001,TPLIFE_002,TPLIFE_003,TPLIFE_004,TPLIFE_005,TPLIFE_006,0.75
==============================
TPLIFE TPLIFE_100 TPLIFE_001,TPLIFE_002,TPLIFE_003,TPLIFE_004,TPLIFE_005,TPLIFE_006,0.75
TPLIFE TPLIFE_001 TPLIFE_050,TPLIFE_051
TPLIFE TPLIFE_050 TPLIFE_052,TPLIFE_053
TPLIFE TPLIFE_006 TPLIFE_030,TPLIFE_031
TPPRTY TPPRTY_100 TPPRTY_001,TPPRTY_002,TPPRTY_003,TPPRTY_004,TPPRTY_005,TPPRTY_006,0.75
2、将公式字符串转array数组
select use_company,index_code,split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’) from rules;
3、炸裂explode函数,适用于数组,行转列
select use_company,index_code,rule,table_alias.from_code from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code;
–报错UDTF’s are not supported outside the SELECT clause, nor nested in expressions
炸裂函数,在select子句中是不能有其他字段的,需要结合侧视图解除以上限制
select explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) from rules; --可以执行
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_001
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_002
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_003
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_004
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_005
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_006
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 0.75
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 TPLIFE_050
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 TPLIFE_051
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_052
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_053
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_030
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_031
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_001
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_002
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_003
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_004
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_005
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_006
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75 0.75
4、处理递归关系(有可能指标A还依赖与指标B的计算结果)
4.1 创建临时结果表
字段:
use_company 适用公司
index_code 目标指标编码
rule 指标加工原始规则
replace_rule 需替换规则
from_code 来源指标编码
code_value 来源指标值
create table tmp(
use_company string comment '适用公司 ',
index_code string comment '目标指标编码 ',
rule string comment '指标加工原始规则 ',
replace_rule string comment '需替换规则 ',
from_code string comment '来源指标编码 ',
code_value string comment '来源指标值 ’
);
4.2 先将原始记录插入临时表
insert overwrite table tmp
select use_company,index_code,rule,null as replace_rule,
table_alias.from_code,’’ as code_value
from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code;
select count(*) from tmp; --20 rows 这里比较耗时 能不能在SQL执行后返回影响的行数
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
)
select t1.index_code,t1.rule,t2.rule,nvl(t2.from_code,t1.from_code) as from_code,
regexp_replace(t1.rule,t2.index_code,concat(’(’,t2.rule,’)’)) as rule
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
–一次递归:上面语句有逻辑错误,不能在这里替换规则,如果指标A和指标B同时来源于其他指标,分别替换规则之后无法确定最后使用哪个规则
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
)
t1.index_code,t1.rule,nvl(t2.index_code,t1.from_code) as replace_code,
nvl(t2.rule,t1.from_code) as replace_rule,nvl(t2.from_code,t1.from_code) as from_code
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
4.3第一次进行递归,自关联(自关联时考虑null值,关联列为null值怎么办)
with tmp1 as
(
select * from tmp
)
insert overwrite table tmp
select
t1.use_company, --适用公司
t1.index_code, --需加工目标指标编码
t1.rule, --指标加工原始规则
concat_ws(’,’,t1.replace_rule,t2.index_code,t2.rule) as replace_rule, --将改行之前的替换规则和当前的替换规则进行组合 格式 替换指标,替换规则
nvl(t2.from_code,t1.from_code) as from_code, --来源指标,如果t2为空,说明t1来源指标已经是原始指标,无需替换
‘’ as index_value
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
4.4再次递归
如果select count(*) from tmp; 得到的记录数在增大的话,需要继续递归
with tmp1 as
(
select * from tmp
)
insert overwrite table tmp
select
t1.use_company, --适用公司
t1.index_code, --需加工目标指标编码
t1.rule, --指标加工原始规则
concat_ws(’,’,t1.replace_rule,t2.index_code,t2.rule) as replace_rule, --将改行之前的替换规则和当前的替换规则进行组合 格式 替换指标,替换规则
nvl(t2.from_code,t1.from_code) as from_code, --来源指标,如果t2为空,说明t1来源指标已经是原始指标,无需替换
‘’ as index_value
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
4.5再次递归查看是否还需要继续递归
with tmp1 as
(
select * from tmp
)
insert overwrite table tmp
select
t1.use_company, --适用公司
t1.index_code, --需加工目标指标编码
t1.rule, --指标加工原始规则
concat_ws(’,’,t1.replace_rule,t2.index_code,t2.rule) as replace_rule, --将改行之前的替换规则和当前的替换规则进行组合 格式 替换指标,替换规则
nvl(t2.from_code,t1.from_code) as from_code, --来源指标,如果t2为空,说明t1来源指标已经是原始指标,无需替换
‘’ as index_value
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
不会出现错误结果,行数不会再增长
hive 开启列名显示
set hive.cli.print.header=true;
set hive.resultset.use.unique.column.names=false;
–二次循环,二次递归
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
),
tmp2 as (
select t1.index_code,t1.use_company,t1.rule,concat(nvl(t2.index_code,t1.from_code),nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company
)
select t1.index_code,t1.rule,concat(t1.transformer,’,’,nvl(t2.index_code,t1.from_code),nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp2 t1
left join tmp2 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
每次递归将结果写入一个表中,判断该表的count(),如果表的count()还在持续增长说明递归没有结束,
可以设置一个递归最大次数,避免递归层数过多
具体实现放在后面
5、关联指标表获取每个指标的value值
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
),
tmp2 as (
select t1.index_code,t1.use_company,t1.rule,concat(nvl(t2.index_code,t1.from_code),’,’,nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company
)
insert overwrite table result
select t1.index_code,t1.rule,concat(t1.transformer,’,’,nvl(t2.index_code,t1.from_code),’,’,nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp2 t1
left join tmp2 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
create table result
(
index_code string,
origin_rule string,
replace_rule string,
from_code string
);
select r.*,i.index_value from result r
left join fact i on r.use_company=i.use_company and r.from_index=i.index_code;
模拟关联数据:
select r.*,0 from result r;
创建指标表:
create table index_value_table(
use_company string,
index_code string,
index_value string
)
插入指标数据:
insert into index_value_table
select ‘TPLIFE’,‘TPLIFE_052’,‘1.2’
union all
select ‘TPLIFE’,‘TPLIFE_053’,‘5.7’
union all
select ‘TPLIFE’,‘TPLIFE_051’,‘3.2’
union all
select ‘TPLIFE’,‘TPLIFE_002’,‘7.7’
union all
select ‘TPLIFE’,‘TPLIFE_003’,‘10’
union all
select ‘TPLIFE’,‘TPLIFE_004’,‘9.7’
union all
select ‘TPLIFE’,‘TPLIFE_005’,‘1.2’
union all
select ‘TPLIFE’,‘TPLIFE_030’,‘6.6’
union all
select ‘TPLIFE’,‘TPLIFE_031’,‘5.3’
union all
select ‘TPPRTY’,‘TPPRTY_001’,‘1.2’
union all
select ‘TPPRTY’,‘TPPRTY_002’,‘8.8’
union all
select ‘TPPRTY’,‘TPPRTY_003’,‘7.5’
union all
select ‘TPPRTY’,‘TPPRTY_004’,‘4.4’
union all
select ‘TPPRTY’,‘TPPRTY_005’,‘2.2’
insert overwrite table tmp
select
t1.use_company,
t1.index_code,
t1.rule,
t1.replace_rule,
t1.from_code,
t2.index_value as code_value
from
tmp t1
left join index_value_table t2
on t1.use_company=t2.use_company and t1.from_code=t2.index_code;
6、将多行合并成为一行
use_company index_code rule replace_rule from_code code_value
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_001,TPLIFE_050+TPLIFE_051,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_052 1.2
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_001,TPLIFE_050+TPLIFE_051,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_053 5.7
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_001,TPLIFE_050+TPLIFE_051 TPLIFE_051 3.2
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_002 7.7
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_003 10
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_004 9.7
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_005 1.2
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_006,TPLIFE_030TPLIFE_031 TPLIFE_030 6.6
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_006,TPLIFE_030TPLIFE_031 TPLIFE_031 5.3
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 0.75 NULL
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 ,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_052 1.2
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 ,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_053 5.7
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 TPLIFE_051 3.2
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_052 1.2
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_053 5.7
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_030 6.6
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_031 5.3
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_001 1.2
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_002 8.8
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_003 7.5
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_004 4.4
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_005 2.2
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_006 6.5
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75 0.75 NULL
根据需要加工的index_code以及use_company进行分组
select
use_company,index_code,rule,concat_ws(’,’,collect_set(case when replace_rule<>’’ then replace_rule else null end)) as replace_rule,
concat_ws(’,’,collect_list(case when code_value is null and from_code rlike ‘^\w’ then ‘0’ else code_value end)) as index_value
from tmp
group by use_company,index_code,rule;
collect_list 不去重
collect_set 去重
7、通过自定义函数UDF进行指标加工
calculate(rule,replace_rule,index_value)
select
t.use_company,
t.index_code,
formula_parser(t.rule,nvl(t.replace_rule,’’),t.index_value) as index_value
from
(
select
use_company,index_code,rule,concat_ws(’,’,collect_set(case when replace_rule<>’’ then replace_rule else null end)) as replace_rule,
concat_ws(’,’,collect_list(case when code_value is null and from_code like ‘T%’ then ‘0’ else code_value end)) as index_value
from tmp
group by use_company,index_code,rule
) t;
java内置的运算类:
String str = “43*(2 + 1.4)+232/0+9/(11*0)”;
ScriptEngineManager manager = new ScriptEngineManager();
ScriptEngine engine = manager.getEngineByName(“js”);
Object result = engine.eval(str);
System.out.println(“结果类型:” + result.getClass().getName() + “,计算结果:” + result);
但是不符合要求,不好处理除数为0的情况
已完成自定义四则运算对象
8、如何保证value的顺序和formula的顺序是一样的?
用map?
hive中的map数据结构
{“name”:“zhangsan”,“age”:“25”,“addr”:“bj”}
不用,collect_list的时候顺序和公式里面的顺序是一样的。
9、HIVE UDF函数写法
9.1新建java工程,导入hive相关包,导入hive相关的lib 创建一个新的maven module
配置maven依赖:
org.apache.hive
hive-exec
1.2.0
org.apache.hive
hive-common
1.2.0
定义一个类
cn.ruanwenfu.hive.Repeat;
实现其中的evaluate方法即可
package cn.tedu.hive;
import org.apache.hadoop.hive.ql.exec.UDF;
// abc 5 -> abcabcabcabcabc
// Example Hive UDF: abc 5 -> abcabcabcabcabc
public class Repeat extends UDF {
    // Hive looks up this evaluate method by name; parameter and return types
    // are chosen to fit the use case.
    /**
     * Concatenates {@code str} with itself {@code n} times.
     *
     * @param str string to repeat
     * @param n   repetition count, must be >= 1
     * @return the repeated string
     * @throws IllegalArgumentException if n < 1
     */
    public String evaluate(String str, int n) {
        if (n < 1)
            throw new IllegalArgumentException("个数至少为1!!!");
        // StringBuilder instead of StringBuffer: single-threaded use,
        // no need for StringBuffer's synchronization overhead.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            sb.append(str);
        }
        return sb.toString();
    }
}
如何打包:
1、Project Structure
2、Artifacts 选择加号 选择JAR 选择From modules with dependencies
进入create jar from modules
Module选择刚才写的module
其他地方不用选,没有Main Class,选择extract to the target JAR
选择OK
选择APPLY
选择Build
Build Artifacts
选择刚才的module名.jar
选择Build
在out目录下出现了对应的jar包
上传至linux,随便上传一个目录
为了保证上传的jar包中的META-INF目录下的文件校验通过,在linux窗口执行zip -d 02_hive_自定义函数udf.jar 'META-INF/*.SF' 'META-INF/*.RSA' 'META-INF/*SF'
在Hive中:
添加jar包:add jar /home/software/02_hive_自定义函数udf.jar;
创建临时函数,给函数起名,并且绑定这个函数对应的类:create temporary function repeatstring as ‘cn.tedu.hive.Repeat’;
调用函数:select repeatstring(“abc”, 5);
create temporary function formula_parser as ‘UDFCalculate’
9.2创建类继承UDF
9.3自己编写一个evaluate方法,返回值和参数任意
import org.apache.hadoop.hive.ql.exec.UDF;
public class ToUpper extends UDF{
    /**
     * Upper-cases the input string.
     * Returns null for null input: Hive passes Java null for SQL NULL, and the
     * conventional UDF behavior is NULL-in, NULL-out rather than an NPE.
     *
     * @param str input string, may be null
     * @return upper-cased string, or null if the input was null
     */
    public String evaluate(String str){
        if (str == null) {
            return null;
        }
        // NOTE(review): uses the default locale; consider toUpperCase(Locale.ROOT)
        // if this may run under locales with special casing rules (e.g. tr_TR).
        return str.toUpperCase();
    }
}
9.4为了能让mapreduce处理,String要用Text处理。(Hadoop 的 MapReduce 框架使用自己的 Writable 序列化体系传输数据,Text 是 String 对应的 Writable 封装类,可被框架高效地序列化/反序列化并复用对象;直接用 String 时 Hive 会自动做转换,但自定义 GenericUDF 等场景推荐使用 Text)
9.5 将写好的类打成jar包,上传到linux中
在hive命令行下,向hive注册UDF:add jar /xxxx/xxxx.jar
例如add jar /root/work/data/udf.jar
9.6在hive命令行下,为当前udf起一个名字:create temporary function fname as ‘类的全路径名’;
create temporary function fname as ‘类的全路径名’;
create temporary function mytoup as ‘cn.ruanwenfu.udf.ToUpper’;
9.7直接在hsql中使用该函数
select mytoup(‘dsafjdksafjl’);