hive实现自定义公式解析
import org.apache.hadoop.hive.ql.exec.UDF;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Hive UDF that resolves and evaluates a custom arithmetic formula.
 *
 * evaluate(rule, replace_rule, index_value):
 *   1. substitutes nested sub-formulas into the original rule,
 *   2. replaces index codes (e.g. TPLIFE_001) with their numeric values,
 *   3. evaluates the resulting pure-arithmetic expression via {@link #process(String)}.
 *
 * By business convention a division by zero (missing index) yields 0 instead of an error.
 */
public class UDFCalculate extends UDF {

    // Pre-compiled: matches a leading number (integer or decimal, e.g. "12", "3.5").
    // Compiling it once avoids per-row regex compilation inside Hive.
    private static final Pattern NUMBER = Pattern.compile("^(\\d+\\.?\\d*)");

    // Cached JS engine: building a ScriptEngineManager on every calculate() call is
    // expensive and this UDF may run once per row.
    // NOTE(review): assumes single-threaded use per UDF instance, which matches
    // Hive's execution model — confirm if ever reused across threads.
    private static final ScriptEngine JS_ENGINE =
            new ScriptEngineManager().getEngineByName("js");

    /**
     * Entry point looked up by Hive.
     *
     * @param rule         original formula containing index codes such as TPLIFE_001
     * @param replace_rule flattened pairs "codeA,ruleA,codeB,ruleB,..."; empty string means
     *                     no substitution is needed
     * @param index_value  comma-separated values, in the same order as the codes appear
     *                     in the (substituted) formula
     * @return the computed result as a string
     */
    public String evaluate(String rule, String replace_rule, String index_value) {
        // Substitute nested sub-formulas into the original rule first.
        if (replace_rule != null && !"".equals(replace_rule)) {
            String[] split = replace_rule.split(",");
            // i + 1 bound guard: an odd number of elements (malformed replace_rule)
            // must not throw ArrayIndexOutOfBoundsException.
            for (int i = 0; i + 1 < split.length; i += 2) {
                rule = rule.replace(split[i], "(" + split[i + 1] + ")");
            }
        }
        // Replace index codes with their values.
        // Index code pattern: uppercase letters, optional digits, '_', digits.
        String[] sign = rule.split("[A-Z]+\\d*_\\d*");
        String[] value = index_value.split(",");
        StringBuilder res = new StringBuilder(); // builder instead of String += in a loop
        for (int i = 0; i < value.length; i++) {
            res.append(sign[i]).append(value[i]);
        }
        // One more operator segment than values means a trailing segment remains.
        if (sign.length > value.length) {
            res.append(sign[sign.length - 1]);
        }
        return process(res.toString());
    }

    /**
     * Evaluates a pure numeric expression with + - * / and parentheses using a stack:
     * '*' and '/' are settled eagerly, parentheses are settled on ')', and the final
     * flat +/- chain is handed to the JS engine in one shot.
     * Division by zero (NaN/Infinity from JS, or a ScriptException) yields 0.
     */
    public String process(String formula) {
        Stack<String> stack = new Stack<String>();
        int length = formula.length();
        String now_str;        // token currently being processed
        String next_str;       // next token (a number) when settling * or /
        String fore_str;       // value popped from the stack top
        String expression;     // assembled sub-expression handed to calculate()
        Object res = null;     // result of the last calculate() call
        while (length != 0) {
            // Operator or parenthesis first: consume exactly one character.
            // Original class [+-/*)(] contained the range "+-/" which accidentally
            // matched ',' and '.' as well; the escaped form below matches only
            // the five intended characters.
            if (formula.matches("^[+\\-*/()].*")) {
                now_str = formula.substring(0, 1);
                formula = formula.substring(1);
                // '+', '-', '(' are pushed and settled later.
                if ("+".equals(now_str) || "-".equals(now_str) || "(".equals(now_str)) {
                    stack.push(now_str);
                }
                // '*' and '/' bind tighter: try to settle immediately.
                if ("*".equals(now_str) || "/".equals(now_str)) {
                    Matcher matcher1 = NUMBER.matcher(formula);
                    if (matcher1.find()) { // next token is a number: settle now
                        next_str = matcher1.group(1);
                        fore_str = stack.pop();
                        expression = fore_str + now_str + next_str;
                        try {
                            res = calculate(expression);
                            // JS turns x/0 into Infinity (or NaN): normalize to 0.
                            if (!(res instanceof Integer)
                                    && (Double.isNaN((Double) res) || Double.isInfinite((Double) res))) {
                                res = 0;
                            }
                        } catch (ScriptException e) {
                            // BUGFIX: the original only printed the stack trace here,
                            // leaving res null/stale and causing an NPE at res.toString().
                            // Per the documented convention, an evaluation failure is 0.
                            res = 0;
                        }
                        stack.push(res.toString());
                        // Drop the consumed number from the formula.
                        formula = formula.substring(next_str.length());
                    } else if (formula.startsWith("(")) {
                        // '*' or '/' followed by '(': the parenthesis must be computed
                        // first, so park the operator on the stack.
                        stack.push(now_str);
                    }
                }
                if (")".equals(now_str)) { // closing parenthesis: settle one group
                    fore_str = stack.pop();
                    expression = now_str;
                    while (!"(".equals(fore_str)) { // pop until the matching '('
                        expression = fore_str + expression;
                        fore_str = stack.pop();
                    }
                    expression = fore_str + expression; // re-attach the '('
                    try {
                        res = calculate(expression);
                    } catch (Exception e) {
                        // Inside the parentheses only + and - remain ('*' and '/'
                        // were settled eagerly above); treat failure as 0.
                        res = 0;
                    }
                    // If the token before '(' was '*' or '/', settle it now —
                    // covers num1/(num2+num3+num4); failures again yield 0.
                    if (!stack.empty()) { // guard: '(' may have been the very first token
                        fore_str = stack.pop();
                        if ("*".equals(fore_str) || "/".equals(fore_str)) {
                            expression = stack.pop() + fore_str + res.toString();
                            try {
                                res = calculate(expression);
                                if (!(res instanceof Integer)
                                        && (Double.isNaN((Double) res) || Double.isInfinite((Double) res))) {
                                    res = 0;
                                }
                            } catch (ScriptException e) {
                                // BUGFIX: same NPE hazard as above — normalize to 0.
                                res = 0;
                            }
                        } else { // '+' or '-': not ours to settle, push it back
                            stack.push(fore_str);
                        }
                    }
                    stack.push(res.toString()); // push the group's result
                }
            } else { // does not start with an operator: expect a number
                Matcher matcher1 = NUMBER.matcher(formula);
                if (matcher1.find()) {
                    now_str = matcher1.group(1); // the captured leading number
                    stack.push(now_str);
                    formula = formula.substring(now_str.length());
                } else {
                    // Defensive: skip an unrecognizable character (e.g. a stray ','
                    // or '.') so the loop cannot spin forever on malformed input.
                    formula = formula.substring(1);
                }
            }
            length = formula.length();
        }
        // The stack now holds numbers joined only by + and -: flatten and evaluate once.
        StringBuilder sb = new StringBuilder();
        while (!stack.empty()) {
            sb.insert(0, stack.pop());
        }
        try {
            res = calculate(sb.toString());
        } catch (ScriptException e) {
            // Should be unreachable for well-formed input; fall back to 0 instead
            // of risking an NPE below.
            res = 0;
        }
        return res.toString();
    }

    /**
     * Evaluates an arithmetic expression with the cached JS engine.
     *
     * @return Integer or Double depending on the result (Nashorn returns Integer
     *         for integral results)
     * @throws ScriptException if the expression is not valid JavaScript
     */
    public Object calculate(String str) throws ScriptException {
        return JS_ENGINE.eval(str);
    }
}
maven 依赖:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>02_hive_自定义公式解析</artifactId>
<groupId>cn.ruanwenfu</groupId>
<version>1.0-SNAPSHOT</version>
<relativePath>../02_hive_自定义公式解析/pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hive</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
<version>1.2.0</version>
</dependency>
</dependencies>
</project>
操作步骤:
计算公式:
(TPLIFE_001+TPLIFE_002*TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)*0.75
create table rules
(
use_company string,
rule string,
index_code string
);
规则配置表rules
适用公司use_company 规则rule 待计算指标index_code
insert into rules values(‘TPPRTY’,’(TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75’,‘TPLIFE_100’);
insert into rules
select 'TPLIFE','(TPLIFE_001+TPLIFE_002*TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)*0.75','TPLIFE_100'
union all
select ‘TPLIFE’,‘TPLIFE_050+TPLIFE_051’,‘TPLIFE_001’
union all
select ‘TPLIFE’,‘TPLIFE_052+TPLIFE_053’,‘TPLIFE_050’
union all
select 'TPLIFE','TPLIFE_030*TPLIFE_031','TPLIFE_006'
union all
select 'TPPRTY','(TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75','TPPRTY_100'
增加一条数据:
insert into rules
values(‘TPLIFE’,‘TPPRTY_030*TPPRTY_031’,‘TPLIFE_006’)
REGEXP_REPLACE
语法: regexp_replace(string A, string B, string C)
操作类型: strings
返回值: string
说明: 将字符串A中的符合java正则表达式B的部分替换为C。
hive> select regexp_replace(‘h234ney’, ‘\d+’, ‘o’);
OK
honey
\d表示数字,+号表示1或n,第一个\表示转义
*整理正则规则表
select regexp_replace('(1+2)*3/4-5*0.75/(9+10)', '[+*/-]', ',');
select regexp_replace(regexp_replace('(1+2)*3/4-5*0.75/(9+10)', '[+*/-]', ','),'[()]','');
1、对公式进行正则替换
select
use_company,index_code,
regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’) as index_combination
from rules;
TPLIFE_001,TPLIFE_002,TPLIFE_003,TPLIFE_004,TPLIFE_005,TPLIFE_006,0.75
==============================
TPLIFE TPLIFE_100 TPLIFE_001,TPLIFE_002,TPLIFE_003,TPLIFE_004,TPLIFE_005,TPLIFE_006,0.75
TPLIFE TPLIFE_001 TPLIFE_050,TPLIFE_051
TPLIFE TPLIFE_050 TPLIFE_052,TPLIFE_053
TPLIFE TPLIFE_006 TPLIFE_030,TPLIFE_031
TPPRTY TPPRTY_100 TPPRTY_001,TPPRTY_002,TPPRTY_003,TPPRTY_004,TPPRTY_005,TPPRTY_006,0.75
2、将公式字符串转array数组
select use_company,index_code,split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’) from rules;
3、炸裂explode函数,适用于数组,行转列
select use_company,index_code,rule,table_alias.from_code from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code;
–报错UDTF’s are not supported outside the SELECT clause, nor nested in expressions
炸裂函数,在select子句中是不能有其他字段的,需要结合侧视图解除以上限制
select explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) from rules; --可以执行
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_001
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_002
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_003
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_004
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_005
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_006
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 0.75
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 TPLIFE_050
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 TPLIFE_051
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_052
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_053
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_030
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_031
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_001
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_002
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_003
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_004
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_005
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_006
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75 0.75
4、处理递归关系(有可能指标A还依赖与指标B的计算结果)
4.1 创建临时结果表
字段:
use_company 适用公司
index_code 目标指标编码
rule 指标加工原始规则
replace_rule 需替换规则
from_code 来源指标编码
code_value 来源指标值
create table tmp(
use_company string comment '适用公司 ',
index_code string comment '目标指标编码 ',
rule string comment '指标加工原始规则 ',
replace_rule string comment '需替换规则 ',
from_code string comment '来源指标编码 ',
code_value string comment '来源指标值 ’
);
4.2 先将原始记录插入临时表
insert overwrite table tmp
select use_company,index_code,rule,null as replace_rule,
table_alias.from_code,’’ as code_value
from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code;
select count(*) from tmp; --20 rows 这里比较耗时 能不能在SQL执行后返回影响的行数
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
)
select t1.index_code,t1.rule,t2.rule,nvl(t2.from_code,t1.from_code) as from_code,
regexp_replace(t1.rule,t2.index_code,concat(’(’,t2.rule,’)’)) as rule
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
–一次递归:上面语句有逻辑错误,不能在这里替换规则,如果指标A和指标B同时来源于其他指标,分别替换规则之后无法确定最后使用哪个规则
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
)
t1.index_code,t1.rule,nvl(t2.index_code,t1.from_code) as replace_code,
nvl(t2.rule,t1.from_code) as replace_rule,nvl(t2.from_code,t1.from_code) as from_code
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
4.3第一次进行递归,自关联(自关联时考虑null值,关联列为null值怎么办)
with tmp1 as
(
select * from tmp
)
insert overwrite table tmp
select
t1.use_company, --适用公司
t1.index_code, --需加工目标指标编码
t1.rule, --指标加工原始规则
concat_ws(’,’,t1.replace_rule,t2.index_code,t2.rule) as replace_rule, --将改行之前的替换规则和当前的替换规则进行组合 格式 替换指标,替换规则
nvl(t2.from_code,t1.from_code) as from_code, --来源指标,如果t2为空,说明t1来源指标已经是原始指标,无需替换
‘’ as index_value
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
4.4再次递归
如果select count(*) from tmp; 得到的记录数在增大的话,需要继续递归
with tmp1 as
(
select * from tmp
)
insert overwrite table tmp
select
t1.use_company, --适用公司
t1.index_code, --需加工目标指标编码
t1.rule, --指标加工原始规则
concat_ws(’,’,t1.replace_rule,t2.index_code,t2.rule) as replace_rule, --将改行之前的替换规则和当前的替换规则进行组合 格式 替换指标,替换规则
nvl(t2.from_code,t1.from_code) as from_code, --来源指标,如果t2为空,说明t1来源指标已经是原始指标,无需替换
‘’ as index_value
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
4.5再次递归查看是否还需要继续递归
with tmp1 as
(
select * from tmp
)
insert overwrite table tmp
select
t1.use_company, --适用公司
t1.index_code, --需加工目标指标编码
t1.rule, --指标加工原始规则
concat_ws(’,’,t1.replace_rule,t2.index_code,t2.rule) as replace_rule, --将改行之前的替换规则和当前的替换规则进行组合 格式 替换指标,替换规则
nvl(t2.from_code,t1.from_code) as from_code, --来源指标,如果t2为空,说明t1来源指标已经是原始指标,无需替换
‘’ as index_value
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
不会出现错误结果,行数不会再增长
hive 开启列名显示
set hive.cli.print.header=true;
set hive.resultset.use.unique.column.names=false;
–二次循环,二次递归
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
),
tmp2 as (
select t1.index_code,t1.use_company,t1.rule,concat(nvl(t2.index_code,t1.from_code),nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company
)
select t1.index_code,t1.rule,concat(t1.transformer,’,’,nvl(t2.index_code,t1.from_code),nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp2 t1
left join tmp2 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
每次递归将结果写入一个表中,判断该表的count(),如果表的count()还在持续增长说明递归没有结束,
可以设置一个递归最大次数,避免递归层数过多
具体实现放在后面
5、关联指标表获取每个指标的value值
with tmp1 as(
select use_company,index_code,table_alias.from_code,rule from rules
lateral view explode(split(regexp_replace(regexp_replace(rule, ‘[+*/-]’, ‘,’),’[()]’,’’),’,’)) table_alias as from_code
),
tmp2 as (
select t1.index_code,t1.use_company,t1.rule,concat(nvl(t2.index_code,t1.from_code),’,’,nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp1 t1
left join tmp1 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company
)
insert overwrite table result
select t1.index_code,t1.rule,concat(t1.transformer,’,’,nvl(t2.index_code,t1.from_code),’,’,nvl(t2.rule,t1.from_code)) as transformer,
nvl(t2.from_code,t1.from_code) as from_code
from tmp2 t1
left join tmp2 t2 on t1.from_code=t2.index_code and t1.use_company=t2.use_company;
create table result
(
index_code string,
origin_rule string,
replace_rule string,
from_code string
);
select r.*,i.index_value from result r
left join fact i on r.use_company=i.use_company and r.from_index=i.index_code;
模拟关联数据:
select r.*,0 from result r;
创建指标表:
create table index_value_table(
use_company string,
index_code string,
index_value string
)
插入指标数据:
insert into index_value_table
select ‘TPLIFE’,‘TPLIFE_052’,‘1.2’
union all
select ‘TPLIFE’,‘TPLIFE_053’,‘5.7’
union all
select ‘TPLIFE’,‘TPLIFE_051’,‘3.2’
union all
select ‘TPLIFE’,‘TPLIFE_002’,‘7.7’
union all
select ‘TPLIFE’,‘TPLIFE_003’,‘10’
union all
select ‘TPLIFE’,‘TPLIFE_004’,‘9.7’
union all
select ‘TPLIFE’,‘TPLIFE_005’,‘1.2’
union all
select ‘TPLIFE’,‘TPLIFE_030’,‘6.6’
union all
select ‘TPLIFE’,‘TPLIFE_031’,‘5.3’
union all
select ‘TPPRTY’,‘TPPRTY_001’,‘1.2’
union all
select ‘TPPRTY’,‘TPPRTY_002’,‘8.8’
union all
select ‘TPPRTY’,‘TPPRTY_003’,‘7.5’
union all
select ‘TPPRTY’,‘TPPRTY_004’,‘4.4’
union all
select ‘TPPRTY’,‘TPPRTY_005’,‘2.2’
insert overwrite table tmp
select
t1.use_company,
t1.index_code,
t1.rule,
t1.replace_rule,
t1.from_code,
t2.index_value as code_value
from
tmp t1
left join index_value_table t2
on t1.use_company=t2.use_company and t1.from_code=t2.index_code;
6、将多行合并成为一行
use_company index_code rule replace_rule from_code code_value
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_001,TPLIFE_050+TPLIFE_051,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_052 1.2
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_001,TPLIFE_050+TPLIFE_051,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_053 5.7
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_001,TPLIFE_050+TPLIFE_051 TPLIFE_051 3.2
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_002 7.7
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_003 10
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_004 9.7
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 TPLIFE_005 1.2
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_006,TPLIFE_030TPLIFE_031 TPLIFE_030 6.6
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 ,TPLIFE_006,TPLIFE_030TPLIFE_031 TPLIFE_031 5.3
TPLIFE TPLIFE_100 (TPLIFE_001+TPLIFE_002TPLIFE_003)/(TPLIFE_004/TPLIFE_005-TPLIFE_006)0.75 0.75 NULL
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 ,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_052 1.2
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 ,TPLIFE_050,TPLIFE_052+TPLIFE_053 TPLIFE_053 5.7
TPLIFE TPLIFE_001 TPLIFE_050+TPLIFE_051 TPLIFE_051 3.2
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_052 1.2
TPLIFE TPLIFE_050 TPLIFE_052+TPLIFE_053 TPLIFE_053 5.7
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_030 6.6
TPLIFE TPLIFE_006 TPLIFE_030TPLIFE_031 TPLIFE_031 5.3
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002*TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_001 1.2
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_002 8.8
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_003 7.5
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_004 4.4
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_005 2.2
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)0.75 TPPRTY_006 6.5
TPPRTY TPPRTY_100 (TPPRTY_001+TPPRTY_002TPPRTY_003)/(TPPRTY_004/TPPRTY_005-TPPRTY_006)*0.75 0.75 NULL
根据需要加工的index_code以及use_company进行分组
select
use_company,index_code,rule,concat_ws(’,’,collect_set(case when replace_rule<>’’ then replace_rule else null end)) as replace_rule,
concat_ws(’,’,collect_list(case when code_value is null and from_code rlike ‘^\w’ then ‘0’ else code_value end)) as index_value
from tmp
group by use_company,index_code,rule;
collect_list 不去重
collect_set 去重
7、通过自定义函数UDF进行指标加工
calculate(rule,replace_rule,index_value)
select
t.use_company,
t.index_code,
formula_parser(t.rule,nvl(t.replace_rule,’’),t.index_value) as index_value
from
(
select
use_company,index_code,rule,concat_ws(’,’,collect_set(case when replace_rule<>’’ then replace_rule else null end)) as replace_rule,
concat_ws(’,’,collect_list(case when code_value is null and from_code like ‘T%’ then ‘0’ else code_value end)) as index_value
from tmp
group by use_company,index_code,rule
) t;
java内置的运算类:
String str = “43*(2 + 1.4)+232/0+9/(11*0)”;
ScriptEngineManager manager = new ScriptEngineManager();
ScriptEngine engine = manager.getEngineByName(“js”);
Object result = engine.eval(str);
System.out.println(“结果类型:” + result.getClass().getName() + “,计算结果:” + result);
但是不符合要求,不好处理除数为0的情况
已完成自定义四则运算对象
8、如何保证value的顺序和formula的顺序是一样的?
用map?
hive中的map数据结构
{“name”:“zhangsan”,“age”:“25”,“addr”:“bj”}
不用,collect_list的时候顺序和公式里面的顺序是一样的。
9、HIVE UDF函数写法
9.1新建java工程,导入hive相关包,导入hive相关的lib 创建一个新的maven module
配置maven依赖:
org.apache.hive
hive-exec
1.2.0
org.apache.hive
hive-common
1.2.0
定义一个类
cn.ruanwenfu.hive.Repeat;
实现其中的evaluate方法即可
package cn.tedu.hive;
import org.apache.hadoop.hive.ql.exec.UDF;
// abc 5 -> abcabcabcabcabc
// Example Hive UDF: abc 5 -> abcabcabcabcabc
public class Repeat extends UDF {
    // Hive looks up this evaluate method by name; parameter and return types
    // are chosen to fit the use case.
    /**
     * Concatenates {@code str} with itself {@code n} times.
     *
     * @param str string to repeat
     * @param n   repetition count, must be >= 1
     * @return the repeated string
     * @throws IllegalArgumentException if n < 1
     */
    public String evaluate(String str, int n) {
        if (n < 1)
            throw new IllegalArgumentException("个数至少为1!!!");
        // StringBuilder instead of StringBuffer: single-threaded use,
        // no need for StringBuffer's synchronization overhead.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            sb.append(str);
        }
        return sb.toString();
    }
}
如何打包:
1、Project Structure
2、Artifacts 选择加号 选择JAR 选择From modules with dependencies
进入create jar from modules
Module选择刚才写的module
其他地方不用选,没有Main Class,选择extract to the target JAR
选择OK
选择APPLY
选择Build
Build Artifacts
选择刚才的module名.jar
选择Build
在out目录下出现了对应的jar包
上传至linux,随便上传一个目录
为了保证上传的jar包中的META-INF目录下的文件校验通过,在linux窗口执行zip -d 02_hive_自定义函数udf.jar 'META-INF/*.SF' 'META-INF/*.RSA' 'META-INF/*SF'
在Hive中:
添加jar包:add jar /home/software/02_hive_自定义函数udf.jar;
创建临时函数,给函数起名,并且绑定这个函数对应的类:create temporary function repeatstring as ‘cn.tedu.hive.Repeat’;
调用函数:select repeatstring(“abc”, 5);
create temporary function formula_parser as ‘UDFCalculate’
9.2创建类继承UDF
9.3自己编写一个evaluate方法,返回值和参数任意
import org.apache.hadoop.hive.ql.exec.UDF;
public class ToUpper extends UDF{
    /**
     * Upper-cases the input string.
     * Returns null for null input: Hive passes Java null for SQL NULL, and the
     * conventional UDF behavior is NULL-in, NULL-out rather than an NPE.
     *
     * @param str input string, may be null
     * @return upper-cased string, or null if the input was null
     */
    public String evaluate(String str){
        if (str == null) {
            return null;
        }
        // NOTE(review): uses the default locale; consider toUpperCase(Locale.ROOT)
        // if this may run under locales with special casing rules (e.g. tr_TR).
        return str.toUpperCase();
    }
}
9.4为了能让mapreduce处理,String要用Text处理。(Hadoop 的 MapReduce 框架使用自己的 Writable 序列化体系传输数据,Text 是 String 对应的 Writable 封装类,可被框架高效地序列化/反序列化并复用对象;直接用 String 时 Hive 会自动做转换,但自定义 GenericUDF 等场景推荐使用 Text)
9.5 将写好的类打成jar包,上传到linux中
在hive命令行下,向hive注册UDF:add jar /xxxx/xxxx.jar
例如add jar /root/work/data/udf.jar
9.6在hive命令行下,为当前udf起一个名字:create temporary function fname as ‘类的全路径名’;
create temporary function fname as ‘类的全路径名’;
create temporary function mytoup as ‘cn.ruanwenfu.udf.ToUpper’;
9.7直接在hsql中使用该函数
select mytoup(‘dsafjdksafjl’);