Hive SQL 代码snippet

最新推荐文章于 2023-06-16 14:52:19 发布

wgdzz

最新推荐文章于 2023-06-16 14:52:19 发布

阅读量333

点赞数

分类专栏： hive 机器学习编程

本文链接：https://blog.csdn.net/wgdzz/article/details/108295397

版权

编程同时被 3 个专栏收录

19 篇文章 0 订阅

订阅专栏

机器学习

6 篇文章 0 订阅

订阅专栏

hive

4 篇文章 0 订阅

订阅专栏

Hive SQL 代码snippet

使用SQL计算gini指数

with balances as (
    select '2018-01-01' as date, balance
    from unnest([1,2,3,4,5]) as balance -- Gini coef: 0.2666666666666667
    union all
    select '2018-01-02' as date, balance
    from unnest([3,3,3,3]) as balance -- Gini coef: 0.0
    union all
    select '2018-01-03' as date, balance
    from unnest([4,5,1,8,6,45,67,1,4,11]) as balance -- Gini coef: 0.625
),
ranked_balances as (
    select date, balance, row_number() over (partition by date order by balance desc) as rank
    from balances
)
select date, 
    -- (1 − 2B) https://en.wikipedia.org/wiki/Gini_coefficient
    1 - 2 * sum((balance * (rank - 1) + balance / 2)) / count(*) / sum(balance) AS gini
from ranked_balances
group by date
having sum(balance) > 0
order by date asc
-- verify here http://shlegeris.com/gini

使用SQL计算威尔逊下界

create temporary macro wilson_score (m bigint, n bigint)
(1.0*m/n + 1/(2*n)*pow(1.96, 2) - 1.96*sqrt( 1.0*m/n*(1-1.0*m/n)/n + pow(1.96, 2)/(4*n*n))) / (1 + pow(1.96, 2)/n);

使用SQL计算auc

---- auc
---- https://vimsky.com/article/3879.html
select (ry - 0.5*ny*(ny+1))/(nx*ny) as auc 
from(
    select sum(if(pay_flag=1, rank, 0)) as ry,
           sum(pay_flag) as ny,
           sum(1-pay_flag) as nx 
    from(
        -- rank means order of features
        -- pay_flag means y label
    ) tb
) tab;


----- 使用第三方UDF函数
set hive.strict.checks.large.query=false; 
set hive.mapred.mode=nonstrict;

with data as (
  select 0.5 as prob, 0 as label
  union all
  select 0.3 as prob, 1 as label
  union all
  select 0.2 as prob, 0 as label
  union all
  select 0.8 as prob, 1 as label
  union all
  select 0.7 as prob, 1 as label
)
select
  hivemall.auc(prob, label) as auc
from (
  select prob, label
  from data
  ORDER BY prob DESC
) t;