Hive SQL 代码snippet
with balances as (
select '2018-01-01' as date, balance
from unnest([1,2,3,4,5]) as balance -- Gini coef: 0.2666666666666667
union all
select '2018-01-02' as date, balance
from unnest([3,3,3,3]) as balance -- Gini coef: 0.0
union all
select '2018-01-03' as date, balance
from unnest([4,5,1,8,6,45,67,1,4,11]) as balance -- Gini coef: 0.625
),
ranked_balances as (
select date, balance, row_number() over (partition by date order by balance desc) as rank
from balances
)
select date,
-- (1 − 2B) https://en.wikipedia.org/wiki/Gini_coefficient
1 - 2 * sum((balance * (rank - 1) + balance / 2)) / count(*) / sum(balance) AS gini
from ranked_balances
group by date
having sum(balance) > 0
order by date asc
-- verify here http://shlegeris.com/gini
create temporary macro wilson_score (m bigint, n bigint)
(1.0*m/n + 1/(2*n)*pow(1.96, 2) - 1.96*sqrt( 1.0*m/n*(1-1.0*m/n)/n + pow(1.96, 2)/(4*n*n))) / (1 + pow(1.96, 2)/n);
---- auc
---- https://vimsky.com/article/3879.html
select (ry - 0.5*ny*(ny+1))/(nx*ny) as auc
from(
select sum(if(pay_flag=1, rank, 0)) as ry,
sum(pay_flag) as ny,
sum(1-pay_flag) as nx
from(
-- rank means order of features
-- pay_flag means y label
) tb
) tab;
----- 使用第三方UDF函数
set hive.strict.checks.large.query=false;
set hive.mapred.mode=nonstrict;
with data as (
select 0.5 as prob, 0 as label
union all
select 0.3 as prob, 1 as label
union all
select 0.2 as prob, 0 as label
union all
select 0.8 as prob, 1 as label
union all
select 0.7 as prob, 1 as label
)
select
hivemall.auc(prob, label) as auc
from (
select prob, label
from data
ORDER BY prob DESC
) t;