hive函数

最新推荐文章于 2024-07-09 23:16:04 发布

别再摆烂了！

最新推荐文章于 2024-07-09 23:16:04 发布

阅读量874

点赞数 20

文章标签： hive hadoop 数据仓库

本文链接：https://blog.csdn.net/m0_63606851/article/details/135566525

版权

一.单行函数

特点是一进一出，即输入一行，输出一行

(1)算术运算函数

select sal + 1 from emp;

(2)数值函数

-- round：四舍五入
select round(3.3);   3

-- ceil：向上取整
select ceil(3.1) ;   4

-- floor：向下取整
select floor(4.8);  4

(3)字符串函数

1）substring：截取字符串

substring(string A, int start, int len) --字符串，起始位置，截取长度

select substring("atguigu",3,2); --gu

select substring("atguigu",-3);  --igu

2）replace ：替换

replace(string A, string B, string C)

select replace('atguigu', 'a', 'A')  --Atguigu

3) regexp_replace：正则替换

regexp_replace(string A, string B, string C)

说明：将字符串A中的符合java正则表达式B的部分替换为C。注意，在有些情况下要使用转义字符。

select regexp_replace("123-3221", "\\d+", "cb");

4）regexp：正则匹配

说明：若字符串符合正则表达式，则返回true，否则返回false。

select 'dfssss' regexp 'dfs+'; --true

select 'dfssss' regexp 'dfg+'; --false

5）repeat：重复字符串

select repeat('woaini', 5);

6）split ：字符串切割

说明：按照正则表达式pat匹配到的内容分割str，分割后的字符串，以数组的形式返回。

select split('a-b-c-d','-');
--["a","b","c","d"]

7）nvl ：替换null值

说明：若A的值不为null，则返回A，否则返回B。

select nvl(null,1);  --1

select nvl(520,1); --520

8）concat ：拼接字符串

concat(string A, string B, string C, ……)

说明：将A,B,C……等字符拼接为一个字符串

select concat('beijing','-','shanghai','-','shenzhen');
-- bejing-sahnghai-shenzhen

或者使用||

select 'bj' || 'sh' || 'sz';

9）concat_ws：以指定分隔符拼接字符串或者字符串数组

select concat_ws('_','bj','hht','sd');
select concat_ws('_',`array`('bj','tj','cd'));

10）get_json_object：解析json字符串

select get_json_object('[{"name":"大海海","sex":"男","age":"25"},{"name":"小宋宋","sex":"男","age":"47"}]','$.[0]');
-- 使用$[i]获取数组中的对象

select get_json_object('[{"name":"大海海","sex":"男","age":"25"},{"name":"小宋宋","sex":"男","age":"47"}]','$.[0].sex');
-- 获取想要的元素

二.日期函数

三、流程控制函数

1）case when：条件判断函数类比成switch

-- 给score_info表中的分数评级
select sc.stu_id,
    case
        when sc.score >=90 then 'A'
        when sc.score >=80 and sc.score < 90 then 'B'
        else 'C'
        end
from score_info sc;

2）if: 条件判断，类似于Java中三元运算符

select if(1 > 0, 't', 'f');

四、集合函数

(1).array

select array('1','2','3','4');
--[1,2,3,4]

select array_contains(array('a','b','c','d'),'a');
-- ture

select sort_array(array('a','d','c'));
-- [a,c,d]

select size(array(1,2,3,4))
-- 4

(2).map

select map('xiaohai',1,'dahai',2); 
-- {"xiaohai":1,"dahai":2}


select map_keys(map('xiaohai',1,'dahai',2));
-- ["xiaohai","dahai"]


select map_values(map('xiaohai',1,'dahai',2));
-- [1,2]

(3).struct

select struct('name','age','weight');
-- {"col1":"name","col2":"age","col3":"weight"}


select named_struct('name','xiaosong','age',18,'weight',80);
-- {"name":"xiaosong","age":18,"weight":80}

案例

create  table  employee(
    name string,  --姓名
    sex  string,  --性别
    birthday string, --出生年月
    hiredate string, --入职日期
    job string,   --岗位
    salary double, --薪资
    bonus double,  --奖金
    friends array<string>, --朋友
    children map<string,int> --孩子
);

insert into employee
  values('张无忌','男','1980/02/12','2022/08/09','销售',3000,12000,array('阿朱','小昭'),map('张小无',8,'张小忌',9)),
        ('赵敏','女','1982/05/18','2022/09/10','行政',9000,2000,array('阿三','阿四'),map('赵小敏',8)),
        ('宋青书','男','1981/03/15','2022/04/09','研发',18000,1000,array('王五','赵六'),map('宋小青',7,'宋小书',5)),
        ('周芷若','女','1981/03/17','2022/04/10','研发',18000,1000,array('王五','赵六'),map('宋小青',7,'宋小书',5)),
        ('郭靖','男','1985/03/11','2022/07/19','销售',2000,13000,array('南帝','北丐'),map('郭芙',5,'郭襄',4)),
        ('黄蓉','女','1982/12/13','2022/06/11','行政',12000,null,array('东邪','西毒'),map('郭芙',5,'郭襄',4)),
        ('杨过','男','1988/01/30','2022/08/13','前台',5000,null,array('郭靖','黄蓉'),map('杨小过',2)),
        ('小龙女','女','1985/02/12','2022/09/24','前台',6000,null,array('张三','李四'),map('杨小过',2))

--统计每个月的入职人数
select month(replace(hiredate,'/','-')) month,
       count(e.hiredate) count
from employee e
group by month(replace(hiredate,'/','-'));

--查询每个人的年龄（年 + 月）
select e.name name,
       concat(year(`current_date`()) - year(replace(e.birthday,'/','-')),'.',
              `if`(month(`current_date`()) - month(replace(e.birthday,'/','-')) >=0,
                  month(`current_date`()) - month(replace(e.birthday,'/','-')),
                  month(replace(e.birthday,'/','-')) - month(`current_date`())))
from employee e ;


-- 按照薪资，奖金的和进行倒序排序，如果奖金为null，置位0
select e.name name,
       (`if`(e.bonus is null,0,e.bonus) + e.salary) total
from employee e
order by total desc ;
-- 错误的
select e.name name,
       sum(`if`(e.bonus is null,0,e.bonus),e.salary) total -- sum只是求一个标签的和的，不能这么用
from employee e
order by total desc ;


--查询每个人有多少个朋友
select e.name name,
       size(e.friends) num_fri
from employee e ;

--查询每个人的孩子的姓名
select e.name name,
       map_keys(e.children) chi_name
from employee e ;


--查询每个岗位男女各多少人
select e.job job,
       sum(`if`(e.sex = '男',1,0)) male,   --不能用count
       sum(`if`(e.sex = '男',1,0)) female
from employee e
group by e.job;


--每个月的入职人数以及姓名
select month(replace(e.hiredate,'/','-')) month,
       count(e.name) cnt,
        Collect_list(name) as name_list
from employee e
group by month(replace(e.hiredate,'/','-'));

五、高级聚合函数

collect_list 收集并形成list集合，结果不去重

select collect_list(job)
from employee e
;

--["销售","行政","研发","研发","销售","行政","前台","前台"]

collect_set 收集并形成set集合，结果去重

select collect_set(job)
from employee e;
--["销售","行政","研发","前台"]

六、炸裂函数（UDTF）

create table movie_info(
    movie string,     --电影名称
    category string   --电影分类
) 
row format delimited fields terminated by "\t";

insert overwrite table movie_info
values ("《疑犯追踪》", "悬疑,动作,科幻,剧情"),
       ("《Lie to me》", "悬疑,警匪,动作,心理,剧情"),
       ("《战狼2》", "战争,动作,灾难");


select
    cate,
    count(*)
from
(
    select
        movie,
        cate
    from
    (
        select
            movie,
            split(category,',') cates
        from movie_info
    )t1 lateral view explode(cates) tmp as cate
)t2
group by cate;

七、窗口函数

1)基于行的窗口：

必须先排序

2)基于值的窗口

3)基于分区

4)缺省

4)常用的窗口函数

(1）聚合函数

max：最大值。

min：最小值。

sum：求和。

avg：平均值。

count：计数。

(2）跨行取值函数

（1）lead和lag

把这个要看懂

这个里边窗口不允许自定义

(2)first_value和last_value

这个窗口允许自定义

(3)排名函数

不支持自定义窗口

案例演示：

create table order_info
(
    order_id     string, --订单id
    user_id      string, -- 用户id
    user_name    string, -- 用户姓名
    order_date   string, -- 下单日期
    order_amount int     -- 订单金额
);
insert overwrite table order_info
values ('1', '1001', '小元', '2022-01-01', '10'),
       ('2', '1002', '小海', '2022-01-02', '15'),
       ('3', '1001', '小元', '2022-02-03', '23'),
       ('4', '1002', '小海', '2022-01-04', '29'),
       ('5', '1001', '小元', '2022-01-05', '46'),
       ('6', '1001', '小元', '2022-04-06', '42'),
       ('7', '1002', '小海', '2022-01-07', '50'),
       ('8', '1001', '小元', '2022-01-08', '50'),
       ('9', '1003', '小辉', '2022-04-08', '62'),
       ('10', '1003', '小辉', '2022-04-09', '62'),
       ('11', '1004', '小猛', '2022-05-10', '12'),
       ('12', '1003', '小辉', '2022-04-11', '75'),
       ('13', '1004', '小猛', '2022-06-12', '80'),
       ('14', '1003', '小辉', '2022-04-13', '94');

-- 统计每个用户截至每次下单的累积下单总额
select *,
       sum(order_amount) over(partition by user_id order by order_date rows between unbounded preceding and current row ) total
from order_info od;


-- 统计每个用户截至每次下单的当月累积下单总额
select *,
       sum(order_amount) over(partition by user_id,substr(order_date,1,7) order by order_date rows between unbounded preceding and current row ) total
from order_info od;

--统计每个用户每次下单距离上次下单相隔的天数（首次下单按0天算）
select order_id, user_id, user_name, order_date, order_amount, nvl(datediff(order_date,last_date),0) diff
from
(select *,
       lag(order_date,1,0) over (partition by user_id order by order_date) last_date
from order_info od) t1;


-- 查询所有下单记录以及每个用户的每个下单记录所在月份的首/末次下单日期
select *,
       first_value(order_date,false) over (partition by user_id,substr(order_date,1,7) order by order_date) first_date,
       last_value(order_date,false) over (partition by user_id,substr(order_date,1,7) order by order_date
           rows between unbounded preceding and unbounded following) last_date --必须加between and
from order_info od;

-- 为每个用户的所有下单记录按照订单金额进行排名
select
    order_id,
    user_id,
    user_name,
    order_date,
    order_amount,
    rank() over(partition by user_id order by order_amount desc) rk,
    dense_rank() over(partition by user_id order by order_amount desc) drk,
    row_number() over(partition by user_id order by order_amount desc) rn
from order_info;

八、自定义函数

1）创建一个Maven工程Hive

2）导入依赖

<groupId>org.apache.hive</groupId>

</dependency>

</dependencies>

3）创建一个类

package com.atguigu.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;

import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;

import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;

import org.apache.hadoop.hive.ql.metadata.HiveException;

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**

* 我们需计算一个要给定基本数据类型的长度

public class MyUDF extends GenericUDF {

/**

* 判断传进来的参数的类型和长度

* 约定返回的数据类型

@Override

public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {

if (arguments.length !=1) {

throw new UDFArgumentLengthException("please give me only one arg");

}

if (!arguments[0].getCategory().equals(ObjectInspector.Category.PRIMITIVE)){

throw new UDFArgumentTypeException(1, "i need primitive type arg");

}

return PrimitiveObjectInspectorFactory.javaIntObjectInspector;

}

/**

* 解决具体逻辑的

@Override

public Object evaluate(DeferredObject[] arguments) throws HiveException {

Object o = arguments[0].get();

if(o==null){

return 0;

}

return o.toString().length();

}

@Override

// 用于获取解释的字符串

public String getDisplayString(String[] children) {

return "";

}

4）创建临时函数

（1）打成jar包上传到服务器/opt/module/hive/datas/myudf.jar

（2）将jar包添加到hive的classpath，临时生效

hive (default)> add jar /opt/module/hive/datas/myudf.jar;

（3）创建临时函数与开发好的java class关联

hive (default)>

create temporary function my_len

as "com.atguigu.hive.udf.MyUDF";

（4）即可在hql中使用自定义的临时函数

hive (default)>

select

ename,

my_len(ename) ename_len

from emp;

（5）删除临时函数

hive (default)> drop temporary function my_len;

别再摆烂了！

关注

20
点赞
踩
23

收藏

觉得还不错? 一键收藏
1
评论
hive函数

1创建一个Maven工程Hive2）导入依赖3创建一个类/*** 我们需计算一个要给定基本数据类型的长度*/GenericUDF/*** 判断传进来的参数的类型和长度* 约定返回的数据类型*/@Override=1) {if (!/*** 解决具体逻辑的*/@Overridereturn 0;@Override// 用于获取解释的字符串return "";4创建临时函数。
复制链接

扫一扫