日期函数
- 截取字符串:substr()
- 日期函数:
获取当前日期:current_date
获取当前日期时间:current_timestamp
获取当前时间戳:unix_timestamp()
将当前的时间转换为想要的格式:from_unixtime()
json函数解析
以电影热度rating.json为例;
//hive中先创建一张表,将一行的json看做一个字段
create table t_rate_json(line string) row format delimited;
//添加数据
load data local inpath '/home/rating.json' into table t_rate_json;
//创建一张表,用来解析数据
create table t_rate(movie string,rate int,ts string,uid string) row format delimited fields terminated by '\001';
//解析json函数用get_json_object函数
insert into table t_rate
> select get_json_object(line,'$.movie'),
> get_json_object(line,'$.rate'),
> get_json_object(line,'$.timeStamp'),
> get_json_object(line,'$.uid')
> from t_rate_json;
//查看前十条
select * from t_rate limit 10;
//统计评分大于3的所有评分记录
select * from t_rate where rate > 3;
//统计每个人评分的总分数
select uid,sum(rate) from t_rate group by uid; //group by 按...分组
//统计每个人评分的总分数倒序排,前10个;
select uid,sum(rate) rate_sum from t_rate group by uid order by rate_sum desc limit 10; //order by:排序 desc:jian
另一种解析:
select
> json_tuple(line,"movie","rate","timeStamp","uid")
> as(movie,rate,ts,uid)
> from t_rate_json
> limit 10;
网址解析 parse_url_tuple()
select parse_url_tuple("http://www.baidu.com/findcookieid=4234234234",'HOST','PATH', 'QUERY','QUERY:cookieid')
from dual;
explode和lateral view
- explode:可以将表中一个数组变成列
vi student.txt
创建表:
create table t_xuanxiu(uid string,name string,kc array<string>) //kc为一个数组
> row format delimited
> fields terminated by ','
> collection items terminated by ':';
加载数据:load data local inpath '/home/student.txt' into table t_xuanxiu;
用explode查询课程:select explode(kc) from t_xuanxiu where uid=1;
select uid,name,explode(kc) from t_xuanxiu where uid=1;
uid和拆分的array元素是对应的.我们应该如何进行连接呢?我们知道直接select id,explode()是不行的,会报如下错误:
这是需要用到later view。
- lateral view :表生成函数
lateral view为侧视图,意义是为了配合UDTF来使用,把某一行数据拆分成多行数据.不加lateral view的UDTF只能提取单个字段拆分,并不能塞会原来数据表中.加上lateral view就可以将拆分的单个字段数据与原始表数据关联上。
在使用lateral view的时候需要指定视图别名和生成的新列别名。
select uid,name,tmp.course from t_xuanxiu
> lateral view explode(kc) tmp as course;
//lateral view 将 explode(kc) 看成一个表是 tmp 就一个字段as course;
https://blog.csdn.net/weixin_39043567/article/details/90636497
例:利用hive做wordcount.
vi words.txt
建表:
create table wordcount(line string)
> row format delimited;
加载数据:
load data local inpath '/home/words.txt' into table wordcount;
select a.word,count(1) sum //获取单词并计数为1,并求和
> from
> (select tmp.* from wordcount lateral view explode(split(line,' ')) tmp as word) a
//子查询:将打散的结果看成一个表并重命名为a
> group by a.word //以单词分组
> order by sum desc; //根据单词的数量倒序排序
rownumber()和over()
测试:求每个人前两高的分数:
vi score.txt
建表:
create table score(name string,kcid string,score int)
> row format delimited
> fields terminated by ',';
加载数据:
load data local inpath '/home/score.txt' into table score;
select *,row_number() over(partition by name order by score desc) from score;
//添加行号,以名字分组,以分数倒序排序
注意:在使用 row_number() over()函数时候,over()里头的分组以及排序的执行晚于 where 、group by、 order by 的执行。