CREATETABLE learn3.wordcount(
word STRING COMMENT"单词")ROW FORMAT DELIMITED FIELDSTERMINATEDBY'\t'
STORED AS TEXTFILE;INSERTINTOTABLE learn3.wordcount (word)VALUES("hello,word"),("hello,java"),("hive,hello");-- 统计WORDCOUNTSELECT
T1.col
,sum(num)FROM(SELECT
col2 as col,count(*)as num
FROM(SELECT
split(word,",")[0]as col1,
split(word,",")[1]as col2
FROM learn3.wordcount
)T GROUPBY T.col2
UNIONALLSELECT
col1 as col,count(*)as num
FROM(SELECT
split(word,",")[0]as col1,
split(word,",")[1]as col2
FROM learn3.wordcount
)T GROUPBY T.col1
)T1 GROUPBY T1.col
+---------+------+| t1.col | _c1 |+---------+------+| hello |6|| hive |2|| java |2|| word |2|+---------+------+
三、WITH AS 用法
格式:
WITH table1 AS(SELECT 查询语句1),table2 AS(SELECT 查询语句2)[INSERTINTOTABLE]SELECTFROM
WORDCOUNT WITHAS 用法
WITH split_res AS(SELECT
split(word,",")[0]as col1,
split(word,",")[1]as col2
FROM learn3.wordcount
),col_count AS(SELECT
col1 as col
,count(*)as num
FROM split_res
GROUPBY col1
),col2_count AS(SELECT
col2 as col
,count(*)as num
FROM split_res
GROUPBY col2
)SELECT
T.col,sum(T.num)FROM(SELECT*FROM col_count
UNIONALLSELECT*FROM col2_count
)T GROUPBY T.col
五、集合函数
COLLECT_LIST(column2)
需要跟GROUPBY column1 配合使用 将column1中相同的组column2数据放至一个集合
COLLECT_SET()
需要跟GROUPBY column1 配合使用 将column1中相同的组column2数据放至一个集合中 并对集合中的数据进行去重操作
需求:
将一列中相同的内容放至一组数据中
将 word列中所有相同的单词对应的num 放至一个数组中
CREATETABLE learn3.wordcount2(
word STRING COMMENT"单词",
num intCOMMENT"数量")
STORED AS TEXTFILE;INSERTINTOTABLE learn3.wordcount2 (word,num)VALUES("hello",1),("hello",2),("hive",3);SELECT
word
,COLLECT_LIST(num)FROM learn3.wordcount2
GROUPBY word;+--------+------------+| word | _c1 |+--------+------------+| hello |[1,2,1,2]|| hive |[3,3]|+--------+------------+SELECT
word
,COLLECT_SET(num)FROM learn3.wordcount2
GROUPBY word;+--------+--------+| word | _c1 |+--------+--------+| hello |[1,2]|| hive |[3]|+--------+--------+CREATE EXTERNAL TABLEIFNOTEXISTS learn3.student20(
id STRING COMMENT"学生ID",name STRING COMMENT"学生姓名",age intCOMMENT"年龄",gender STRING COMMENT"性别",subject STRING COMMENT"学科")ROW FORMAT DELIMITED FIELDSTERMINATEDBY","
STORED AS TEXTFILE;loaddatalocal inpath "/usr/local/soft/hive-3.1.2/data/student_20.txt"INTOTABLE learn3.student20;
将两列数据进行拼接
WITH concat_stu AS(SELECT
CONCAT(subject,"|",gender)as subject_gender
,CONCAT_WS("|",id,name)as id_name
FROM learn3.student20
)SELECT
subject_gender
,CONCAT_WS(",",COLLECT_LIST(id_name))FROM concat_stu
GROUPBY subject_gender;+-----------------+----------------------------------------------------+| subject_gender | _c1 |+-----------------+----------------------------------------------------+| 文科|女 |1500100001|施笑槐,1500100007|尚孤风,1500100016|潘访烟,1500100018|骆怜雪 || 文科|男 |1500100002|吕金鹏,1500100013|逯君昊 || 理科|女 |1500100003|单乐蕊,1500100005|宣谷芹,1500100008|符半双,1500100012|梁易槐,1500100015|宦怀绿,1500100017|高芷天 || 理科|男 |1500100004|葛德曜,1500100006|边昂雄,1500100009|沈德昌,1500100010|羿彦昌,1500100011|宰运华,1500100014|羿旭炎,1500100019|娄曦之,1500100020|杭振凯 |+-----------------+----------------------------------------------------+
CREATE EXTERNAL TABLEIFNOTEXISTS learn3.student1(
id STRING COMMENT"学生ID",
name STRING COMMENT"学生姓名",
age intCOMMENT"年龄",
gender STRING COMMENT"性别",
clazz STRING COMMENT"班级")ROW FORMAT DELIMITED FIELDSTERMINATEDBY",";loaddatalocal inpath "/usr/local/soft/hive-3.1.2/data/students.txt"INTOTABLE learn3.student1;CREATE EXTERNAL TABLEIFNOTEXISTS learn3.score1(
id STRING COMMENT"学生ID",
subject_id STRING COMMENT"科目ID",
score intCOMMENT"成绩")ROW FORMAT DELIMITED FIELDSTERMINATEDBY","loaddatalocal inpath "/usr/local/soft/hive-3.1.2/data/score.txt"INTOTABLE learn3.score1;
需求:
1.统计各性别年龄前三
-- HIVE 原先老版本 不支持这种写法 SELECT
s1.*FROM learn3.student1 as s1
WHERE3>(SELECTcount(*)FROM learn3.student1 as s2
WHERE s1.gender = s2.gender and s1.age < s2.age
)2.统计各班级学生总成绩前三名
①算出学生总成绩
WITH score_sum AS(SELECT
id
,sum(score)as total_score
FROM learn3.score1
GROUPBY id
), studen_score AS(SELECT
T1.name
,T2.total_score
,T1.clazz
FROM learn3.student1 T1
JOIN score_sum T2 ON T1.id = T2.id
)--② 排序获取前三名SELECT
TT.*FROM(SELECT
T.name
,T.total_score
,T.clazz
, ROW_NUMBER()OVER(PARTITIONBY T.clazz ORDERBY T.total_score DESC)as row_pm
FROM studen_score T
) TT
WHERE TT.row_pm <=3SELECT
T.name
,T.total_score
,T.clazz
, ROW_NUMBER()OVER(PARTITIONBY T.clazz ORDERBY T.total_score DESC)as row_pm
, dense_rank()OVER(PARTITIONBY T.clazz ORDERBY T.total_score DESC)as dense_pm
, rank()OVER(PARTITIONBY T.clazz ORDERBY T.total_score DESC)as rank_pm
, percent_rank()OVER(PARTITIONBY T.clazz ORDERBY T.total_score DESC)as percent_rank_pm
, cume_dist()OVER(PARTITIONBY T.clazz ORDERBY T.total_score DESC)as cume_dist_pm
FROM studen_score T
2、取每个班级总分最大的同学
##### row_number:无并列排名* 用法: select xxxx, row_number()over(partitionby 分组字段 orderby 排序字段 desc)as rn from tb groupby xxxx
##### dense_rank:有并列排名,并且依次递增* 用法: select xxxx, dense_rank()over(partitionby 分组字段 orderby 排序字段 desc)as rn from tb groupby xxxx
##### rank:有并列排名,不依次递增* 用法: select xxxx, rank()over(partitionby 分区字段 orderby 排序字段 desc)as rn from tb groupby xxxx
##### percent_rank:(rank的结果-1)/(分区内数据的个数-1)* 用法: select xxxx, percent_rank()over(partitionby 分组字段 orderby 排序字段 desc)as rn from tb groupby xxxx
需求1:
取每个班级总分最大的同学
CREATETABLE learn3.student_score(
name STRING COMMENT"",
total_score intCOMMENT"",
clazz STRING COMMENT"");WITH score_sum AS(SELECT
id
,sum(score)as total_score
FROM learn3.score1
GROUPBY id
)INSERTINTOTABLE learn3.student_score
SELECT
T1.name
,T2.total_score
,T1.clazz
FROM learn3.student1 T1
JOIN score_sum T2 ON T1.id = T2.id
-- 通过max方法取出每个班级分区中的学生成绩最大值 SELECT
TT.*FROM(SELECT
T1.*,max(T1.total_score)OVER(PARTITIONBY T1.clazz ORDERBY T1.total_score DESC)as max_score
FROM learn3.student_score T1
) TT WHERE TT.total_score = TT.max_score
-- 如果max() OVER() 中的窗口over()没有给定分区,那么当前的窗口表示整个学校,得到的数据是整个学校的最高的分数SELECT
TT.*FROM(SELECT
T1.*,max(T1.total_score)OVER()as max_score
FROM learn3.student_score T1
) TT WHERE TT.total_score = TT.max_score
max
用法:
① max(T1.total_score)OVER(PARTITIONBY T1.clazz ORDERBY T1.total_score DESC)
基于每个partition分区内数据取最大值
② max(T1.total_score)OVER()
基于整个数据集取最大值
min、avg、count、sum:与max方法使用一致
八、卡口流量需求分析
1、查询当月的设备及其总数
CREATETABLE learn3.veh_pass(
id STRING COMMENT"卡口编号",
pass_time STRING COMMENT"进过时间",
pass_num intCOMMENT"过车数")ROW FORMAT DELIMITED FIELDSTERMINATEDBY","
STORED AS TEXTFILE;loaddatalocal inpath "/usr/local/soft/hive-3.1.2/data/veh_pass.txt"INTOTABLE learn3.veh_pass;
需求1:查询当月的设备及其总数
-- 写法1SELECT
T.id
,count(*)OVER()FROM(SELECT
id
,pass_time
FROM learn3.veh_pass
WHERE substr(pass_time,1,7)= substr(current_date,1,7)) T GROUPBY T.id
-- 错误写法SELECTDISTINCT id
,count(*)OVER()FROM(SELECT
id
,pass_time
FROM learn3.veh_pass
WHERE substr(pass_time,1,7)= substr(current_date,1,7))T
-- 写法2:SELECT
T1.id
,count(*)OVER()FROM(SELECTDISTINCT id
FROM(SELECT
id
,pass_time
FROM learn3.veh_pass
WHERE substr(pass_time,1,7)= substr(current_date,1,7))T )T1
+---------------------+-----------------+| t1.id | count_window_0 |+---------------------+-----------------+|451000000000071117|5||451000000000071116|5||451000000000071115|5||451000000000071114|5||451000000000071113|5|+---------------------+-----------------++---------------------+| id |+---------------------+|451000000000071113||451000000000071114||451000000000071115||451000000000071116||451000000000071117|+---------------------+