hive中hql语法总结

最新推荐文章于 2023-05-03 13:04:17 发布

kane0409

最新推荐文章于 2023-05-03 13:04:17 发布

阅读量1.4k

点赞数

分类专栏： hive 文章标签： hive

本文链接：https://blog.csdn.net/kane0409/article/details/89520361

版权

hive 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

1.建库语句：
   CREATE DATABASE [IF NOT EXISTS] database_name
   [COMMENT database_comment]
   [LOCATION hdfs_path]
   [WITH DBPROPERTIES (property_name=property_value, ...)];
       例：
           create DATABASE if NOT EXISTS hive_db2
           comment "my first database"
           location "/hive_db2"
2.库的修改：
   alter database hive_db2 set DBPROPERTIES ("createtime"="2018-12-19");
3.库的删除
   drop database db_hive cascade if exists; -- 删除存在表的数据库
3.建表语句：
   CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
   [(col_name data_type [COMMENT col_comment], ...)]
   [COMMENT table_comment]
   [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
   [CLUSTERED BY (col_name, col_name, ...)
   [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
   [ROW FORMAT row_format]
   DELIMITED [FIELDS TERMINATED BY char] [COLLECTION ITEMS TERMINATED BY char]
           [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
   | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
   [STORED AS file_format] Textfile
   [LOCATION hdfs_path]
   [TBLPROPERTIES (property_name=property_value, ...)]
   [AS select_statement]
       例1：
           create table student2(id int COMMENT "xuehao", name string COMMENT "mingzi")
           COMMENT "xueshengbiao"
           ROW format delimited
           fields terminated by '\t'
           STORED as Textfile
           location '/student2' -- 直接加载该目录下的数据文件到表中
           TBLPROPERTIES ("createtime"="2018-12-18");
       例2：
           create table student(id int, name string)
           row format delimited
           fields terminated by '\t';
           load data local inpath '/opt/module/datas/student.txt' into table student;
       例3：
           create table student4 like student2; -- 仅复制表结构
4.导入数据语句
   4.1 不加local则导入hdfs上文件，但会剪贴原文件，local本地仅粘贴
           load data [local] inpath '/opt/module/datas/student.txt'
           [overwrite] into table student [partition (partcol1=val1,…)];
   4.2 创建表并导入数据（依据以存在的表）
           create table student6 as select * from student; -- 仅导入数据，不会导入其他细节属性 --被创建表不能是外部表 -- 被创建表不支持分区及分桶
   4.3 覆盖插入
           insert overwrite table student3 select * from student;
   4.4 插入带分区的表
           insert into table stu_par partition(month = '08') select id ,name from stu_par where month = '09';
   4.5 将单表中数据导入多表
           from student
           insert into table student4 select *
           insert into table student5 select *;
   4.6 多分区导入单表
           from stu_par
           insert into table stu_par partition(month = '06')
           select id ,name where month = '08'
           insert into table stu_par partition(month = '07')
           select id,name where month = '10';
5.表的修改操作
   5.1 修改表的属性
           alter table stu_ex set TBLPROPERTIES ('EXTERNAL' = 'TRUE');
   5.2 重命名表名
           alter table student4 rename to student3;
   5.3 修改表的serde属性（序列化和反序列化）
       alter table table_name set serdepropertyes('field.delim'='\t');
6.列的更新操作
   6.1 修改列语法
       ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name]
   6.2 增加或替换列语法
       ALTER TABLE table_name ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
       例1：
           增加列：alter table student2 add COLUMNS (score double);
       例2：
           修改列：alter table student2 CHANGE COLUMN score score int AFTER id;
       例3：
           替换列（全部替换）：alter table student2 replace COLUMNS (id int, name string);
7.带有分区的表
   7.0 查看分区
           show partitions table_name;
   7.1 创建单个分区
           create table stu_par(id int, name string)
           partitioned by (month string)
           ROW format delimited
           FIELDS terminated by '\t';
       -- 错误示例
           create table stu_par2(id int, name string)
           partitioned by (id int)
           ROW format delimited
           FIELDS terminated by '\t'; 错！！！！（不能以数据库字段作为分区）
       -- 加载数据到指定分区(分区不存在则自动创建)
           load data local inpath '/opt/module/datas/student.txt' into table stu_par partition(month = '12');
           load data local inpath '/opt/module/datas/student.txt' into table stu_par partition(month = '11');

       -- 合并分区查询结果
           select * from stu_par where month = '11'
           union
           select * from stu_par where month = '12';
   7.2 增加多个分区
       alter table stu_par add partition (month = '08') partition(month='07');
   7.3 删除多个分区
       alter table stu_par drop partition(month='08'),partition(month='09');
   7.4 创建多级分区
       create table stu_par2(id int, name string)
       partitioned by (month string, day string)
       row format delimited
       FIELDS terminated by '\t';
   7.5 导入数据到多级分区
       load data local inpath '/opt/module/datas/student.txt' into table stu_par2
       partition (month='12',day='19');
   7.6 向多级分区增加分区
       alter table stu_par2 add partition(month = '12', day = '17');
   7.7 查询多级分区中的数据
       select * from stu_par2 where day = '18';
   7.8 修复分区（也可以使用添加分区的语句）
       msck repair table dept_partition2;
8.创建外部表（删除表不会删除表中数据，仅删除表的元数据）
   create external table stu_ex2(id int, name string)
   ROW format delimited
   FIELDS terminated by '\t'
   location '/student';
   8.1 外部表与内部表的转换
       alter table stu_ex set TBLPROPERTIES ('EXTERNAL' = 'TRUE');
9.数据的导出
   9.1 导出同时格式化（不加local则导出到hdfs）
       insert overwrite local directory '/opt/module/datas/student'
       row format delimited
       fields terminated by '\t'
       select * from student;
   9.2 hadoop命令导出到本地
       dfs -get /user/hive/warehouse/student/month=201709/000000_0 /opt/module/datas/export/student3.txt;
   9.3 shell命令导出
       hive -f/-e 执行语句或者脚本 > file   -- -f跟文件，-e跟执行语句
   9.4 export仅可以导出到hdfs，常用于hdfs集群hive表迁徙
       export table default.student to '/user/hive/warehouse/export/student'; -- 同时会导出表的元数据
10.数据的导入(仅能导入export导出的数据，因为需要获取表的元数据)
   import table table_name from 'export导出数据的路径';
11.清除表中数据
   truncate table student; -- 只能删除管理表，不能删除外部表中数据
12.Like、RLike：RLike可以使用java的正则表达式
13.group by及having的使用 -- hive中对于使用group by后查询字段仅限group by的字段及聚合函数
   select deptno, avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;
14.mapreduce的join
   14.1 mapreduce中的reducejoin特点：在mapper阶段进行数据关联标记，在reducer阶段进行数据聚合
   14.2 mapreduce中的mapjoin特点：将小表加载到内存中，在mapper阶段根据内存中的数据对大表进行数据处理，没有reduce阶段
15.HQL的join
   15.1 仅支持等值连接不支持非等值连接
       例：不支持select * from A left join B on A.id != B.id;
   15.2 不支持在on条件中使用‘or’
   15.3 每个join都会启动一个mapreduce任务，但hive默认开启mapreduce优化
       关闭mapreduce优化：set hive.auto.convert.join=false;
16.order by
   会进行全局排序，则reduce数量被看作1个，效率低下
17.sort by -- 局部排序
   对于每个mapreduce各分区进行局部排序，分区中的数据随机给定
18.distribute by
   18.1 即mapreduce中自定义分区操作，hql书写规则：先分区后排序
   18.2 distribute by的分区规则是根据分区字段的hash码与reduce的个数进行模除后，余数相同的分到一个区。
19.cluster by
   当distribute和sort字段相同时可用cluster进行替代，默认正序，但不支持desc倒序
20.分桶 -- 分桶表的数据需要通过子查询的方式导入
   20.1 开启分桶的设置
       set hive.enforce.bucketing=true;
   20.2 分桶表的创建
       create table stu_buck(id int, name string)
       clustered by(id)
       into 4 buckets
       row format delimited fields terminated by '\t';
   20.3 分桶的规则
       用分桶字段的hash值与桶的个数求余，来决定数据存放在那个桶，
   20.4 分桶与分区区别
       a. 分桶结果在表的目录下存在多个分桶文件
       b. 分区是将数据存放在表所在目录不同文件路径下
       c. 分区针对是数据存储路径，分桶针对的是数据文件，分桶可以在分区的基础粒度细化
21.分桶的抽样
   21.1 抽样语法 -- 必须 x<=y
       select * from table_name tablesample(bucket x out of y on bucketKey); -- on bucketKey可不写
   21.2 抽样规则
       a. y用来决定抽样比例，必须为bucket数的倍数或者因子，
           例：bucket数为4时，当y=2时，4/2=2,则抽取两个桶的数据，具体抽取哪个桶由x决定
       b. x用来决定抽取哪个桶中的数据
           例1：当bucket=4, y=4, x=2时，则需要抽取的数据量为bucket/y=1个桶，抽取第x桶的数据
           例2：当bucket=4, y=2, x=2时，则需要抽取的数据量为bucket/y=2个桶，抽取第x桶和第x+y桶的数据
           例3：当bucket=12, y=3, x=2时，抽bucket/y=4个桶，抽取第x桶和第x+2y桶的数据
22.NVL函数
   NVL(column_name, default_cvalue),如果该行字段值为null,则返回default_value的值
23.CONCAT_WS()函数
   使用规则：concat_ws(separator, [string | array(string)]+)
   例：select concat_ws('_', 'www', array('achong','com')) 拼接结果：www_achong_com
24.COLLECT_SET(col)函数
   使用规则：仅接受基本数据类型，将字段去重汇总，并返回array类型
   例（行转列）：表结构
           name   xingzuo    blood
           孙悟空   白羊座   A
           大海   射手座   A
           宋宋   白羊座   B
           猪八戒   白羊座   A
           凤姐   射手座   A
       需求：把星座和血型一样的人归类到一起
           射手座,A 大海|凤姐
           白羊座,A 孙悟空|猪八戒
           白羊座,B 宋宋
       查询语句：
           SELECT CONCAT_WS(',', xingzuo, blood), CONCAT_WS('|', COLLECT_SET(NAME))
           FROM xingzuo
           GROUP BY xingzuo, blood
25.EXPLODE(爆炸函数)及LATERAL_VIEW)(侧写函数)
   25.1 explode:将列中的array或者map结构拆分成多行 -- 一般需结合lateral_view使用
   25.2 lateral_view: LATERAL VIEW udtf(expression) 表别名 AS 列别名
   例（行转列）
       select movie, category_name
       from movie_info
       lateral view explode(category) table_tmp as category_name;
26.开窗函数 -- 常结合聚合函数使用，解决即需要聚合前的数据又需要聚合后的数据展示
   26.1 语法：UDAF() over (PARTITION By col1，col2 order by col3 窗口子句（rows between .. and ..）) AS 列别名
           （partition by .. order by）可替换为（distribute by .. sort by ..）
   26.2 over(): 指定分析数据窗口大小
   26.3 窗口子句 -- 先分区在排序然后接rows限定执行窗口
       26.3.01 n PRECEDING：往前n行数据
       26.3.02 n FOLLOWING：往后n行数据
           例：select name, orderdate, cost, sum(cost) over(
                                                               partition by name
                                                               order by orderdate
                                                               rows between 1 PRECEDING and 1 FOLLOWING
                                                           ) from business;
       26.3.03 CURRENT ROW：当前行
       26.3.04 UNBOUNDED PRECEDING 表示从前面的起点
       26.3.05 UNBOUNDED FOLLOWING表示到后面的终点
           例：select name, orderdate, cost, sum(cost) over(
                                                               partition by name
                                                               order by orderdate
                                                               rows between CURRENT ROW and UNBOUNDED FOLLOWING
                                                           ) from business;
27.LAG(col,n,default_val)：往前第n行数据
28.LEAD(col,n, default_val)：往后第n行数据
   例：select name, orderdate, cost, lag(orderdate, 1, 'null') over(partition by name order by orderdate)
       from business; -- 即获取前1行的orderDate数据
29.ntile(n):把有序分区中的行分为n组，每组编号从1开始 -- 分组规则详见:ntile的分组规则.sql
   例：select name,orderdate,cost, ntile(5) over(order by orderdate) num from business
30.Rank函数
   rank() 出现相同排序时，总数不变
   dense_rank() 出现相同排序时，总数减少
   row_number() 不会出现相同排序

sql执行顺序
   from... where...group by... having.... select ... order by...

hql执行顺序
   from … where … group by … having … select … order by … 或
   from … on … join … where … group by … having … select … distinct … order by … limit
   存在开窗函数时，起码在order by之前执行

例题1：-- 集合类型数据导入
   {
       "name": "songsong",
       "friends": ["bingbing" , "lili"] , //列表Array,
       "children": { //键值Map,
           "xiao song": 18 ,
           "xiaoxiao song": 19
       }
       "address": { //结构Struct,
           "street": "hui long guan" ,
           "city": "beijing"
       }
   }

   基于上述数据结构，我们在Hive里创建对应的表，并导入数据。
   1.1 格式化数据为：
       songsong,bingbing_lili,xiao song:18_xiaoxiao song:19,hui long guan_beijing
       yangyang,caicai_susu,xiao yang:18_xiaoxiao yang:19,chao yang_beijing
   1.2 建表语句：
       create table test(name string,
       friends array<string>,
       children map<string, int>,
       address struct<street:string, city:string>)
       row format delimited
       fields terminated by ','
       collection items terminated by '_'
       map keys terminated by ':';
   1.3 数据写入语句
       load data local inpath '/opt/module/datas/test.txt' into table test;
   1.4 查询语句
       select friends[0] friend,children['xiao song'] age,address.city from test where name = 'songsong';