Hive - Database Operations - Table Operations - Data Import/Write - Window Queries

Database Operations

Creating a Database

-- Create a database
create database python;

-- Create a database with a comment, a custom HDFS location, and properties
create database python_db comment 'python_database' location '/hive2' with dbproperties ('name' = 'python');

Viewing and Dropping Databases

-- View database details
desc database extended python;
-- Show the statement used to create the database
show create database python_db;

-- Drop a database
drop database python_db;
-- Force-drop a database that still contains tables
drop database python cascade;

Altering a Database

-- Alter a database
-- Modify database properties
alter database python set dbproperties ('age' = '18');
-- Change the database location to a path on HDFS
alter database python set location 'hdfs:///hive2';
-- Change the database owner
alter database python set owner user python;

Table Operations

Creating Tables


-- Basic table creation; the default field delimiter is \001, which vim shows as ^A (the SOH control character)
create table if not exists test_tb
(
    id int comment 'id number'
);

-- Create a table with an explicit field delimiter
create table if not exists test2_db
(
    id     int,
    name   string,
    age    int,
    gender string
) row format delimited fields terminated by ',';
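
As a quick sanity check, a file whose lines match the comma delimiter loads straight into the table; the path and rows below are illustrative.

-- Assuming /root/test2.txt contains lines such as:
--   1,zhangsan,18,male
--   2,lisi,20,female
load data local inpath '/root/test2.txt' into table test2_db;
select * from test2_db;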

Viewing and Dropping Tables

-- View a table
-- Show detailed information
desc extended test2_db;
-- Show detailed information in a formatted layout
desc formatted test2_db;
-- Show the CREATE TABLE statement
show create table test2_db;

-- Drop the table (removes both metadata and data files)
drop table test2_db;
-- Delete the table's data while keeping the table and its directory
truncate table test2_db;

Altering Tables


-- Alter a table
-- Modify table properties
alter table test2_db set tblproperties ('name' = 'itcast');
-- Rename the table
alter table test2_db rename to test3_tb;
-- Change a column's type
alter table test3_tb change id id string;
-- Rename the name column to username
alter table test3_tb change name username string;
-- Add a new address column
alter table test3_tb add columns (address string);
-- Move the table's storage location to /hive2
alter table test3_tb set location '/hive2';

Creating Internal and External Tables


-- Internal (managed) table: dropping it deletes both the metadata and the data files
create table test4_tb_nei
(
    id   int,
    name string
) row format delimited fields terminated by ',';

-- External table: dropping it removes only the metadata; the data files at the specified location remain
create external table test4_tb_wai
(
    id   int,
    name string
) row format delimited fields terminated by ','
    location '/hive2';
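
To confirm which kind of table was created, desc formatted reports the table type:

-- Table Type shows MANAGED_TABLE for internal tables, EXTERNAL_TABLE for external ones
desc formatted test4_tb_wai;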
    

Partitioned Tables

-- Create a partitioned table
create table test_tb_part
(
    id     int,
    name   string,
    age    int,
    gender string
) partitioned by (sex string) row format delimited fields terminated by ',';

-- Static partition load: the partition value is given explicitly
load data local inpath '/root/boy.txt' into table test_tb_part partition (sex = 'boy');
load data local inpath '/root/girl.txt' into table test_tb_part partition (sex = 'girl');

-- -------------------------------------------

-- Dynamic partitioning: partition values are derived from a specified column
-- Create the target table for dynamic partitioning
create table test_tb_part_dong(
    id     int,
    name   string,
    age    int,
    gender string
) partitioned by (sex string) row format delimited fields terminated by ',';

-- Allow dynamic-partition inserts without any static partition column
set hive.exec.dynamic.partition.mode=nonstrict;
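
Depending on the Hive version, the dynamic-partition feature flag itself may also need enabling (it defaults to true in recent releases):

set hive.exec.dynamic.partition=true;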

-- Dynamic-partition insert: the last selected column supplies the partition value
insert into table test_tb_part_dong partition (sex) select t.*, t.gender from test_tb as t;

Viewing / Dropping / Modifying Partitions

-- List the table's partitions
show partitions test_tb_part;

-- Drop a partition
alter table test_tb_part drop partition (sex = 'girl');

-- Add a new partition
alter table test_tb_part add partition (sex = 'aaa');

-- Rename a partition
alter table test_tb_part partition (sex = 'aaa') rename to partition (sex = 'bbb');

-- Change a partition's storage path
alter table test_tb_part partition (sex = 'bbb') set location '/.....';

-- Repair partitions (partition directories created directly on HDFS are missing from the metastore; msck syncs them)
msck repair table test_tb_part;
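
For example, if a partition directory is added directly on HDFS (the warehouse path below is illustrative), msck registers it in the metastore:

-- In a shell: hdfs dfs -mkdir -p /user/hive/warehouse/test_tb_part/sex=ccc
msck repair table test_tb_part;
-- sex=ccc now appears
show partitions test_tb_part;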

Bucketed Tables


-- Bucketed tables
-- Create the source table
create table if not exists t_usa_covid19
(
    count_date string,
    county     string,
    state      string,
    fips       int,
    cases      int,
    deaths     int
)
    row format delimited fields terminated by ',';


-- Create the bucketed table
create table if not exists bucket_usa_covid19
(
    count_date string,
    county     string,
    state      string,
    fips       int,
    cases      int,
    deaths     int
) clustered by (state) sorted by (cases desc) into 5 buckets;

-- Populate the bucketed table from the source (rows are bucketed during the insert)
insert into bucket_usa_covid19
select *
from t_usa_covid19;
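
One payoff of bucketing is cheap sampling: TABLESAMPLE can scan a single bucket instead of the whole table. A minimal sketch:

-- Read only bucket 1 of the 5, chosen by the hash of the bucketing column
select *
from bucket_usa_covid19 tablesample (bucket 1 out of 5 on state);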

Data Import / Write

-- Data import
-- Load data from the local filesystem
load data local inpath '/root/boy.txt' into table test_tb_part partition(sex='boy');

-- Load data from HDFS (the source file is moved into the table's directory)
load data inpath '/hive2/a.txt' into table test_tb_part partition (sex = 'bbb');

-- Overwrite load: replaces the partition's existing data
load data inpath '/hive2/b.txt' overwrite into table test_tb_part partition (sex = 'bbb');

-- Insert rows with insert into ... values
insert into team_player values (11, '石家庄刘', '清新');
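
team_player is not defined earlier in this section; a schema along these lines (an assumption inferred from the three-value insert) would make the statement runnable:

-- Hypothetical schema matching the insert above
create table if not exists team_player
(
    id   int,
    name string,
    team string
);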


-- Bulk-copy data from one table into another
-- insert into ... select ... appends data
insert into test_tb
select *
from test_tb_new;

-- insert overwrite ... select ... replaces the existing data
insert overwrite table test_tb
select *
from test_tb_new;

-- Multi-insert: scan the source table once and write into several tables
from test_tb_new
insert into table test_tb_name
select name
insert into table test_tb_age
select age;

-- Dynamic-partition insert: the last selected column (t.gender) determines the partition
insert into table test_tb_part_D partition (sex) select t.*, t.gender from test_tb_f t;

-- Insert with multiple partition columns (assumes a table partitioned by (year, month))
insert into table test_tb_part_D partition (year, month) select t.*, nian, yue from test_tb_f t;

-- -------------------------------------------

-- Write query results out to the directory /hive2 (default delimiter \001)
insert overwrite directory '/hive2'
select *
from test_tb_new;

-- Write with an explicit field delimiter
insert overwrite directory '/hive2' row format delimited fields terminated by '-'
select *
from test_tb_new;
-- Write with an explicit field delimiter to the local filesystem
insert overwrite local directory '/hive2' row format delimited fields terminated by '-'
select *
from test_tb_new;

DQL Operations


-- Create a partitioned table
CREATE TABLE t_usa_covid19_p
(
    county string,
    fips   int,
    cases  int,
    deaths int
)
    partitioned by (count_date string,state string)
    row format delimited fields terminated by ",";
    
-- Enable nonstrict dynamic partitioning
set hive.exec.dynamic.partition.mode=nonstrict;
-- Insert the data, deriving partitions dynamically
insert into t_usa_covid19_p partition (count_date, state)
select county, fips, cases, deaths, count_date, state
from t_usa_covid19;

  • Basic query execution order: from → join ... on → where → group by → having → order by → select (the displayed columns) → limit (pagination); see the combined query below

  • where filters rows before grouping; having filters groups after aggregation
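
A single query touching every clause makes the order concrete (the date filter below is illustrative):

-- where trims rows first; group by aggregates; having filters the groups;
-- order by sorts; limit cuts the output
select state, sum(deaths) as total_deaths
from t_usa_covid19_p
where count_date = '2021-01-28'
group by state
having sum(deaths) > 10000
order by total_deaths desc
limit 3;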


-- Filter with where
select * from t_usa_covid19_p where state = 'Alabama' and deaths > 100;

-- Deduplicate with distinct
select distinct county from t_usa_covid19_p;

-- Group and aggregate
select state, sum(deaths)
from t_usa_covid19_p
group by state;

-- Filter groups with having
select state, sum(deaths)
from t_usa_covid19_p
group by state
having sum(deaths) > 10000;

-- Sort with order by
select * from t_usa_covid19_p where state = 'Alabama' order by deaths desc;

-- Pagination: return 5 rows
select *
from t_usa_covid19_p
where state = 'Alabama'
order by deaths desc
limit 5;

-- Pagination with an offset: skip the first 3 rows, then return 5 rows
select *
from t_usa_covid19_p
where state = 'Alabama'
order by deaths desc
limit 3,5;

Advanced Queries

  • order by is a global sort: it forces all data through a single reducer
  • cluster by also uses a single reducer by default, so its result looks the same as order by

-- cluster by query
select * from t_usa_covid19_p where state = 'Alabama' cluster by deaths;

-- The number of reducers can be set manually; the data is then split into two separately sorted parts
set mapreduce.job.reduces=2;

select * from t_usa_covid19_p where state = 'Alabama' cluster by deaths;

  • A second way to sort:
  • distribute by chooses the column used to split data across reducers; sort by orders rows within each reducer, as in the query below
select * from t_usa_covid19_p where state = 'Alabama' distribute by deaths sort by cases;
  • union combines the results of two queries with matching columns, stacking the rows; distinct removes duplicates from the combined result
select *
from test_tb_un
union distinct
select *
from test_tb_f;
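
If duplicates should be kept, union all skips the deduplication step and is cheaper:

-- union all keeps duplicate rows and avoids the dedup shuffle
select * from test_tb_un
union all
select * from test_tb_f;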
CTE Syntax
-- with ... as subquery
with tb1 as (select * from test_tb)
select *
from tb1;

-- Multiple CTEs
with tb1 as (select * from test_tb),
     tb2 as (select * from test_tb_un)
select *
from tb1
union
select *
from tb2;
Join Queries
-- Create three employee tables; table1: basic employee info
CREATE TABLE employee(
   id int,
   name string,
   deg string,
   salary int,
   dept string
 ) row format delimited
fields terminated by ',';

-- table2: employee address table
CREATE TABLE employee_address (
    id int,
    hno string,
    street string,
    city string
) row format delimited
fields terminated by ',';

-- table3: employee contact table
CREATE TABLE employee_connection (
    id int,
    phno string,
    email string
) row format delimited
fields terminated by ',';
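
The join examples are easier to follow with a few rows in place; the values below are invented for illustration:

-- Sample rows (made up for demonstration)
insert into employee values
    (1201, 'gopal', 'manager', 50000, 'TP'),
    (1202, 'manisha', 'clerk', 40000, 'TP');
insert into employee_address values
    (1201, '288A', 'vgiri', 'jublee'),
    (1205, '68C', 'vgiri', 'hyd');
insert into employee_connection values
    (1201, '2356742', 'gopal@itp.com');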

-- Inner join: rows that exist in both tables
select *
from employee as e1
         join employee_address as e2 on e1.id = e2.id;

-- Left join
select *
from employee as e1
         left join employee_address e2 on e1.id = e2.id;

-- Right join
select *
from employee e1
         right join employee_address e2 on e1.id = e2.id;

-- Full outer join
select *
from employee e1
         full join employee_address e2 on e1.id = e2.id;

-- Left semi join: returns rows from the left table that have a match in the right table, showing only the left table's columns
select *
from employee e1
     left semi join employee_address e2 on e1.id = e2.id;

-- Cross join (Cartesian product)
select *
from employee
         cross join employee_address;
Window Functions
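The window-function examples assume a page-view table shaped roughly like this (the schema is inferred from the queries, not defined in the original):

-- Inferred schema for the examples below
create table if not exists website_pv_info
(
    cookieid   string,
    createtime string,
    pv         int
) row format delimited fields terminated by ',';
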
-- Total pv per user with a window function
SELECT cookieid,
       sum(pv) OVER(PARTITION BY cookieid)
FROM website_pv_info;

-- Cumulative pv per user up to each day (with order by, the default frame runs from unbounded preceding to the current row)
SELECT cookieid,
       pv,
       sum(pv) OVER(PARTITION BY cookieid
                    ORDER BY createtime)
FROM website_pv_info;

-- Find each user's top-3 days by pv (ties are not collapsed here)
-- rank: ties receive the same rank and leave gaps in the numbering
SELECT *,
       rank() over(PARTITION BY cookieid
                   ORDER BY pv DESC) AS rank_num
FROM website_pv_info;

-- dense_rank: ties receive the same rank with no gaps
SELECT *,
       dense_rank() over(PARTITION BY cookieid
                   ORDER BY pv DESC) AS rank_num
FROM website_pv_info;

-- row_number: assigns consecutive row numbers, even for ties
SELECT *,
       row_number() over(PARTITION BY cookieid
                   ORDER BY pv DESC) AS rank_num
FROM website_pv_info;

-- Use a CTE to pick the top 3
WITH pv1 as (SELECT *,
       rank() over(PARTITION BY cookieid
                   ORDER BY pv DESC) AS rank_num
FROM website_pv_info)
SELECT * FROM pv1 WHERE rank_num <= 3;

-- Find each user's top third of days by pv; ntile(3) splits each partition into 3 roughly equal buckets
SELECT *,
       ntile(3) over (partition BY cookieid
                      ORDER BY pv DESC) AS rank_num
FROM website_pv_info;
-- Use a CTE to keep bucket 1 (the top third)
WITH tpv2 AS
  (SELECT *,
          ntile(3) over (partition BY cookieid
                         ORDER BY pv DESC) AS rank_num
   FROM website_pv_info)
SELECT *
FROM tpv2
WHERE rank_num = 1;

Specifying the Window Frame

-- Sum from the current row through the 3 rows after it (4 rows in total)
SELECT *,
       sum(pv) OVER(PARTITION BY cookieid
                    ORDER BY createtime ROWS BETWEEN CURRENT ROW AND 3 FOLLOWING)
FROM website_pv_info;

-- Frame from 2 rows before through 3 rows after the current row
SELECT *,
       sum(pv) OVER(PARTITION BY cookieid
                    ORDER BY createtime ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING)
FROM website_pv_info;
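
For reference, each frame bound can be any of the keywords below, as long as the start does not come after the end; a running total is the classic case:

-- Bounds: unbounded preceding | n preceding | current row | n following | unbounded following
SELECT *,
       sum(pv) OVER(PARTITION BY cookieid
                    ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_pv
FROM website_pv_info;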

Analytic Functions

-- Analytic functions
-- lag: take the value from the previous row in the partition
SELECT *,
       lag(pv) OVER(PARTITION BY cookieid
                    ORDER BY pv DESC) AS prev_pv
FROM website_pv_info;

-- lead: take the value from the next row in the partition
SELECT *,
       lead(pv) OVER(PARTITION BY cookieid
                    ORDER BY pv DESC) AS next_pv
FROM website_pv_info;
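
lag and lead also take an optional offset and a default for rows that fall outside the partition; the offset 2 and default 0 here are illustrative:

-- Look back two rows; return 0 when no such row exists
SELECT *,
       lag(pv, 2, 0) OVER(PARTITION BY cookieid
                          ORDER BY pv DESC) AS pv_two_back
FROM website_pv_info;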

-- first_value: the first value in the window
SELECT *,
       first_value(pv) OVER(PARTITION BY cookieid
                    ORDER BY pv DESC) AS first_pv
FROM website_pv_info;

-- last_value: the last value in the window (the frame must be extended to the whole partition, because the default frame stops at the current row)
SELECT *,
       last_value(pv) OVER(PARTITION BY cookieid
                           ORDER BY pv DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_pv
FROM website_pv_info;
