A Brief Talk on Hive: Lecture 2

1. Clone a table, without data:
create table if not exists t5 like t4;

The cloned table's definition, as shown by show create table t5:

CREATE TABLE `t5`(
  `uid` int, 
  `uname` string)
ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY ',' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://gp191801:9000/user/hive/warehouse/gp1918.db/t5'
TBLPROPERTIES (
  'transient_lastDdlTime'='1554084875')
  
For comparison, the source table, as shown by show create table t4:

CREATE TABLE `t4`(
  `uid` int, 
  `uname` string)
ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY ',' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://gp191801:9000/user/hive/warehouse/gp1918.db/t4'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='true', 
  'numFiles'='1', 
  'numRows'='2', 
  'rawDataSize'='23', 
  'totalSize'='25', 
  'transient_lastDdlTime'='1553168093')
Clone a table, with data (point the new table at the existing data directory):
 create table if not exists t6 like t4 LOCATION '/user/hive/warehouse/gp1918.db/t4' ;

Clone a table, with data (a more flexible way, CTAS):
As with a normal CREATE TABLE, both the metadata and the table directory are created.

create table t8
as 
select uid,uname,1 as sex
from t4
;
Describe a database:
desc/describe database [extended] gp1918;
Describe a table:
desc/describe [extended] t4;
show create table t4;   // the most complete view

Example:

CREATE TABLE log(
id             string COMMENT 'this is id column',
phonenumber     bigint,
mac             string,
ip               string,
url              string,
status1          string,
status2           string,
upflow            int,
downflow         int,
state           string,
dt string
)
COMMENT 'this is log table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
LINES TERMINATED BY '\n'
stored as textfile
;

Load the data:

load data local inpath '/hivedata/data.log.txt' into table log;

Requirement 1: compute the upstream, downstream, and total traffic for each phone number:

select
l.phonenumber,
sum(l.upflow) as sumupflow,
sum(l.downflow) as sumdownflow,
sum(l.upflow + l.downflow) as sumflow
from log l
group by l.phonenumber
;

Run Hive in local mode:

set hive.exec.mode.local.auto=true;
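Local mode only kicks in for small jobs; the thresholds are governed by two companion properties. These are standard Hive settings, and the values shown are their usual defaults, stated here as an assumption:

set hive.exec.mode.local.auto.inputbytes.max=134217728;  -- max total input size (bytes) for local mode
set hive.exec.mode.local.auto.input.files.max=4;         -- max number of input files for local mode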

Requirement 2: find the top three most-visited websites (url):

select 
l.url website,
count(l.url) as webcount
from log l
group by l.url
order by webcount desc 
limit 3
;

TOP-N:
Requirement 3: simulate billing:
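The notes leave this requirement unimplemented, so the following is only a minimal sketch: it assumes a hypothetical flat rate of 0.1 per MB of total traffic and that upflow/downflow are recorded in KB; both assumptions are illustrative, not from the lecture.

select
l.phonenumber,
sum(l.upflow + l.downflow) as sumflow,
round(sum(l.upflow + l.downflow) / 1024 * 0.1, 2) as fee   -- hypothetical rate: 0.1 per MB
from log l
group by l.phonenumber
;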

Altering table properties:
1. Rename a table (rename to):

alter table t7 rename to t_user;

2. Rename a column (change column):

alter table t8 change column sex usex string;

3. Change a column's position (note: moving a column does not move the underlying data):

alter table t8 change column uname uname string after usex;
alter table t8 change column uname uname string first;

4. Change a column's data type:

alter table t8 change column usex usex int;

5. Add columns (add columns):

alter table t8 add columns(
uage int,
uaddr string
)
;

6. Remove columns (replace columns): this drops all existing column definitions and recreates them from the new list; the data files themselves are untouched:

alter table t8 replace columns(
`uname` string, 
`uid` int, 
`uaddr` string
)
;

7. Convert between managed (internal) and external tables:

alter table t4 set tblproperties("EXTERNAL"="TRUE");  -- TRUE must be uppercase
alter table t4 set tblproperties("EXTERNAL"="false"); -- case does not matter when converting back

Show the current database in the CLI prompt:

set hive.cli.print.current.db=true;

Drop a table:

drop table if exists t4;

Drop a database:

drop database if exists gp1918;          -- only works on an empty database
drop database if exists gp1918 cascade;  -- force-drop a non-empty database

------------------ Partitioning ------------------
Why partition?
As a system runs, tables keep growing, and Hive queries normally do a full table scan, which reads a lot of irrelevant data and badly hurts query performance. Partitioning was introduced to avoid the full scan and improve query efficiency: a table's data is split on storage into multiple subdirectories, each named after a partition value, e.g. age=19, age=20, age=21.

Partitioning syntax
How to partition is driven by business needs; year, month, day, or region are the usual partition values.
[PARTITIONED BY (COL_NAME DATA_TYPE)]

  1. Hive partition names are case-sensitive.
  2. A partition is really just a subdirectory under the table directory; the partition field is a pseudo-column that does not physically exist in the data files.
  3. A table can have one or more partitions, and each partition can itself contain further partitions.

Why partitions matter
They let users shrink the range of data scanned during analysis: a select can target specific partitions, which improves query efficiency.

Operations:
Create a table with a single partition level:

create table if not exists part1(
uid int,
uname string,
uage int 
)
PARTITIONED BY (country string)
row format delimited 
fields terminated by ','
stored as textfile
;

Loading data into a partitioned table:

load data local inpath '/hivedata/china_person.txt' into table part1 partition(country='china')
;
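With the partition loaded, a filter on the pseudo-column prunes the scan to that single directory. A minimal sketch against part1, using the country value loaded above:

select uid, uname, uage
from part1
where country='china'
;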

Create a table with two partition levels:

create table if not exists part2(
uid int,
uname string,
uage int 
)
PARTITIONED BY (year string,month string)
row format delimited 
fields terminated by ','
stored as textfile
;

Load data:

load data local inpath '/hivedata/china_2019-04.txt' into table part2 partition(year='2019',month='04')
;

load data local inpath '/hivedata/china_2019-03.txt' into table part2 partition(year='2019',month='03')
;

load data local inpath '/hivedata/japan_2018-04.txt' into table part2 partition(year='2018',month='04')
;

Create a table with three partition levels:

create table if not exists part3(
uid int,
uname string,
uage int 
)
PARTITIONED BY (year string,month string,day string)
row format delimited 
fields terminated by ','
stored as textfile
;

Show partitions:

show partitions part2;

Modifying partitions:

  1. Renaming a partition: there is no command for this; do it manually by renaming the directory on HDFS and updating the metastore (see the sketch after this list).

  2. Add a partition:

alter table part1 add partition(country='korea'); 

  3. Add a partition and point it at existing data:

alter table part1 add partition(country='india') LOCATION '/user/hive/warehouse/gp1918.db/part1/country=india'
;
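A minimal sketch of the manual rename in item 1, assuming the part1 table above; the new value 'chn' is purely illustrative. One way to fix the metastore afterwards is to drop the old partition entry and add the new one:

hdfs dfs -mv /user/hive/warehouse/gp1918.db/part1/country=china /user/hive/warehouse/gp1918.db/part1/country=chn
-- the old directory no longer exists, so this drop only removes the metadata entry:
alter table part1 drop partition(country='china');
alter table part1 add partition(country='chn');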

Add multiple partitions at once:

alter table part1 add partition(country='yenan') 
 partition(country='xinjiapo') 
;

Change the directory a partition points to:

alter table part1 partition(country='yenan') set location 'hdfs://gp191801:9000/user/hive/warehouse/gp1918.db/part1/country=india'
;   // when changing a partition's location, the path must be an absolute HDFS URI

Drop a partition:

alter table part1 drop partition(country='xinjiapo');

Drop multiple partitions:

alter table part1 drop partition(country='yenan'),partition(country='india');

Types of partitioning
Static partitioning: data is loaded into an explicitly specified partition value.
Dynamic partitioning: the values are not known up front; partitions are created from the values in the data.
Mixed partitioning: a combination of static and dynamic.

Dynamic partition settings:

hive.exec.dynamic.partition=true/false
hive.exec.dynamic.partition.mode=strict/nonstrict
hive.exec.max.dynamic.partitions=1000
hive.exec.max.dynamic.partitions.pernode=100
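In practice these are set in the CLI session before running the dynamic insert; a minimal sketch using the values listed above:

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000;
set hive.exec.max.dynamic.partitions.pernode=100;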

create table if not exists dy_part1(
uid int,
uname string,
uage int
)
partitioned by (year string,month string,day string)
row format delimited 
fields terminated by ','
;

Loading data (a dynamic partition cannot be populated with LOAD DATA, because LOAD DATA needs an explicit value for every partition column; the following therefore fails):

load data local inpath '/hivedata/china_2019-03' into table dy_part1 partition(year=2019,month,day)
;

Instead, first create a staging table:

create table if not exists part_tmp(
uid int,
uname string,
uage int,
year string,
month string,
day string
)
row format delimited 
fields terminated by ','
;

Load data into the staging table:

insert into table part_tmp
select uid,uname,uage,year,month,1 as day from part2;

Then write the staging data into the partitioned table with dynamic partitioning:

insert into table dy_part1 partition(year,month,day)
select * 
from part_tmp
;

Mixed partitioning (watch that the column counts match):

create table if not exists dy_part2(
uid int,
uname string,
uage int
)
partitioned by (year string,month string,day string)
row format delimited 
fields terminated by ','
;

Insert data (year is static; month and day are dynamic):

insert into table dy_part2 partition(year='2019',month,day)
select uid,uname,uage,month,day 
from part_tmp
where year='2019'
;

Running Hive in strict mode (the XML below shows the property at its default, nonstrict; set it to strict to enable the checks):

<property>
    <name>hive.mapred.mode</name>
    <value>nonstrict</value>
    <description>
      The mode in which the Hive operations are being performed. 
      In strict mode, some risky queries are not allowed to run. They include:
        Cartesian Product.
        No partition being picked up for a query.
        Comparing bigints and strings.
        Comparing bigints and doubles.
        Orderby without limit.
    </description>
  </property>
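Equivalently, per session in the CLI (the property name is the one shown above):

set hive.mapred.mode=strict;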

Notes:

  1. A Hive partition column sits outside the table data: it is a pseudo-column, but it can still be used as a query filter.
  2. Avoid Chinese characters in partition values.
  3. Dynamic partitioning should be used sparingly: it queries the data through MapReduce, and too many partitions become a performance bottleneck for the NameNode and YARN, so estimate the partition count as well as possible before using it.
  4. Partition attributes can all be modified manually (both the metastore entries and the data on HDFS).

---------------------- Bucketing ----------------------
Why bucketing?
When a single partition or table keeps growing and partitioning can no longer divide the data finely enough, bucketing is used to split and manage the data at a finer granularity.

Bucketing syntax

CLUSTERED BY (COL_NAME,COL_NAME...)
SORTED BY (COL_NAME [ASC|DESC]) INTO NUM_BUCKETS BUCKETS

Bucketing keyword: bucket
By default, a row's bucket number is hash(bucket column) % (total number of buckets). For example, with 4 buckets, a row with uid 7 lands in bucket 7 % 4 = 3, i.e. the fourth bucket file (000003_0).

Why bucketing matters

  1. To preserve the bucketed structure in query results (data hash-distributed by the bucket column).
  2. Bucketed tables have two main uses: sampling, and making JOINs in MR programs more efficient (see the sketch right after this list).
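For the JOIN case, Hive can exploit matching bucket layouts once bucket map join is enabled. A minimal sketch: hive.optimize.bucketmapjoin is a standard Hive property, and buc2/buc3 are the bucketed tables created later in this section (both bucketed by uid into 6 buckets):

set hive.optimize.bucketmapjoin=true;
select a.uid, a.uname, b.uage
from buc2 a join buc3 b on a.uid = b.uid
;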

Using a bucketed table:

  1. First create a table with a bucketing definition (the bucketed table).
  2. Then insert into it from the raw data (an ordinary table) using a bucketed query.

Create a bucketed table:

create table if not exists buc1(
uid int,
uname string,
uage int 
)
clustered by(uid) into 4 buckets
row format delimited 
fields terminated by ','
;

Load data (LOAD DATA does not produce bucketed files):

load data local inpath '/hivedata/china_person.txt' into table buc1;

Create a staging table for the bucketed data; its sample contents follow:

create table if not exists buc_tmp(
uid int,
uname string,
uage int 
)
row format delimited 
fields terminated by ','
;

1,gyy01,18
2,gyy02,18
3,gyy03,18
4,gyy04,18
5,gyy05,18
6,gyy06,18
7,gyy07,18
8,gyy08,18
9,gyy09,18
10,gyy10,18
11,gyy11,18
12,gyy12,18
13,gyy13,18
14,gyy14,18
15,gyy15,18
16,gyy16,18
17,gyy17,18
18,gyy18,18
19,gyy19,18
20,gyy20,18
21,gyy21,18
22,gyy22,18

Run a bucketed query:

select * from buc_tmp cluster by (uid);

Set the enforce-bucketing property:

<property>
    <name>hive.enforce.bucketing</name>
    <value>false</value>
    <description>Whether bucketing is enforced. If true, while inserting into the table, bucketing is enforced.</description>
  </property>
set hive.enforce.bucketing=true;   -- or false to disable

If the number of reducers differs from the number of buckets, set it manually:

set mapreduce.job.reduces=6;   -- match the 6 buckets of buc2 below
create table if not exists buc2(
uid int,
uname string,
uage int 
)
clustered by(uid) into 6 buckets
row format delimited 
fields terminated by ','
;

Insert data into the bucketed table:

insert overwrite table buc2 
select * from buc_tmp
cluster by (uid)
;

Create a bucketed table with a sort column and sort order:

create table if not exists buc3(
uid int,
uname string,
uage int 
)
clustered by(uid) 
sorted by (uid desc) into 6 buckets
row format delimited 
fields terminated by ','
;

Insert data:

insert overwrite table buc3 
select * from buc_tmp
distribute by (uid) sort by (uid asc)
;

Querying bucketed data:
Query all the data:

select * from buc3;
select * from buc3 tablesample(bucket 1 out of 1);

Query the first of the 6 buckets:

select * from buc3 tablesample(bucket 1 out of 6 on uid);

select * from buc3 tablesample(bucket 1 out of 2 on uid);   -- 3 buckets' worth of data (compressed)

select * from buc3 tablesample(bucket 1 out of 12 on uid);  -- half of one bucket (stretched)

select * from buc3 tablesample(bucket 1 out of 7 on uid);   -- 7 is neither a factor nor a multiple of 6; see the rule below

tablesample(bucket x out of y on uid)
// x: which bucket to start from; x must not be greater than y
// y: the number of buckets to divide the table into; y should be a multiple or a factor of the table's total bucket count

Neither compressed nor stretched (y equals the bucket count, 6):

bucket 1 out of 6 on uid
-- returns exactly bucket 1: the rows where hash(uid) % 6 == 0

Compressed (y is a factor of the bucket count, y < 6):

bucket 1 out of 2 on uid
-- returns 6/2 = 3 buckets' worth of data: buckets 1, 1+2 = 3, and 3+2 = 5
-- (the rows where hash(uid) % 2 == 0)

Stretched (y is a multiple of the bucket count, y > 6):

bucket 7 out of 12 on uid
-- returns 6/12 = 1/2 of one bucket: the rows where hash(uid) % 12 == 6
-- (uid 6 and 18 in the sample data above)

Query the rows with odd ids (hash(uid) % 2 == 1):

select * from buc3 tablesample(bucket 2 out of 2 on uid)
;

Query the first 3 rows of the table:

select * from buc3 limit 3;
select * from buc_tmp tablesample(3 rows);
select * from buc_tmp tablesample(12 percent);
select * from buc_tmp tablesample(30B);  -- size units: B, K, M, G

Get 3 random rows:

select *
from buc_tmp
order by rand() 
limit 3;

Bucketing summary:
1. Definition
clustered by (uid)   -- the bucketing column
sorted by (uid desc) -- the expected sort order of the data within each bucket

2. Loading data
cluster by (uid)     -- the column getPartition uses for hash distribution; it also becomes the sort column, ascending only. For more flexibility, use:
distribute by (uid)  -- the column getPartition uses for hash distribution
sort by (uid asc)    -- the sort column and sort order within each reducer
order by: global ordering

cluster by (uid) and distribute by (uid) sort by (uid asc)
produce identical results.
