Hive索引

xiuxiuhh

已于 2023-06-15 17:38:43 修改

阅读量1k

点赞数 6

文章标签： hive hadoop 数据仓库

于 2023-06-15 17:33:03 首次发布

本文链接：https://blog.csdn.net/xiuxiuhh/article/details/130861648

版权

1 索引机制

数据准备
1、行组索引：主要目的，实现对范围查询进行优化
第一步：准备数据集，载入搜狗数据，创建一张普通表

create table tb_sogou(
  s_time string,
  uuid string,
  keywords string,
  rank string,
  url string
)row format delimited fields terminated by '\t';
-- 本地导入
load data local inpath '/export/data/sogou.tsv' into table tb_sogou;

-- 第二步：基于数据集在创建一个数据表，开启行组索引
-- ① 必须采用ORC存储格式 ② 开启行组索引 ③ 范围查询字段，在建表时，必须有序
create table tb_sogou_index1 stored as orc tblproperties('orc.create.index'='true') as
select substr(s_time,1,2) as hour, uuid, keywords, rank, url
from tb_sogou sort by hour;
-- 查看表数据
select * from default.tb_sogou_index1 limit 20;

在Hub web UI里作业浏览器中查看任务执行情况
在这里插入图片描述

-- 第三步：没有开启行组索引效果
set hive.optimize.index.filter = false;
-- 举个栗子，查询12 ~ 14点这个范围内，一共有多少次搜索
select count(*) from tb_sogou_index1 where hour >= 12 and hour <= 14;

在这里插入图片描述

-- 第四步：开启了行组索引效果
set hive.optimize.index.filter = true;
select count(*) from tb_sogou_index1 where hour >= 12 and hour <= 14;

在这里插入图片描述

2 布隆索引

布隆索引 => 等值查询
第一步：创建一个普通ORC数据表，没有使用布隆索引

create table tb_sogou_index2 stored as orc as
select
    s_time, uuid, keywords, rank, url
from tb_sogou;

第二步：使用等值查询，通过explain查询计划，查看执行情况

explain select count(*) from tb_sogou_index2 where uuid = '4945535469948956';

结果如下：

STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1
""
STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: tb_sogou_index2
            filterExpr: (uuid = '4945535469948956') (type: boolean)
            Statistics: Num rows: 1724264 Data size: 874201848 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: (uuid = '4945535469948956') (type: boolean)
              Statistics: Num rows: 862132 Data size: 437100924 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                Statistics: Num rows: 862132 Data size: 437100924 Basic stats: COMPLETE Column stats: NONE
                Group By Operator
                  aggregations: count()
                  mode: hash
                  outputColumnNames: _col0
                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                  Reduce Output Operator
                    sort order: 
                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                    value expressions: _col0 (type: bigint)
      Execution mode: vectorized
      Reduce Operator Tree:
        Group By Operator
          aggregations: count(VALUE._col0)
          mode: mergepartial
          outputColumnNames: _col0
          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
""
  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

第三步：基于要查询的uuid字段，简历布隆索引

create table tb_sogou_index3 stored as orc tblproperties('orc.bloom.filter.columns'='uuid', 'orc.bloom.filter.fpp'='0.05') as
select
    s_time, uuid, keywords, rank, url
from tb_sogou;

– 第四步：使用等值查询，通过explain查询计划，查询是否有走布隆索引

explain select count(*) from tb_sogou_index3 where uuid = '4945535469948956';

结果如下：

STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1
""
STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: tb_sogou_index3
            filterExpr: (uuid = '4945535469948956') (type: boolean)
            Statistics: Num rows: 1724264 Data size: 874201848 Basic stats: COMPLETE Column stats: NONE
            Filter Operator
              predicate: (uuid = '4945535469948956') (type: boolean)
              Statistics: Num rows: 862132 Data size: 437100924 Basic stats: COMPLETE Column stats: NONE
              Select Operator
                Statistics: Num rows: 862132 Data size: 437100924 Basic stats: COMPLETE Column stats: NONE
                Group By Operator
                  aggregations: count()
                  mode: hash
                  outputColumnNames: _col0
                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                  Reduce Output Operator
                    sort order: 
                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                    value expressions: _col0 (type: bigint)
      Execution mode: vectorized
      Reduce Operator Tree:
        Group By Operator
          aggregations: count(VALUE._col0)
          mode: mergepartial
          outputColumnNames: _col0
          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
""
  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink