Hadoop Books (2): Notes on Programming Hive

1. Creating a database
create database if not exists financials
comment 'hold all financial tables'
with dbproperties('creator'='wuwc','date'='2019-12-09');
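
The properties can be verified with describe database (standard HiveQL):

describe database extended financials;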

2. Setting variables and properties
set hive.cli.print.current.db=true;
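
Variables defined with set can be substituted into later statements; a minimal sketch using a hivevar (the variable name is an illustrative assumption):

set hivevar:target_db=financials;
use ${hivevar:target_db};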

3. Dropping a database
drop database if exists financials cascade;

4. Altering a database
alter database financials set dbproperties('updated'='greekw');

5. Creating tables

  • Managed (internal) table
    create table if not exists employees(
    name STRING,
    salary FLOAT,
    subordinates ARRAY<STRING>,
    deductions MAP<STRING,FLOAT>
    )
    comment 'employee table'
    row format delimited
    fields terminated by '\001'
    collection items terminated by '\002'
    map keys terminated by '\003'
    lines terminated by '\n'
    location '/user/hive/warehouse/financials.db/employees'
    tblproperties('creator'='wuwc','date'='2019-12-09');
  • Creating a table with LIKE
    create table if not exists employees02
    like financials.employees;

  • External table
    create external table if not exists financials.stocks
    (
    stock_code STRING,
    symbol STRING,
    price_open FLOAT,
    price_high FLOAT,
    price_low FLOAT,
    price_close FLOAT
    )
    row format delimited fields terminated by ','
    location '/data/stocks';

6. Describing a table
describe formatted employees;

7. Partitioned and managed tables

  • Managed partitioned table
    create table if not exists employees_part(
    name STRING,
    salary FLOAT,
    subordinates ARRAY<STRING>,
    deductions MAP<STRING,FLOAT>
    )
    partitioned by (country string,state string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

  • External partitioned table
    create external table if not exists log_msg(
    hms int,
    severity string,
    server string,
    processId int,
    msg string
    )
    partitioned by (year int,month int,day int)
    row format delimited fields terminated by '\t';

Altering tables
1. Renaming a table
alter table log_msg rename to log_msg_new;

2. Adding and dropping table partitions
alter table log_msg add if not exists
partition(year=2019,month=12,day=10) location '/logs/2019/12/10';
(ADD PARTITION must specify every partition column of log_msg; DROP may use a partial spec, which drops all matching partitions.)

alter table log_msg drop if exists partition(year=2019,month=12);

3. Adding and changing columns
alter table log_msg add columns(
app_name string comment 'application name'
);

4. Other table alterations

  • Changing table properties:

    • alter table table_name set tblproperties('key'='value');

  • Partition operations (a sketch follows below):

    • alter table ... archive/unarchive/touch partition(...);

  • Enabling partition protection:

    • alter table table_name partition(year=2019,month=12) enable no_drop;
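
A minimal sketch of these partition operations, assuming the log_msg partition added above and a cluster that supports Hadoop archives:

set hive.archive.enabled=true;
alter table log_msg archive partition(year=2019,month=12,day=10);
alter table log_msg unarchive partition(year=2019,month=12,day=10);
alter table log_msg touch partition(year=2019,month=12,day=10);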

Data manipulation

create table t10(
id int,
name string,
hobby array<string>,
add map<string,string>
)
partitioned by (pt_d string,sex string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';

create table t1(
id int,
name string,
hobby array<string>,
add map<string,string>
)
partitioned by (pt_d string)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';

Loading data

Load from the local filesystem
Load from HDFS
Insert from another Hive table
Create a table and load data at the same time

  • load data local inpath '<local path>' overwrite into table table_name partition(...);
    Correct usage:
    load data local inpath '/data/hive' overwrite into table log_msg partition(year=2019,month=12,day=10);

insert overwrite table log_msg partition(year=2019,month=12,day=11) select hms, severity, server, processId, msg from log_msg;

Errors seen when the partition spec is missing or wrong:

Need to specify partition columns because the destination table is partitioned

Partition not found '12'

  • load data inpath '<HDFS path>' overwrite into table table_name partition(...);

  • insert overwrite table table_name partition(...) select * from table_name;

  • Dynamic and static partitions; static partition keys must precede dynamic ones:

      - hive.exec.dynamic.partition=true: enables dynamic partitioning;
      - hive.exec.dynamic.partition.mode=nonstrict: allows all partition keys to be dynamic;
      - hive.exec.max.dynamic.partitions.pernode=100: maximum number of dynamic partitions each mapper or reducer may create;
      - hive.exec.max.dynamic.partitions=1000: maximum total number of dynamic partitions;
      - hive.exec.max.created.files=1000: maximum number of files that may be created globally;

from stu
insert into table student01 partition(city='beijing') select id, sname, age where city='beijing'
insert into table student02 partition(city='shanghai') select id, sname, age where city='shanghai';

  • Create the tables
    create external table if not exists student(
    id string,
    sname string,
    age int
    )
    row format delimited fields terminated by ',';

create table if not exists student_hdfs(id string,sname string,age int)
row format delimited fields terminated by ','
lines terminated by '\n';

create table if not exists student_insert(id string,sname string,age int)
row format delimited fields terminated by ','
lines terminated by '\n';

  • Load from the local filesystem
    load data local inpath '/data/hive/student.txt' overwrite into table student;

  • Load from HDFS (note that this moves the source file into the table's directory rather than copying it)

load data inpath '/user/hdfs/student.txt' overwrite into table student_hdfs;

  • Error encountered:
    Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask

  • INSERT approach
    insert overwrite table student_insert select * from student;

  • FROM ... INSERT approach
    from student insert into table student_insert select id,sname,age where age>28;

  • Create partitioned tables
    create external table if not exists student_part(id string,sname string,age int)
    partitioned by (city string)
    row format delimited fields terminated by ','
    lines terminated by '\n';

create external table if not exists student_double_part(id string,sname string,age int)
partitioned by (country string,city string)
row format delimited fields terminated by ','
lines terminated by '\n';

  • Add a partition

ALTER TABLE student_part ADD if not exists PARTITION(city='beijing') location '<partition path>';

FAILED: SemanticException table is not partitioned but partition spec exists: {address=china}
This fails because the table was not created with a partition column named address; ADD PARTITION only succeeds on a table whose schema declares that partition column.

  • Load data into partitions
    ALTER TABLE student_part ADD if not exists PARTITION(city='shanghai');
  • load data local inpath '/data/hive/student.txt' overwrite into table student_part partition(city='beijing');
  • INSERT approach
    from student
    insert into table student_part partition(city='guangdong') select id,sname,age
    insert into table student_part partition(city='chongqing') select id,sname,age;

from student
insert into table student_double_part partition(country='china',city='chongqing') select id,sname,age;

  • Repair partitions

    dfs -mkdir /user/hive/warehouse/financials.db/student_double_part/country=japan/city=tokyo;

    • msck repair table table_name;
  • Show partitions

    • show partitions table_name;
  • Dynamic partitions

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

from student_part
insert into table student_part partition(city) select id, sname, age, city
insert into table student_part partition(city) select id, sname, age, city;

  • Mixing static and dynamic partitions (a sketch follows):
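    A minimal sketch, assuming the student_part and student_double_part tables above; the static key (country) must come before the dynamic key (city):
    from student_part
    insert into table student_double_part partition(country='china', city)
    select id, sname, age, city;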

  • Create a table and load data at the same time (CTAS)

    • create table student_part_create as select * from student_part;
  • Bucketing: both partitioning and bucketing organize data placement by a column.
    Partitioning stores rows with the same column value in the same directory, while bucketing stores rows whose column value hashes to the same bucket in the same file.
    create table student_cluster( id string, sname string, age int)
    clustered by(id) into 2 buckets
    row format delimited fields terminated by ','
    lines terminated by '\n';
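    A minimal sketch of populating and sampling the bucketed table (hive.enforce.bucketing is the pre-Hive-2.0 property name):
    set hive.enforce.bucketing=true;
    insert overwrite table student_cluster select id, sname, age from student;
    select * from student_cluster tablesample(bucket 1 out of 2 on id);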

  • Exporting data

  • To the local filesystem:
    insert overwrite local directory '/data/hive/student_export' select * from student where age>28;

  • To HDFS:
    insert overwrite directory '<hdfs path>' select * from student where age>28;

Reference: http://blog.leanote.com/post/yongjian3311@163.com/Hive%E4%BF%AE%E6%94%B9%E8%A1%A8%E7%BB%93%E6%9E%84

Hive queries

case ... when ... then
select id, sname, age,
case
when age > 30 then 'old'
when age > 20 and age <= 30 then 'mid'
else 'young'
end as level from student;

  • Hive JOIN queries (see the sketch after this list)

    • left / right / full outer joins, Cartesian product

    • Disallow Cartesian products: set hive.mapred.mode=strict

    • map-side join

  • order by / sort by

    • With set hive.mapred.mode=strict, order by must be accompanied by limit

    • order by sorts globally; sort by sorts within each reducer

    • distribute by and group by

  • Type conversion functions: cast(field as int), round(), floor()
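
A minimal sketch of these query features, reusing the student and student_part tables from earlier (the join key is an illustrative assumption):

-- map-side join: hint the small table, or let Hive convert the join automatically
set hive.auto.convert.join=true;
select /*+ MAPJOIN(p) */ s.id, s.sname, p.city
from student s join student_part p on s.id = p.id;

-- distribute by + sort by: rows with the same id go to the same reducer and are sorted within it
select id, sname, age from student distribute by id sort by id, age desc;

-- type conversion
select cast(age as float), round(age / 3), floor(age / 3) from student;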

Indexes

Schema design

  • Partitioning schemes

  • Unique keys and normalization

  • Partitioning every table

Tuning (a settings sketch follows this list)
  • Using explain / explain extended

  • Tuning limit

  • JOIN optimization

  • Local mode

  • Parallel execution

  • Strict mode

  • Tuning the number of mappers and reducers

  • JVM reuse

  • Adjusting dynamic partition limits

  • Multiple group bys in a single MapReduce job
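
A minimal sketch of the corresponding settings (the property names are standard in Hive 0.x/1.x; the values are illustrative assumptions):

set hive.exec.mode.local.auto=true;        -- local mode for small jobs
set hive.exec.parallel=true;               -- run independent stages in parallel
set hive.mapred.mode=strict;               -- strict mode
set mapred.reduce.tasks=10;                -- number of reducers
set mapred.job.reuse.jvm.num.tasks=10;     -- JVM reuse
set hive.exec.max.dynamic.partitions=1000; -- dynamic partition limit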

File formats and compression

  • Viewing the available codecs

  • Enable intermediate compression
    set hive.exec.compress.intermediate=true

  • Enable output compression
    set hive.exec.compress.output=true

  • The sequence file format
    supports NONE, RECORD, and BLOCK compression (see the sketch below)
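
A minimal sketch combining output compression with a sequence file table, reusing the student table (the codec choice is an illustrative assumption):

set hive.exec.compress.output=true;
set mapred.output.compression.type=BLOCK;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
create table student_seq stored as sequencefile as select * from student;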

Hive functions

Customizing Hive file formats

  • Set a table's file format: stored as <format>;

  • File formats

    • sequencefile

    • rcfile

  • Record formats (a SerDe sketch follows this list)

    • SerDe

    • CSV and TSV

    • XML UDF

    • xpath

    • JSON SerDe
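
A minimal sketch of declaring a SerDe on a table; OpenCSVSerde ships with Hive 0.14+, and it reads every column as a string:

create table student_csv(id string, sname string, age string)
row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
with serdeproperties('separatorChar'=',', 'quoteChar'='"')
stored as textfile;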

Hive Thrift service

  • hive server (hive thrift) is a Thrift-based service that allows remote access from Java, C++, Python, and other languages;

  • Starting hive server

    • bin/hive --service hiveserver &
  • Managing hiveserver

    • use an HTTP proxy to load-balance multiple hiveserver instances
  • hive metastore

    • Starting the metastore

    • bin/hive --service metastore &

Connecting Hive to external storage
  • Hive over HBase
Option 1: create a Hive table backed by an HBase table, so that inserting into the Hive table also writes to the HBase table
  • Create the HBase-backed table
    create table hive_hbase_emp_table(
    empno int,
    ename string comment 'name',
    job string,
    mgr int,
    hiredate string,
    sal double,
    deptno int
    )
    stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
    with serdeproperties("hbase.columns.mapping"=":key,info:ename,info:job,info:mgr,info:hiredate,info:sal,info:deptno")
    TBLPROPERTIES ("hbase.table.name" = "hbase_emp_table");

CREATE TABLE emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
deptno int)
row format delimited fields terminated by ',';

load data local inpath '/data/hive/emp.txt' into table emp;

insert into table hive_hbase_emp_table select * from emp;

Note: LOAD DATA cannot target the HBase-backed table directly; attempting it fails with:
SemanticException [Error 10101]: A non-native table cannot be used as target for LOAD

Option 2:
An HBase table hbase_emp_table already exists; create an external Hive table that maps to it,
so the data stored in HBase can be analyzed through Hive.
create external table hive_crs_data_report(
rowkey string comment 'row key',
params string comment 'request parameters',
timestamp string comment 'timestamp',
dataClass string comment 'data class',
data string comment 'data'
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties("hbase.columns.mapping"=":key,cf1:params,cf1:timestamp,cf1:dataClass,cf1:data")
TBLPROPERTIES ("hbase.table.name" = "crs_data_report_test_able");
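
The mapped table can then be queried like any native Hive table, e.g.:

select rowkey, dataClass, data from hive_crs_data_report limit 10;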

Hive authorization and locks
  • Since Hive 0.10, authorization is based on the metastore; earlier versions relied on Linux users and groups

  • Before using metastore-based authorization, two parameters must first be set in hive-site.xml (a grant sketch follows below):

    • hive.security.authorization.enabled: enables authorization checking; defaults to false

    • hive.security.authorization.createtable.owner.grants: grants the creator of a table full privileges on that table
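
A minimal sketch of the legacy grant syntax, assuming authorization has been enabled and reusing the student table from earlier:

grant select on table student to user wuwc;
show grant user wuwc on table student;
revoke select on table student from user wuwc;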

Integrating Hive with Oozie
HCatalog