Hive的安装

Laity小白

于 2022-08-22 10:13:01 发布

阅读量389

点赞数

文章标签： hive 数据库 hadoop

本文链接：https://blog.csdn.net/m0_51557895/article/details/126460093

版权

Hive

Hadoop生态下的数据仓库软件(工具)

Hive 安装

版本选择与Spark2.4.7兼容较好的1.2.2

1. 上传，解压
1. 安装MySQL用来替换hive自带的derby
1. 将mysql-connector-java.jar复制到Hive的lib目录中
1. 修改配置文件

hive-env.sh

# Change into the Hive configuration directory
cd /opt/hive-1.2.2/conf
# Create hive-env.sh as a copy of the template in the same directory
cp hive-env.sh.template hive-env.sh

# Edit hive-env.sh and set the following two variables
HADOOP_HOME=/opt/hadoop-2.7.7
export HIVE_CONF_DIR=/opt/hive-1.2.2/conf

hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>

  <!-- Hive CLI display options: print column headers and show the current database -->
  <property>
    <name>hive.cli.print.header</name>
    <value>true</value>
    <description>hive命令行客户端中开启表头的打印</description>
  </property>
  <property>
    <name>hive.cli.print.current.db</name>
    <value>true</value>
    <description>Hive命令行客户端显示当前使用的库</description>
  </property>

  <!-- Metastore connection: JDBC URL, driver class, user name and password of the MySQL database -->
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://hadoop01:3306/metastore?createDatabaseIfNotExist=true</value>
    <description>设置存储Hive元数据的mysql的链接</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>设置Driver类</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>mysql用户名</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
    <description>mysql密码</description>
  </property>

</configuration>

1. 配置环境变量

# Append HIVE_HOME to /etc/profile and put Hive's bin directory at the front of PATH.
# The current directory (.) is intentionally NOT added to PATH: with it in a
# system-wide profile, a program sitting in whatever directory the user is in
# could be executed in place of the intended command.
echo 'export HIVE_HOME=/opt/hive-1.2.2' >> /etc/profile
echo 'export PATH=$HIVE_HOME/bin:$PATH' >> /etc/profile
# Reload the profile so the current shell picks up the new variables
source /etc/profile

1. 初始化Hive的metastore库

# Initialize the Hive metastore schema, with mysql given as the database type
schematool -initSchema -dbType mysql

1. 启动

# Start Hive
hive

Hive 的使用

DDL

库的操作

-- List all databases
SHOW DATABASES;
-- List only the databases whose names match the pattern
SHOW DATABASES LIKE 'db_*';
-- Inspect a database: basic information
DESCRIBE DATABASE db_hive;
-- Inspect a database: detailed information (EXTENDED)
DESCRIBE DATABASE EXTENDED db_hive;
-- Switch to a database
USE db_hive;
-- Drop an empty database
DROP DATABASE db_hive2;
-- The database may not exist, so it is best to guard the drop with IF EXISTS
DROP DATABASE IF EXISTS db_hive2;
-- A database that is not empty can be force-dropped with CASCADE
DROP DATABASE db_hive CASCADE;

表的操作

-- CREATE TABLE grammar template (square brackets mark optional parts).
-- EXTERNAL: external table; IF NOT EXISTS: existence check
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
-- Column definitions; data_type is a Hive data type
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
-- Partition columns
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
-- Bucketed table
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
-- Row format clause
[ROW FORMAT row_format]
-- The lines below spell out the row format options
ROW FORMAT
-- Character that terminates each field, e.g. '\t'
DELIMITED [FIELDS TERMINATED BY char]
-- Character that terminates each element of a collection
[COLLECTION ITEMS TERMINATED BY char]
-- Character that terminates each map key
[MAP KEYS TERMINATED BY char]
-- Character that terminates each line, e.g. '\n'
[LINES TERMINATED BY char]
-- Alternative: another serialization/deserialization method (SerDe), e.g. for linking to an HBase table
   | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
-- File storage format
-- textfile is used by default
-- orc and parquet are columnar storage formats
[STORED AS file_format]
-- Path of the table, specified at creation time
[LOCATION hdfs_path]

对比不同格式的文件大小和查询速度

-- Create a table with no STORED AS clause (text format)
CREATE TABLE text_log (line string);
-- Create a table stored as ORC
CREATE TABLE orc_log (line string)
STORED AS orc;
-- Load the local log data into the text table
LOAD DATA LOCAL INPATH '/root/logs' INTO TABLE text_log;
-- Populate the ORC table from the text table
INSERT INTO orc_log SELECT line FROM text_log;
-- Check the size of each table's files on HDFS.
-- `dfs ...;` is the Hive CLI form of `hdfs dfs ...`, so these two lines can be
-- run in the same Hive session as the statements around them.
dfs -du -h -s /user/hive/warehouse/db3.db/text_log;
dfs -du -h -s /user/hive/warehouse/db3.db/orc_log;
-- Run the same query against each table to compare speed (shown here for db3.orc_log).
-- Each log line is split on single spaces; field 0 is used as the ip, field 8 is
-- compared with '200', and field 9 and the last field are summed as up and down.
SELECT
    parsed.ip             AS ip,
    SUM(parsed.rcode_200) AS rcode_200,
    SUM(parsed.up)        AS up,
    SUM(parsed.down)      AS down
FROM (
    SELECT
        log_parts.strs[0]                                        AS ip,
        IF(log_parts.strs[8] = '200', 1, 0)                      AS rcode_200,
        CAST(log_parts.strs[9] AS bigint)                        AS up,
        CAST(log_parts.strs[size(log_parts.strs) - 1] AS bigint) AS down
    FROM (
        SELECT split(line, ' ') AS strs
        FROM db3.orc_log
    ) log_parts
) parsed
GROUP BY
    parsed.ip;

DML

数据的导入
将数据插入或者批量导入到hive的表中
1. load

load data [local] inpath '/root/data/student.txt' [overwrite] into table student [partition (partcol1=val1,…)];
load data:表示加载数据
local:表示从本地加载数据到hive表；否则从HDFS加载数据到hive表
inpath:表示加载数据的路径
into table:表示加载到哪张表
student:表示具体的表
overwrite:表示覆盖表中已有数据，否则表示追加
partition:表示上传到指定分区

1. insert

通过查询语句向表中插入数据（Insert）
创建一张分区表
-- Tab-delimited table partitioned by month
CREATE TABLE student (
    id   string,
    name string
)
PARTITIONED BY (month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
基本插入数据
-- Insert one row into partition month='202005'.
-- String literals are delimited with plain ASCII single quotes on both sides.
INSERT INTO TABLE student PARTITION (month = '202005') VALUES ('01', 'amos');
基本模式插入（根据单张表查询结果）
-- Overwrite partition month='202004' with the rows selected from partition month='202005'.
-- The partition value is delimited with plain ASCII single quotes on both sides.
INSERT OVERWRITE TABLE student PARTITION (month = '202004')
SELECT id, name
FROM student
WHERE month = '202005';
多插入模式（根据多张表查询结果）
-- Multi-insert: one FROM clause feeds two INSERT OVERWRITE statements,
-- each writing the month='202005' rows into a different partition
FROM student
INSERT OVERWRITE TABLE student PARTITION (month = '202001')
    SELECT id, name
    WHERE month = '202005'
INSERT OVERWRITE TABLE student PARTITION (month = '202002')
    SELECT id, name
    WHERE month = '202005';

1. as select 使用select语句作为表结构，直接创建表并插入数据

-- Create student3 from a query: the SELECT supplies both the columns and the rows
CREATE TABLE IF NOT EXISTS student3 AS
SELECT id, name
FROM student;

1. 直接操作HDFS
  需要注意Hive默认使用\u0001作为字段分隔符，
  所以如果上传的文件不是使用\u0001分隔，
  需要在建表时提前设置row format中的字段分隔符，
  否则可能只有第一列有数据，其他列都是NULL

创建表时通过Location指定加载数据路径
创建表，并指定在hdfs上的位置
 -- Tab-delimited table whose HDFS path is set explicitly with LOCATION
 CREATE TABLE IF NOT EXISTS student5 (
     id   int,
     name string
 )
 ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
 LOCATION '/user/hive/warehouse/student5';
上传数据到hdfs上
-- Upload the data file to /user/hive/warehouse/student5 on HDFS
dfs -put /root/data/student.txt  /user/hive/warehouse/student5;
查询数据
-- Query the table to check the uploaded data
SELECT *
FROM student5;