Marsm的博客

每天进步一点点

hive基本语法

1.Hive中数据库的基本操作

1.1 Create/Drop

CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
  [COMMENT database_comment]
  [LOCATION hdfs_path]
  [WITH DBPROPERTIES (property_name=property_value, ...)];
  • create database
create database db_hive_01;
## 标准
create database if not exists db_hive_02;    
##创建数据库并制定存放目录
create database if not exists db_hive_03 
location '/user/weblog/hive/warehouse/db_hive_03.db';
  • 查看数据库
show databases;
## 查看有几个数据库
show databases like 'db_hive*'; 
desc database db_hive_01;
  • 删除数据库
drop database db_hive_01;
## 数据库中有表用cascade级联删除
drop database  db_hive_01 cascade; 

1.2 create table

官方建表语法:
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name    -- (Note: TEMPORARY available in Hive 0.14.0 and later)
  [(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [SKEWED BY (col_name, col_name, ...)                  -- (Note: Available in Hive 0.10.0 and later)]
     ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
     [STORED AS DIRECTORIES]
  [
   [ROW FORMAT row_format] 
   [STORED AS file_format]
     | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]  -- (Note: Available in Hive 0.6.0 and later)
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (property_name=property_value, ...)]   -- (Note: Available in Hive 0.6.0 and later)
  [AS select_statement];   -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)

CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path];

data_type
  : primitive_type
  | array_type
  | map_type
  | struct_type
  | union_type  -- (Note: Available in Hive 0.7.0 and later)

primitive_type
  : TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  | DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
  | STRING
  | BINARY      -- (Note: Available in Hive 0.8.0 and later)
  | TIMESTAMP   -- (Note: Available in Hive 0.8.0 and later)
  | DECIMAL     -- (Note: Available in Hive 0.11.0 and later)
  | DECIMAL(precision, scale)  -- (Note: Available in Hive 0.13.0 and later)
  | DATE        -- (Note: Available in Hive 0.12.0 and later)
  | VARCHAR     -- (Note: Available in Hive 0.12.0 and later)
  | CHAR        -- (Note: Available in Hive 0.13.0 and later)

array_type
  : ARRAY < data_type >

map_type
  : MAP < primitive_type, data_type >

struct_type
  : STRUCT < col_name : data_type [COMMENT col_comment], ...>

union_type
   : UNIONTYPE < data_type, data_type, ... >  -- (Note: Available in Hive 0.7.0 and later)

row_format
  : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
        [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
        [NULL DEFINED AS char]   -- (Note: Available in Hive 0.13 and later)
  | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]

file_format:
  : SEQUENCEFILE
  | TEXTFILE    -- (Default, depending on hive.default.fileformat configuration)
  | RCFILE      -- (Note: Available in Hive 0.6.0 and later)
  | ORC         -- (Note: Available in Hive 0.11.0 and later)
  | PARQUET     -- (Note: Available in Hive 0.13.0 and later)
  | AVRO        -- (Note: Available in Hive 0.14.0 and later)
  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname

constraint_specification:
  : [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE ]
    [, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE
  • 内部表的创建
##创建表有三种方法:
create table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)
row format delimited fields TERMINATED by '\001'
stored as textfile;

-- 或者使用select创建
##该方式及复制表结构又加载数据
##注意:该方法创建的表无分区,select的表中的分区字段会被加载到目标表中
create table if not exists default.student_tmp
select sno,sname,sex,sage from default.student;

## 或者使用like创建表
create table if not exists default.student_like
like default.student;
  • 外部表的创建
    • 数据文件存储在指定的hdfs目录下,删除表数据不会从文件系统中删除
create external table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)
row format delimited fields TERMINATED by '\001'
location 'hdfs_location';
  • 创建分区表
    • 分区表实际上就是对应一个HDFS文件系统上的独立的文件夹,该文件夹下是该分区所有的数据文件。Hive中的分区就是分目录,把一个大的数据集根据业务需要分割成更小的数据集。
    • 在查询是通过where子句中的表达式来选择查询所需要的指定的分区,这样的查询效率会提高很多。
create external table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)partitioned by(prov_id string)
row format delimited fields TERMINATED by '\001'
location 'hdfs_location';
  • 分桶排序表
create external table if not exists default.student(
sno string comment '学号',
sname string,
sex string,
sage int,
sdept string comment '所在班级'
)partitioned by(prov_id string)
clustered by(sno) sorted by(sage) into 4 buckets
row format delimited fields TERMINATED by '\001'
location 'hdfs_location';

2. hive数据加载或导出

2.1 hive表加载数据

测试数据:

emp表
7369    SMITH   CLERK   7902    1980-12-17  800     20
7499    ALLEN   SALESMAN    7698    1981-2-20   1600    300 30
7521    WARD    SALESMAN    7698    1981-2-22   1250    500 30
7566    JONES   MANAGER 7839    1981-4-2    2975        20
7654    MARTIN  SALESMAN    7698    1981-9-28   1250    1400    30
7698    BLAKE   MANAGER 7839    1981-5-1    2850        30
7782    CLARK   MANAGER 7839    1981-6-9    2450        10
7788    SCOTT   ANALYST 7566    1987-7-13   3000        20
7839    KING    PRESIDENT       1981-11-17  5000        10
7844    TURNER  SALESMAN    7698    1981-9-8    1500    0   30
7876    ADAMS   CLERK   7788    1987-7-13   1100        20
7900    JAMES   CLERK   7698    1981-12-3   950     30
7902    FORD    ANALYST 7566    1981-12-3   3000        20
7934    MILLER  CLERK   7782    1982-1-23   1300        10
LOAD DATA [LOCAL] INPATH 'filepath' 
[OVERWRITE] INTO TABLE tablename 
[PARTITION (partcol1=val1, partcol2=val2 ...)]
  • load data [local] inpath ‘filepath’ [overwrite] into table tablename partition (partcol1=val1,…);
  • 原始文件的存储位置:[local]
    • 本地 local
    • hdfs
  • 对表的数据是否覆盖:[overwrite]
    • 覆盖 overwrite
    • 追加
  • 分区表加载,特殊性

    • partition (partcol1=val1,…)
  • 加载文件到hive表:

## 加载本地文件到hive表
load data local inpath '/home/hadoop/hivedata/emp.txt' into table default.emp;

## 加载HDFS文件到Hive
load data inpath '/user/weblog/datas/emp.txt' into table default.emp;

## 加载数据并覆盖表中已有的数据
load data inpath '/user/weblog/datas/emp.txt' overwrite into table default.emp;

##通过inster加载
insert [overwrite] into table default.emp select * from default.emp

##注意:以上方法在内部表中使用,外部表一般在创建表的时候就在location中指定好了数据路径;

2.2 hive表导出数据

##导出数据到本地目录
insert overwrite local directory '/home/hadoop/hive_emp' 
select * from default.emp;

##导出数据到本地目录,并按指定分隔符分隔字段
insert overwrite local directory '/home/hadoop/hive_emp'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' COLLECTION ITEMS TERMINATED BY '\n'
select * from default.emp;

##查询结果重定向到指定文件
hive -e 'select * from default.emp;' > /home/hadoop/hive_emp/emp.txt

3. Hive中的查询语句

SELECT [ALL | DISTINCT] select_expr, select_expr, ...
  FROM table_reference
  [WHERE where_condition]
  [GROUP BY col_list]
  [ORDER BY col_list]
  [CLUSTER BY col_list
    | [DISTRIBUTE BY col_list] [SORT BY col_list]
  ]
 [LIMIT [offset,] rows]
  • max、min、count、sum、avg
select count(*) cnt from emp;
select max(sal) max_sal from emp;
select min(sal) min_sal from emp;
select sum(sal) from emp;
select avg(sal) from emp;
  • group by having
##分组:
##每个部门的平均工资
select deptno,avg(sal) avg_sal from emp group by deptno ;
##每个部门中每个岗位的最高薪水
select deptno,job,max(sal) max_sal from emp group by deptno,job;
##每个部门的平均薪资大于2000的部门
select deptno,avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;

4. hive查看数据库及表信息

  • 查看数据库
show databases;
  • 查看某个数据库中所有的表
use database;
show tables;
  • 查看表结构
##查看表字段信息
desc tablename;

##查看表详细信息
desc formatted tablename;

##查看表的建表语句
show create table tablename;

5. Order, Sort, Cluster, and Distribute By区别

  • order by 全局数据的排序,仅仅只有一个reduce
select * from emp order by empno desc;
  • sort by 对每个reduce内部数据进行排序,对全局的结果集不是排序的
##实现reduce内部排序先要设置reduce task个数
set mapreduce.job.reduces=<number>
select * from emp sort by empno asc;
##将结果集写入到本地文件中查看效果
insert overwrite local directory '/home/hadoop/hivedata/sortby' 
select * from emp sort by empno asc;
  • distribute by
    • 类似于MapReduce中分区partition,对数据进行分区,结合soat by 进行使用
insert overwrite local directory '/home/hadoop/hivedata/distby' 
select * from emp distribute by deptno sort by empno asc;
##注意事项:distribute by 必须要在 sort by 前面
  • cluster by 当distribute by和sort by字段相同时,可以使用cluster by;
insert overwrite local directory '/home/hadoop/hivedata/clusterby' 
select * from emp cluster by empno;
阅读更多
个人分类: hive
想对作者说点什么? 我来说一句

Hive用户指南

2015年03月26日 499KB 下载

没有更多推荐了,返回首页

加入CSDN,享受更精准的内容推荐,与500万程序员共同成长!
关闭
关闭