-- Create and switch to the working database.
-- "if not exists" makes the script re-runnable, consistent with the
-- table DDL below (L5/L8/L11 all guard with "if not exists").
create database if not exists db_0625;
use db_0625;
-- Create a managed (internal) table: dropping it also deletes its data.
create table if not exists t_0625(
    sid   int,
    sname string
)
row format delimited fields terminated by ','
stored as textfile;
-- Create an external table: Hive only tracks the metadata; dropping the
-- table leaves the files at the HDFS location untouched.
create external table if not exists t_0625_ext(
    pageid   int,
    page_url string comment 'URL'
)
row format delimited fields terminated by ','
-- fixed typo: host was "localtion"; "localhost" matches the namenode URI
-- used by the other HDFS paths in this file
location 'hdfs://localhost:9000/user/hive/warehouse/';
-- Create a partitioned table; "ds" is a partition column and is stored
-- in the directory layout, not inside the data files.
create table if not exists invites(
    id   int,
    name string
)
partitioned by (ds string)
row format delimited
    fields terminated by ','
    lines terminated by '\n'
stored as textfile;
-- Create a bucketed (clustered) table: rows are hashed on "id" into 2
-- buckets, and rows inside each bucket are kept sorted on "age".
-- fixed: "stored by(age)" is not valid HiveQL — the per-bucket sort
-- clause is "sorted by(age)" ("stored by" is only for storage handlers).
create table student(
    id   int,
    age  int,
    name string
)
partitioned by (stat_date string)
clustered by (id) sorted by (age) into 2 buckets
row format delimited fields terminated by ',';
-- Add partitions ("TableName" is a placeholder; several partitions may
-- be added in one statement)
alter table TableName add partition(part='a') partition(part='b');
-- Rename a table
alter table TableName rename to NewTableName;
-- Add / replace columns (templates; note the column spec is
-- "name type" separated by a space, not a comma)
alter table TableName add columns (col_name data_type);
-- REPLACE drops all existing columns and substitutes the new list
alter table TableName replace columns (col_name data_type);
-- Rename and/or retype a single column (statement was truncated in the
-- original notes; completed to the full CHANGE COLUMN form)
alter table TableName change column col_old_name col_new_name data_type;
-- Load data into the bucketed "student" table (templates).
-- "local" reads from the client filesystem; without it the path is HDFS
-- and the source file is MOVED into the warehouse directory.
-- relative local path (resolved against the client's working directory)
load data local inpath 'a.txt' into table student partition(stat_date='20180630');
-- absolute local path
load data local inpath '/home/hadoop/a.txt' into table student partition(stat_date='20180630');
-- load from HDFS (no "local"; statement was truncated in the notes,
-- completed here as a full template)
load data inpath 'hdfs://localhost:9000/user/a.txt' into table student partition(stat_date='20180630');
-- "overwrite" replaces any existing data in the target partition
load data local inpath 'a.txt' overwrite into table student partition(stat_date='20180630');
-- insert
-- 基本插入模式 (basic insert)
-- 多插入模式 (multi-insert: one scan feeding several targets)
-- 自动分区模式 (dynamic partitioning; original said "模块", a typo for "模式")
-- 导出 (export query results to a directory)
-- export to a local filesystem directory
insert overwrite local directory '/home/hadoop/' select * from student;
-- export to HDFS; fixed placeholder path — 'hdfs:..' was not a valid URI
insert overwrite directory 'hdfs://localhost:9000/user/' select * from student;
-- select
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY| ORDER BY col_list]
]
[LIMIT number]
1、order by 会对输入做全局排序,因此只有一个reducer,会导致当输入规模较大时,需要较长的计算时间。
2、sort by不是全局排序,其在数据进入reducer前完成排序。因此,如果用sort by进行排序,并且设置mapred.reduce.tasks>1,则sort by只保证每个reducer的输出有序,不保证全局有序。
3、distribute by(字段)根据指定的字段将数据分到不同的reducer,且分发算法是hash散列。
4、Cluster by(字段) 除了具有Distribute by的功能外,还会对该字段进行排序。
-- join
join_table:
table_reference JOIN table_factor [join_condition]
| table_reference {LEFT|RIGHT|FULL} [OUTER] JOIN table_reference join_condition
| table_reference LEFT SEMI JOIN table_reference join_condition
Hive 支持等值连接(equality joins)、外连接(outer joins)和左/右连接(left/right joins)。
Hive 不支持非等值的连接,因为非等值连接非常难转化到 map/reduce 任务。
另外,Hive 支持多于 2 个表的连接。
-- 内置函数
create table dual(id string);
load data local inpath '/Users/yuanhua/t_0625.txt' into table dual;
select substr('helloworld',2,3) from dual;
-- List every available built-in function
show functions;
-- Show a function's description
-- fixed: the command is "desc function" (singular); "desc functions" is
-- not valid — only "show functions" uses the plural
desc function concat;
-- Show a function's extended usage and examples
desc function extended concat;
-- Tab-delimited demo table for the CASE expression below.
create table employee(
    id    string,
    money double,
    type  string
)
row format delimited
    fields terminated by '\t'
    lines terminated by '\n'
stored as textfile;
load data local inpath '/Users/yuanhua/b.txt' into table employee;
-- Map id codes to display labels with a searched CASE expression
select
    case
        when id='1001' then 'v1001'
        when id='1002' then 'v1002'
        else 'v1003'
    end
from employee;
-- hive、sparksql
1.列转行 lateral view explode(split(column, ',')) num
2.行转列 concat_ws(',',collect_set(column))
说明:collect_list 不去重,collect_set 去重。 column 的数据类型要求是 string
-- 窗口函数:rank() dense_rank() row_number()
rank() OVER(PARTITION BY advus_usid ORDER BY advrr_rechargetime DESC) as rk
dense_rank() OVER(PARTITION BY advus_usid ORDER BY advrr_rechargetime DESC) as drk
row_number() over (distribute by advus_usid sort BY advrr_rechargetime desc) as rn
三者区别如下表,按相同name的num排序
name num rank dense_rank row_number
yuanhua 6 1 1 1
yuanhua 2 2 2 2
yuanhua 2 2 2 3
yuanhua 1 4 3 4
-- datediff,date_add、date_sub
1.日期比较函数: datediff语法: datediff(string enddate,string startdate)
返回值: int
说明: 返回结束日期减去开始日期的天数。
select datediff('2016-12-30','2016-12-29'); # 1
2.日期增加函数: date_add语法: date_add(string startdate, int days)
返回值: string
说明: 返回开始日期startdate增加days天后的日期。
select date_add('2016-12-29',10); # 2017-01-08
3.日期减少函数: date_sub语法: date_sub (string startdate,int days)
返回值: string
说明: 返回开始日期startdate减少days天后的日期。
select date_sub('2016-12-29',10); # 2016-12-19