建表1 使用默认配置:
-- Create table test1 with default configuration.
-- NOTE: HiveQL line comments use "--"; the original "//" comments are a syntax error.
create table test1
(
    userid    string,
    stat_date string,
    tryvv     int,
    sucvv     int,
    ptime     float
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' -- required: delimiter between columns
STORED AS textfile                             -- optional: file format; textfile is the default
location '/testdata/';
建表2,指定文件存储格式。
-- Create table test2 with an explicit storage format.
create table test2
(
    userid    string,
    stat_date string,
    tryvv     int,
    sucvv     int,
    ptime     float
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS rcfile; -- columnar RCFile instead of the default textfile ("//" is not valid Hive comment syntax)
建表3:指定存储路径:
-- Create table test3, pinning its data directory to an explicit HDFS path.
CREATE TABLE test3 (
    userid    STRING,
    stat_date STRING,
    tryvv     INT,
    sucvv     INT,
    ptime     FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/test/';
建表4:比较常用
-- Create-table-as-select (CTAS): schema and data both come from the query.
CREATE TABLE dianxin_as_S AS
SELECT *
FROM dianxin_503
LIMIT 10;
注意:CREATE TABLE ... AS SELECT 不允许创建外部表(新表只能是内部表)。
分区:
概念:就是hive表进行再一次分级。相当于目录下的子目录,子目录的名称以分区名称命名。可以避免
全表扫描,提高查询效率。
用法:一般企业里hive表,基本都是分区表。
建表语句:
-- Two-level partitioned table: with several partition columns they nest
-- left-to-right, each becoming one directory level (dt=.../country=...).
CREATE TABLE page_view(viewTime INT, ip STRING )
PARTITIONED BY (dt STRING, country STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'; -- added terminator: the statement originally ran into the next CREATE
-- Single partition column variant; dt is the partition column.
-- NOTE(review): page_view is also created above — running both as-is fails;
-- drop the first table or add IF NOT EXISTS.
CREATE TABLE page_view(viewTime INT, ip STRING )
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001';
-- Manually add an (initially empty) partition ("//" comments are invalid in Hive; use "--").
alter table page_view add partition(dt='20181211');
-- Populate a partition by selecting from another table.
insert into table page_view partition(dt='20181012')
select cast(city_id as int), city_name
from shangdan.city_id;
-- Partitions are conventionally named after dates.
动态分区:
-- Dynamic partition insert: the last SELECT column (date_time) supplies the dt value.
INSERT INTO TABLE anhui_air PARTITION (dt)
SELECT part, pm25, date_time
FROM anhui_air2;
开启后,查询的语句中最后的对应的字段作为动态分区的字段,会按照这个字段进行分区,相同的值会放到同一个分区
hive加载数据:加载数据不会校验数据格式,实际上叫读时模式。当查询的时候才会校验。
1. 从hdfs 上加载数据: load data inpath '/data/dianxin_data' [overwrite] into table dianxin_p partition(dt='20181210')
加载示例:load data inpath '/data/dianxin_data' into table dianxin_p partition(dt='20181210')
注意:这种方式加载数据,是将数据移动到表所在的hdfs目录下。
-- Partitioned (by dt) tab-delimited text table for telecom stay-point records.
CREATE TABLE dianxin_p (
    mdn             string COMMENT '用户手机号码',
    grid_id         string COMMENT '停留点所在电信内部网格号',
    city_id         string COMMENT '业务发生城市id',
    county_id       string COMMENT '停留点区县',
    duration        string COMMENT '机主在停留点停留的时间长度(分钟),lTime-eTime',
    grid_first_time string COMMENT '网格第一个记录位置点时间(秒级)',
    grid_last_time  string COMMENT '网格最后一个记录位置点时间(秒级)',
    day_id          string COMMENT '天分区'
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS textfile;
2 从本地加载数据
-- Load from the local filesystem (the file is copied into the table directory).
load data local inpath '/usr/local/testdata/dianxin_data' into table dianxin_p partition(dt='20181211'); -- added missing terminator
3从另外一个表加载数据:
表对表加载:
-- Table-to-table loading: CTAS to create-and-fill, INSERT to (re)populate.
create table IF NOT EXISTS dianxin_test2 as select * from dianxin_test; -- added missing terminator
-- Appends rows; to replace existing data use INSERT OVERWRITE TABLE ... instead —
-- the original "insert [overwrite] into" is not valid HiveQL (OVERWRITE and INTO are exclusive).
insert into table dianxin_test2 select * from dianxin_test;
4 建表指定存储位置
-- Unpartitioned variant of the stay-point table, stored at an explicit HDFS path.
CREATE TABLE dianxin_l (
    mdn             string COMMENT '用户手机号码',
    grid_id         string COMMENT '停留点所在电信内部网格号',
    city_id         string COMMENT '业务发生城市id',
    county_id       string COMMENT '停留点区县',
    duration        string COMMENT '机主在停留点停留的时间长度(分钟),lTime-eTime',
    grid_first_time string COMMENT '网格第一个记录位置点时间(秒级)',
    grid_last_time  string COMMENT '网格最后一个记录位置点时间(秒级)',
    day_id          string COMMENT '天分区'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS textfile
LOCATION '/bigdata/';
注意 location 只能是文件目录,不能是具体的文件。
数据类型:
-- Demonstrates the DATE and TIMESTAMP column types.
create table test5
(
    date_time  date,      -- fixed typo: was "date_tine"
    time_stamp Timestamp  -- fixed typo: was "time_stasm"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
外部表和内部表:
-- External table: DROP TABLE removes only the metadata; data at /exdata/ survives.
create EXTERNAL table test_EX
(
    date_time  date,      -- fixed typo: was "date_tine"
    time_stamp Timestamp  -- fixed typo: was "time_stasm"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
location '/exdata/';
外部表:删除表,数据不会删除。内部表删除表,数据会删除。
外部表其他功能:可以映射外部数据源。如hbase,es。
如:
-- Hive-over-HBase mapping: an external table whose reads/writes go through the
-- HBase storage handler instead of HDFS files; dropping it leaves the HBase
-- table named in TBLPROPERTIES intact.
CREATE EXTERNAL TABLE hive_oss_user_label_action_data(
key string,
monthno string,
usernumber string,
labelno string,
labelvalue string,
provcode string,
areacode string,
cardtype string,
extstring string,
createtime string,
modifytime string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES
-- One mapping entry per Hive column, in declaration order: ":key" is the HBase
-- row key; the remaining columns map into the "info" column family.
("hbase.columns.mapping" =
":key,info:monthno,info:usernumber,info:labelno,info:labelvalue,info:provcode,info:areacode,info:cardtype,info:extstring,info:createtime,info:modifytime")
TBLPROPERTIES("hbase.table.name" = "oss_user_label_action_data");
JOIN:
左连会按照左表匹配关联,不管是否能关联上,左边全部输出,没有匹配上的右边表,为null。
-- NOTE(review): the IS NOT NULL filter on the right table's join key discards
-- exactly the unmatched rows a LEFT JOIN preserves, so this behaves like an
-- INNER JOIN — it returns only rows of A that have a match in B.
select a.* from A a left join B b on a.id=b.id where b.id is not null
排序:
-- DISTRIBUTE BY hashes rows to reducers by city_id; SORT BY orders rows only
-- within each reducer (no total order across the full result).
select * from dianxin_test distribute by city_id sort by duration;
去重:
-- Deduplicate city_id values; LIMIT caps how many distinct values come back.
select distinct city_id from country_info_table limit 100;
函数:
if函数:if(条件表达式,如果条件成立返回值,如果不成立返回值)
-- Map the sentinel duration '0' to NULL (e.g. so aggregates will skip it).
select mdn,if(duration='0',null,duration) from dianxin_test;
case when函数:
-- CASE WHEN form of the same transformation: duration '0' becomes NULL.
select mdn,case when duration='0' then null else duration end from dianxin_test;
-- Remap one specific mdn to the bucket '352781814' and average duration per
-- resulting group; the GROUP BY must repeat the full SELECT expression.
-- Fixed: a hard line wrap had split the identifier "dianxin_test" in two,
-- making the statement unparsable; also added the terminating semicolon.
select if(mdn='47BE1E866CFC071DB19D5E1C056BE28AE24C16E7','352781814',city_id) as num,
       avg(duration)
from dianxin_test
group by if(mdn='47BE1E866CFC071DB19D5E1C056BE28AE24C16E7','352781814',city_id);
-- Word count: explode the space-split line into one row per word, then count per word.
单词统计sql:select w.word,count(*) from (select explode(split(line,' ')) as word from wc_test) w group by w.word;
窗口函数之 row_number() over 作用:给每个分组数据增加行号。一般用于求组内topN。
需求:求每个分组内按某字段排序的前 N 名(组内 TopN):
-- Per-group top-N: number the rows inside each mdn group by descending
-- duration, then keep the first two of every group.
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY mdn ORDER BY duration DESC) AS rn
    FROM dianxin_test
) w
WHERE w.rn <= 2;
列转行(explode:把一列中的集合拆成多行):
-- Explode the weight collection: one output row per element, each paired with name.
SELECT name,new_num FROM laterv_tb LATERAL VIEW explode(weight) num AS new_num;
行转列(collect_set:把多行聚合成一行的集合,且去重):
-- Aggregate all city_id values into a single deduplicated array.
select collect_set(city_id) from country_info_table;
collect_list不去重
解决数据倾斜方法之一:
-- Baseline per-city count; skews badly when one city_id dominates the data.
select city_id,count(*) from data_qx group by city_id;
解决方法sql:
-- Two-stage aggregation to break skew on the hot key '83401': salt the hot
-- key's rows with a random hash so they spread across reducers, pre-aggregate
-- per (city_id, salt), then sum the partial counts per city_id.
-- Fixed: removed the stray double semicolon and aliased the salt expression.
select a.city_id, sum(a.cnt) as cnt
from (
    select if(city_id='83401',hash(rand()),0) as salt,
           city_id,
           count(*) as cnt
    from data_qx
    group by city_id, if(city_id='83401',hash(rand()),0)
) a
group by a.city_id;
可以参考:https://help.aliyun.com/document_detail/51020.html?spm=a2c4g.11186623.6.861.1a064ef8J2CRds