建表1 使用默认配置:
-- Create table test1 with default configuration.
-- NOTE: HiveQL line comments use "--"; the original "//" comments are a syntax error.
create table test1
(
    userid    string,
    stat_date string,
    tryvv     int,
    sucvv     int,
    ptime     float
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' -- required: delimiter between columns
STORED AS textfile                             -- optional: file format; textfile is the default
location '/testdata/';
建表2,指定文件存储格式。
-- Create table test2 with an explicit storage format.
create table test2
(
    userid    string,
    stat_date string,
    tryvv     int,
    sucvv     int,
    ptime     float
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS rcfile; -- columnar RCFile instead of the default textfile ("//" is not valid Hive comment syntax)
建表3:指定存储路径:
-- Create table test3, pinning its data directory to an explicit HDFS path.
CREATE TABLE test3 (
    userid    STRING,
    stat_date STRING,
    tryvv     INT,
    sucvv     INT,
    ptime     FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/test/';
建表4:比较常用
-- Create-table-as-select (CTAS): schema and data both come from the query.
CREATE TABLE dianxin_as_S AS
SELECT *
FROM dianxin_503
LIMIT 10;
注意:CREATE TABLE ... AS SELECT 不允许创建外部表(新表只能是内部表)。
分区:
概念:就是hive表进行再一次分级。相当于目录下的子目录,子目录的名称以分区名称命名。可以避免
全表扫描,提高查询效率。
用法:一般企业里hive表,基本都是分区表。
建表语句:
-- Two-level partitioned table: with several partition columns they nest
-- left-to-right, each becoming one directory level (dt=.../country=...).
CREATE TABLE page_view(viewTime INT, ip STRING )
PARTITIONED BY (dt STRING, country STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'; -- added terminator: the statement originally ran into the next CREATE
-- Single partition column variant; dt is the partition column.
-- NOTE(review): page_view is also created above — running both as-is fails;
-- drop the first table or add IF NOT EXISTS.
CREATE TABLE page_view(viewTime INT, ip STRING )
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001';
-- Manually add an (initially empty) partition ("//" comments are invalid in Hive; use "--").
alter table page_view add partition(dt='20181211');
-- Populate a partition by selecting from another table.
insert into table page_view partition(dt='20181012')
select cast(city_id as int), city_name
from shangdan.city_id;
-- Partitions are conventionally named after dates.
动态分区:
-- Dynamic partition insert: the last SELECT column (date_time) supplies the dt value.
INSERT INTO TABLE anhui_air PARTITION (dt)
SELECT part, pm25, date_time
FROM anhui_air2;
开启后,查询的语句中最后的对应的字段作为动态分区的字段,会按照这个字段进行分区,相同的值会放到同一个分区
hive加载数据:加载数据不会校验数据格式,实际上叫读时模式。当查询的时候才会校验。
1. 从hdfs 上加载数据: load data inpath '/data/dianxin_data' [overwrite] into table dianxin_p partition(dt='20181210')
加载示例:load data inpath '/data/dianxin_data' into table dianxin_p partition(dt='20181210')
注意:这种方式加载数据,是将数据移动到表所在的hdfs目录下。
-- Partitioned (by dt) tab-delimited text table for telecom stay-point records.
CREATE TABLE dianxin_p (
    mdn             string COMMENT '用户手机号码',
    grid_id         string COMMENT '停留点所在电信内部网格号',
    city_id         string COMMENT '业务发生城市id',
    county_id       string COMMENT '停留点区县',
    duration        string COMMENT '机主在停留点停留的时间长度(分钟),lTime-eTime',
    grid_first_time string COMMENT '网格第一个记录位置点时间(秒级)',
    grid_last_time  string COMMENT '网格最后一个记录位置点时间(秒级)',
    day_id          string COMMENT '天分区'
)
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS textfile;
2 从本地加载数据
-- Load from the local filesystem (the file is copied into the table directory).
load data local inpath '/usr/local/testdata/dianxin_data' into table dianxin_p partition(dt='20181211'); -- added missing terminator
3从另外一个表加载数据:
表对表加载:
-- Table-to-table loading: CTAS to create-and-fill, INSERT to (re)populate.
create table IF NOT EXISTS dianxin_test2 as select * from dianxin_test; -- added missing terminator
-- Appends rows; to replace existing data use INSERT OVERWRITE TABLE ... instead —
-- the original "insert [overwrite] into" is not valid HiveQL (OVERWRITE and INTO are exclusive).
insert into table dianxin_test2 select * from dianxin_test;
4 建表指定存储位置
-- Unpartitioned variant of the stay-point table, stored at an explicit HDFS path.
CREATE TABLE dianxin_l (
    mdn             string COMMENT '用户手机号码',
    grid_id         string COMMENT '停留点所在电信内部网格号',
    city_id         string COMMENT '业务发生城市id',
    county_id       string COMMENT '停留点区县',
    duration        string COMMENT '机主在停留点停留的时间长度(分钟),lTime-eTime',
    grid_first_time string COMMENT '网格第一个记录位置点时间(秒级)',
    grid_last_time  string COMMENT '网格最后一个记录位置点时间(秒级)',
    day_id          string COMMENT '天分区'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS textfile
LOCATION '/bigdata/';
注意 location 只能是文件目录,不能是具体的文件。
数据类型:
-- Demonstrates the DATE and TIMESTAMP column types.
create table test5
(
    date_time  date,      -- fixed typo: was "date_tine"
    time_stamp Timestamp  -- fixed typo: was "time_stasm"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;
外部表和内部表:
-- External table: DROP TABLE removes only the metadata; data at /exdata/ survives.
create EXTERNAL table test_EX
(
    date_time  date,      -- fixed typo: was "date_tine"
    time_stamp Timestamp  -- fixed typo: was "time_stasm"
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
location '/exdata/';
外部表:删除表,数据不会删除。内部表删除表,数据会删除。
外部表其他功能:可以映射外部数据源。如hbase,es。
如:
-- Hive-over-HBase mapping: an external table whose reads/writes go through the
-- HBase storage handler instead of HDFS files; dropping it leaves the HBase
-- table named in TBLPROPERTIES intact.
CREATE EXTERNAL TABLE hive_oss_user_label_action_data(
key string,
monthno string,
usernumber string,
labelno string,
labelvalue string,
provcode string,
areacode string,
cardtype string,
extstring string,
createtime string,
modifytime string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES
-- One mapping entry per Hive column, in declaration order: ":key" is the HBase
-- row key; the remaining columns map into the "info" column family.
("hbase.columns.mapping" =
":key,info:monthno,info:usernumber,info:labelno,info:labelvalue,info:provcode,info:areacode,info:cardtype,info:extstring,info:createtime,info:modifytime")
TBLPROPERTIES("hbase.table.name" = "oss_user_label_action_data");
JOIN:
左连会按照左表匹配关联,不管是否能关联上,左边全部输出,没有匹配上的右边表,为null。
-- NOTE(review): the IS NOT NULL filter on the right table's join key discards
-- exactly the unmatched rows a LEFT JOIN preserves, so this behaves like an
-- INNER JOIN — it returns only rows of A that have a match in B.
select a.* from A a left join B b on a.id=b.id where b.id is not null
排序:
-- DISTRIBUTE BY hashes rows to reducers by city_id; SORT BY orders rows only
-- within each reducer (no total order across the full result).
select * from dianxin_test distribute by city_id sort by duration;
去重:
-- Deduplicate city_id values; LIMIT caps how many distinct values come back.
select distinct city_id from country_info_table limit 100;
函数:
if函数:if(条件表达式,如果条件成立返回值,如果不成立返回值)
-- Map the sentinel duration '0' to NULL (e.g. so aggregates will skip it).
select mdn,if(duration='0',null,duration) from dianxin_test;
case when函数:
-- CASE WHEN form of the same transformation: duration '0' becomes NULL.
select mdn,case when duration='0' then null else duration end from dianxin_test;
-- Remap one specific mdn to the bucket '352781814' and average duration per
-- resulting group; the GROUP BY must repeat the full SELECT expression.
-- Fixed: a hard line wrap had split the identifier "dianxin_test" in two,
-- making the statement unparsable; also added the terminating semicolon.
select if(mdn='47BE1E866CFC071DB19D5E1C056BE28AE24C16E7','352781814',city_id) as num,
       avg(duration)
from dianxin_test
group by if(mdn='47BE1E866CFC071DB19D5E1C056BE28AE24C16E7','352781814',city_id);
-- Word count: explode the space-split line into one row per word, then count per word.
单词统计sql:select w.word,count(*) from (select explode(split(line,' ')) as word from wc_test) w group by w.word;
窗口函数之 row_number() over 作用:给每个分组数据增加行号。一般用于求组内topN。
需求:求每个分组内按某字段排序的前 N 名(组内 TopN):
-- Per-group top-N: number the rows inside each mdn group by descending
-- duration, then keep the first two of every group.
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY mdn ORDER BY duration DESC) AS rn
    FROM dianxin_test
) w
WHERE w.rn <= 2;
列转行(explode:把一列中的集合拆成多行):
-- Explode the weight collection: one output row per element, each paired with name.
SELECT name,new_num FROM laterv_tb LATERAL VIEW explode(weight) num AS new_num;
行转列(collect_set:把多行聚合成一行的集合,且去重):
-- Aggregate all city_id values into a single deduplicated array.
select collect_set(city_id) from country_info_table;
collect_list不去重
解决数据倾斜方法之一:
-- Baseline per-city count; skews badly when one city_id dominates the data.
select city_id,count(*) from data_qx group by city_id;
解决方法sql:
-- Two-stage aggregation to break skew on the hot key '83401': salt the hot
-- key's rows with a random hash so they spread across reducers, pre-aggregate
-- per (city_id, salt), then sum the partial counts per city_id.
-- Fixed: removed the stray double semicolon and aliased the salt expression.
select a.city_id, sum(a.cnt) as cnt
from (
    select if(city_id='83401',hash(rand()),0) as salt,
           city_id,
           count(*) as cnt
    from data_qx
    group by city_id, if(city_id='83401',hash(rand()),0)
) a
group by a.city_id;
可以参考:https://help.aliyun.com/document_detail/51020.html?spm=a2c4g.11186623.6.861.1a064ef8J2CRds