建表
-- CTAS template. FIX vs original: TBLPROPERTIES belongs before AS, and the
-- statement takes a single semicolon after the SELECT — the original ended
-- the statement with ';' and left a dangling 'as' on the next line.
create table if not exists tmp.tmp_table_name
TBLPROPERTIES('lifecycle'='90')
as
-- select ... ;   (query goes here, semicolon at the very end)
建分区表
-- Partitioned-table DDL template: typed columns with comments, a table
-- comment, a day partition, parquet storage and a 90-day lifecycle.
CREATE TABLE IF NOT EXISTS dm_dtmart.tmp_table_name (
    scene       STRING COMMENT '场景',
    uploadorder INT    COMMENT '次序',
    driver_id   BIGINT COMMENT '司机ID',
    lon         DOUBLE COMMENT '出发地经度'
)
COMMENT '表中文名'
PARTITIONED BY (day INT)
STORED AS PARQUET
TBLPROPERTIES ('lifecycle' = '90');
分区知识 df dc di
df 每日全量表,有day分区,day取最新分区
dc 最新的全量表,无day分区
di 每日增量表,限定日期,如 day between 20190415 and 20190416;
对于显示为全量表,但是有分区字段day的表格,比如dwd.dwd_user_active_df用户每日活跃表:筛选day=某天后,是否只记录了当天有行为的用户。如果想查询所有用户在某段时间内的行为,day就应该是一段时间了。
dwd.dwd_order_tp_order_df订单明细表,显示是全量表,但是有分区字段day,筛选day=某天后,这个表会包括所有历史订单的信息吗?。
@相关知识
数据格式 parquet/orc
Parquet:Parquet是一种优化的列式存储格式,它支持高效的压缩和列裁剪(Column Pruning),以减少I/O和提高查询性能。它适用于大规模数据分析和数据仓库场景。
ORC(Optimized Row Columnar):ORC是另一种优化的列式存储格式,它结合了行式存储和列式存储的优点,提供了更高的压缩率和查询性能。它适用于Hive中大型表和复杂查询的性能优化。
可变分区
-- Dynamic-partition example: create the target table, enable dynamic
-- partitioning, then INSERT with the partition columns listed LAST in the
-- SELECT (day, event_name) in the same order as the PARTITION clause.
create table if not exists dm_dtmart.dwd_soul_index_eventname (
cargo_id bigint comment '货源ID'
,driver_id bigint comment '司机ID'
,rn bigint comment 'gtr模型排序'
) comment '平台灵魂指标-回流结果中间表--排序明细表'
partitioned by (day bigint,event_name string)
stored as parquet;
set hive.exec.dynamic.partition=true;
-- FIX: the valid value is "nonstrict" — the original "nostrict" is rejected by Hive
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dm_dtmart.dwd_soul_index_eventname partition(day,event_name)
select
cargo_id
,driver_id
,rn
,day
,event_name
from table_name
where day=${zdt.addDay(-1).format("yyyyMMdd")};
时间
https://blog.csdn.net/qq_34019697/article/details/109001902
今天:${zdt.format("yyyyMMdd")}(原笔记误写为 zdt.add(11,-1),那是"上一小时"的写法)
昨天:${zdt.addDay(-1).format("yyyyMMdd")}
上一小时:hour = ${zdt.add(11,-1).format("HH")}
当前小时:
表重命名
alter table emp rename to emp2
表分区信息
show partitions table1
删除表
drop table if exists tmp.tmp_table_name
drop table if exists tmp.tmp_table_name_${zdt.add(11,-1).format("yyyyMMddHH")} ;
删除分区
alter table tmp.tmp_table_name drop if exists partition(day<${zdt.addDay(-30).format("yyyyMMdd")});
增加字段
ALTER TABLE tmp.tmp_table_name ADD COLUMNS (deal_within10 bigint comment '10以内成交量');
修改字段名称和类型
ALTER TABLE tmp.tmp_table_name change column end_place_id end_place_id string comment "搜索目的地id";
修改字段顺序
ALTER TABLE tmp.tmp_table_name change charge_name charge_name string after is_lcl_cargo;
插入列
insert overwrite table tmp.tmp_table_name partition(day=${zdt.addDay(-1).format("yyyyMMdd")})
文字处理
提取
select split(split('普通#0.67,平板#0.17,高栏#0.17', ',')[0],'#')[0]
select LEFT ('sada', LOCATE( '|','ada')-1)
select substr('sada',0,35)
@相关知识
注意:hive 和 spark 对 substr 起始下标的处理可能不一致(例如 0 与 1 起始、越界截断行为),跨引擎迁移 substr(truck_no,1,4) 这类写法前,先用样例数据验证结果
位置
select LOCATE( '|','ada')
select length('sad')
select instr('13.0#0.5,17.5#0.5', '13.0')
格式转换
select cast(b.regis_trucklen as string)
字符匹配
city in ('徐州市','宿迁市','淮安市')
reason like '%司机%'
正则匹配
truck_length rlike '上海市|南通市|嘉兴市|常州市|无锡市|泰州市|湖州市|苏州市|镇江市'
concat(',',recall_types,',') regexp concat(',',replace('1,2,1a,1b,1c,1s,n4,1cr',',',',|'),',') as is_accute  -- 两侧补逗号做整词匹配;原文 replace(...) 末尾多一个逗号是语法错误
json解析
get_json_object(cargo_tags,'$.page_floor')
regexp_extract(regexp_replace(record,'\"',''),'cargoId:([0-9]+)(,)',1) as cargo_id
替换
select regexp_replace(cargo_id,'\\[','') as cargo_id
多列拼接
select concat('sad',null,'shauo'),任一列带有空值,连接结果也为空
列转行
select concat_ws(',',sort_array(collect_set(end_city_name))) end_city_list
行转列
select
user_id,ex_start_city,ex_end_city
from dm_trade.cargo_search_use_detail_di
lateral VIEW explode(split(start_city, ',')) tmp AS ex_start_city
lateral VIEW explode(split(end_city, ',')) tmp AS ex_end_city
where day = 20210319
and user_id=96436391685966028
首个非空
SELECT COALESCE(NULL, NULL, NULL, 'W3Schools.com', NULL, 'Example.com');
非空
SELECT nvl(NULL,'Example.com');
空值
isnull('') 返回 false(空字符串不是 null);判断空值必须用 is null / is not null,写 = null 的比较永远不成立
日期格式化
--取年月日
substr(event_time,1,10)
date(event_time)
--取年和月
date_format(start_time, 'yyyy-MM')  -- Hive/Spark 写法;MySQL 写法为 DATE_FORMAT(start_time, '%Y-%m')
--取小时
HOUR(event_time)
year(start_time) = 2021
--文本转时间戳
unix_timestamp(cast(20210813 as string),'yyyyMMdd')  -- 注意必须是大写 MM:小写 mm 表示分钟
--时间戳转文本
from_unixtime(1610467680):返回Unix时间标记的一个字符串,根据format格式化。如果format为空默认会使用%Y-%m-%d %H:%i:%s的格式
--时间戳之差
timestampdiff(second,st,et)
--星期几(date_format 模式 "u":1=周一 … 7=周日;注意格式串须用 yyyyMMdd / yyyy-MM-dd,小写 mm 是分钟)
select date_format(from_unixtime(unix_timestamp(cast(20210813 as string),'yyyyMMdd'),'yyyy-MM-dd'),"u") as p
--相差天数
SELECT DATEDIFF('2021-01-13 00:08:00', '2022-01-24 00:08:00') AS DateDiff 前减后
SELECT datediff('2021-01-01',substr(max(max_event_time),0,10)) as interval_from_now
--日期介于
select if('2020-12-21' between '2020-01-01' and '2020-12-31',1,0)
--从某一时间往前的N天
select date_sub('2021-03-15',5)  -- Hive/Spark 写法,第二参数为天数
select date_sub(now(),interval 5 minute)  -- MySQL 写法,支持 interval 单位
--筛选工作日
DAYOFWEEK(event_time) BETWEEN 2 AND 6
--日期转周几排除周末(Hive/Spark 用模式 'EEEE' 得英文星期名;'%W' 是 MySQL 写法)
date_format(event_time,'EEEE') not in ('Saturday','Sunday')
窗口函数
max(hour)over(partition by day)
-- Window-function example. Fixes vs original:
--  * missing comma after rank1 and a dangling comma before FROM (both syntax errors)
--  * LAG reads the PREVIOUS row and LEAD the NEXT row, so the aliases were
--    swapped (LAG was named nexttime, LEAD was named pretime)
select
ROW_NUMBER() OVER(PARTITION BY user_id ORDER BY time) AS rank1,
LAG(time,1)  OVER(PARTITION BY user_id ORDER BY time) AS pretime,
LEAD(time,1) OVER(PARTITION BY user_id ORDER BY time) AS nexttime
from tmp.tmp_A1020916_20210201_driverinfo_202101_1 tb_driverinfo
where tb_driverinfo.truck_no is not null
条件
case when user_id is not null then 'Y' else 'N' end as driver_truck_match,
统计
sum(if(fenlei='分类2',1,0 )),sum(if(fenlei='分类3',1,0 ))
报错
1、SemanticException TOK_ALLCOLREF is not supported in current context
原因:执行distinct *时报错,hive不支持。更换spark可以
2、is null和=null
原因:
分位数
percentile_approx(value,array(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), 9999) as p10_90
percentile(value,0.5) as p50
分组
-- NTILE example: split each partition's rows into 3 equal-sized buckets
-- ordered by score descending.
-- FIX: "group" is a reserved keyword and fails as a bare column alias;
-- renamed to score_bucket (backtick-quoting `group` would also work).
select ntile(3) over (partition by a order by score desc) as score_bucket,
       name,
       score,
       province
from student
已知边长计算角度
input a,b,c
X = (b*b+c*c-a*a)/(2*b*c)
Y = (a ^ 2 + c ^ 2 - b ^ 2) / (2 * a * c)
Z = (a ^ 2 + b ^ 2 - c ^ 2) / (2 * a * b)
jA = Atn(-X / Sqr(-X * X + 1)) + 2 * Atn(1)
jB = Atn(-Y / Sqr(-Y * Y + 1)) + 2 * Atn(1)
jC = Atn(-Z / Sqr(-Z * Z + 1)) + 2 * Atn(1)
atn--反三角函数
取整
floor(12)
保留N位小数
ROUND(column_name, 2)
快速关联表
USING
select a.*,b.col
from a
left join b
on a.t=b.t
等价于
select a.*,b.col
from a left join b using (t)
05-27
8487
![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)
09-06