- like与rlike区别:like 是通配符,rlike是正则
- 取消hive一些语法限制:set hive.mapred.mode=nonstrict;
- 设置Reduce-Task(增加运行速度):set mapred.reduce.tasks = 30;
- count(*)计算的时候包含了NULL值,而count(expr)则不包含空值,例如:
select count(*) from (select disctinct column from xxx) x
Select count(distinct column) from xxx
结果不一样,因为count(distinct column)已经去null了
- 建表语句
use mid_trafficwisdom;
drop table if exists city_with_station;
create table if not exists city_with_station(
start_city_id bigint comment '起始城市ID',
start_city_name string comment '起始城市名',
end_city_id bigint comment '到达城市ID',
end_city_name string comment '到达城市名',
com_start_city_id bigint comment '公共起始城市ID',
com_start_city_name string comment '公共起始城市名',
com_end_city_id bigint comment '公共到达城市ID',
com_end_city_name string comment '公共到达城市名',
traffic_type string comment '交通类型',
section_station_set string comment '站到站集合',
create_time string comment '数据插入时间'
)comment '城市包含站-站集合'
location '/data/train/trafficwisdom/mid/city_with_station'
- 插入分区表
INSERT OVERWRITE TABLE mid_trafficwisdom.search_transfer_log PARTITION(push_date ='2019-01-29' )
SELECT from_city_name,to_city_name,start_date,traffic_type,from_station_name,to_station_name,from_station_code,to_station_code,send_time
from tmp_trafficwisdom.search_transfer_log_tmp where from_city_name is not null and to_city_name is not null
- 删除分区表:
ALTER TABLE mid_trafficwisdom.order_train_log DROP PARTITION (log_date = '2019-01-29');
//批量删除
ALTER TABLE ota_interface_check_info_log DROP PARTITION(log_date <='2017-08-10')
修改表名
ALTER TABLE name RENAME TO new_name
- 分区表批量插入分区表
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE mid_trafficwisdom.ml_cross_data PARTITION (log_date)
SELECT D.userid, D.city, D.category, D.from_place, D.to_place
, D.isclick, D.future_day, D.hour_time, D.banner_min_price, D.banner_min_time
, D.start_city_id, D.start_city_name, D.end_city_id, D.end_city_name, D.push_date
FROM tmp_trafficwisdom.ml_cross_data D
- 修改表字段
ALTER TABLE app_trafficwisdom.route_traffic_transfer_without_date CHANGE transfer_overtime transfer_overtime bigint comment '第一段途径时间'
- ROW_NUMBER使用
SELECT transport_code, start_station_code, start_station_name, end_station_code, end_station_name
, run_time, price, create_date
FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY transport_code, start_station_code, end_station_code ORDER BY create_date DESC) AS rn
FROM mid_trafficwisdom.section_ticket_price
WHERE traffic_type = 'F'
) A WHERE A.rn = 1
- 一行拆多行(explode)
SELECT *
FROM (
SELECT from_city_name, to_city_name, COUNT(*) AS search_num
, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
FROM mid_trafficwisdom.search_line_log
WHERE push_date = '2019-01-29'
AND from_city_name IS NOT NULL
AND to_city_name IS NOT NULL
GROUP BY from_city_name, to_city_name
) d
LATERAL VIEW explode(traffic_types) adTable AS traffic_type limit 100;
多列合并一列
SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
, end_station_name, seat_infos.seat_name, seat_infos.seat_price
FROM (
SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
, end_station_name
, array(
named_struct('seat_name', '硬座', 'seat_price', hard_seat_price),
named_struct('seat_name', '无座', 'seat_price', no_seat_price),
named_struct('seat_name', '软卧上', 'seat_price', soft_sleeper_up_price),
named_struct('seat_name', '软卧下', 'seat_price', soft_sleeper_down_price),
named_struct('seat_name', '硬卧上', 'seat_price', hard_sleeper_up_price),
named_struct('seat_name', '硬卧中', 'seat_price', hard_sleeper_mid_price),
named_struct('seat_name', '硬卧下', 'seat_price', hard_sleeper_down_price),
named_struct('seat_name', '二等座', 'seat_price', second_seat_price),
named_struct('seat_name', '一等座', 'seat_price', first_seat_price),
named_struct('seat_name', '商务座', 'seat_price', business_seat_price)) AS seat_info
FROM mid_trafficwisdom.ticket_train_log
WHERE duration_date >= '{today,yyyy-MM-dd}'
) M
LATERAL VIEW explode(seat_info) seat_info AS seat_infos
WHERE seat_infos.seat_price IS NOT NULL AND seat_infos.seat_price > 0;
- 数据倾斜设置
set hive.groupby.mapaggr.checkinterval=100000 ;--这个是group的键对应的记录条数超过这个值则会进行分拆,值根据具体数据量设置
set hive.groupby.skewindata=true; --如果是group by过程出现倾斜 应该设置为true
set hive.skewjoin.key=100000; --这个是join的键对应的记录条数超过这个值则会进行分拆,值根据具体数据量设置
set hive.optimize.skewjoin=true;--如果是join 过程出现倾斜 应该设置为true
- split拆分
例如某个字段名value为:
o498X0bh9ySoH2h3ijQWMDzYSrjk_852_010002_2_319527329_南通_商丘_2019-01-29_1_367347825_780_1小时50分钟"
抽取最后一个字段"1小时50分钟:
split(value,'_')[size(split(value, "_" )) - 1 ] as min_time,
- regexp_extract抽取字段
如果想抽取1小时50分钟中的"1"和"50"转换成秒
原数据min_time可能为:1小时50分钟,2时50分钟,2时,35分钟。如下:
if(regexp_extract(min_time,'^([0-9]*?)小?时.*$',1)>0,regexp_extract(min_time,'^([0-9]*?)小?时.*$',1),0)*3600+if(regexp_extract(min_time,'([0-9]*?)分钟.*$',1)>0 ,regexp_extract(min_time,'([0-9]*?)分钟.*$',1),0)*60 as min_time
- explode用法
SELECT from_city_name, to_city_name, search_num, traffic_type
FROM (
SELECT from_city_name, to_city_name, COUNT(*) AS search_num
, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
FROM mid_trafficwisdom.search_direct_log
WHERE log_date = '2019-02-26'
AND from_city_name IS NOT NULL
AND to_city_name IS NOT NULL
GROUP BY from_city_name, to_city_name
) d
LATERAL VIEW explode(traffic_types) adTable AS traffic_type
- 时间转换
from_unixtime(cast (time as int) , 'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(cast(substring(time,1,10) as int),'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(unix_timestamp(date, 'yyyyMMdd'), 'yyyy-MM-dd') as create_date
to_date(string timestamp) //日期时间转日期函数
select date_sub('2012-12-08',10) from dual; //2012-11-28
- 字符串拼接
CONCAT_WS('+',M.wtboscheduleno,M.wtbobusno) AS bus_code
- json解析字段
wtodatasourcenew字段内容为:
{
"abTest": [{
"expNo": "20180716_JLH201807",
"expGroup": "Z"
}, {
"expNo": "20181210_BI",
"expGroup": "Z"
}, {
"expNo": "20190114_guolv",
"expGroup": "Z"
}, {
"expNo": "20181022_PaiXu",
"expGroup": "Z"
}],
"requestId": "c6e8bf08a64ffee900988b3124ee4442"
}
SELECT *,GET_JSON_OBJECT(wtodatasourcenew,'$.requestId')
FROM base_tcdctrafficwisdomtravelorder.wisdomtravelorder where wtocreatedate BETWEEN '2019-03-17 00:00:00' AND '2019-03-17 23:59:59'
LIMIT 100
- 取小数点
CASE
WHEN round(b.order_num * 1.0 / a.search_num, 6) IS NULL THEN 0
ELSE round(b.order_num * 1.0 / a.search_num, 6)
END AS allpercent