Hive一些常用语法

  • like与rlike区别:like 是通配符,rlike是正则
  • 取消hive一些语法限制:set hive.mapred.mode=nonstrict;
  • 设置Reduce-Task(增加运行速度):set mapred.reduce.tasks = 30;
  • count(*)计算的时候包含了NULL值,而count(expr)则不包含空值,例如:
select count(*) from (select disctinct column from xxx) x
Select count(distinct column) from xxx
结果不一样,因为count(distinct column)已经去null了
  • 建表语句
use mid_trafficwisdom;
drop table if exists city_with_station;
create table  if not exists city_with_station(
start_city_id bigint comment '起始城市ID',
start_city_name string comment '起始城市名',
end_city_id bigint comment '到达城市ID',
end_city_name string comment '到达城市名',
com_start_city_id bigint comment '公共起始城市ID',
com_start_city_name string comment '公共起始城市名',
com_end_city_id bigint comment '公共到达城市ID',
com_end_city_name string comment '公共到达城市名',
traffic_type string comment '交通类型',
section_station_set string comment '站到站集合',
create_time string comment '数据插入时间'
)comment '城市包含站-站集合'
location '/data/train/trafficwisdom/mid/city_with_station'
  • 插入分区表
INSERT OVERWRITE TABLE mid_trafficwisdom.search_transfer_log PARTITION(push_date ='2019-01-29' )
SELECT from_city_name,to_city_name,start_date,traffic_type,from_station_name,to_station_name,from_station_code,to_station_code,send_time
from tmp_trafficwisdom.search_transfer_log_tmp where from_city_name is not null and to_city_name is not null
  • 删除分区表:
ALTER TABLE mid_trafficwisdom.order_train_log DROP PARTITION (log_date = '2019-01-29');
//批量删除
ALTER TABLE ota_interface_check_info_log DROP PARTITION(log_date <='2017-08-10')

修改表名

ALTER TABLE name RENAME TO new_name
  • 分区表批量插入分区表
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict; 
INSERT OVERWRITE TABLE mid_trafficwisdom.ml_cross_data PARTITION (log_date)
SELECT D.userid, D.city, D.category, D.from_place, D.to_place
	, D.isclick, D.future_day, D.hour_time, D.banner_min_price, D.banner_min_time
	, D.start_city_id, D.start_city_name, D.end_city_id, D.end_city_name, D.push_date
FROM tmp_trafficwisdom.ml_cross_data D 
  • 修改表字段
ALTER TABLE app_trafficwisdom.route_traffic_transfer_without_date CHANGE transfer_overtime transfer_overtime bigint comment '第一段途径时间'
  • ROW_NUMBER使用
SELECT transport_code, start_station_code, start_station_name, end_station_code, end_station_name
	, run_time, price, create_date
FROM (
	SELECT *, ROW_NUMBER() OVER (PARTITION BY transport_code, start_station_code, end_station_code ORDER BY create_date DESC) AS rn
	FROM mid_trafficwisdom.section_ticket_price
	WHERE traffic_type = 'F'
) A WHERE A.rn = 1
  • 一行拆多行(explode)
SELECT *
FROM (
	SELECT from_city_name, to_city_name, COUNT(*) AS search_num
		, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
	FROM mid_trafficwisdom.search_line_log
	WHERE push_date = '2019-01-29'
		AND from_city_name IS NOT NULL
		AND to_city_name IS NOT NULL
	GROUP BY from_city_name, to_city_name
) d
	LATERAL VIEW explode(traffic_types) adTable AS traffic_type limit 100;

多列合并一列

SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
					, end_station_name, seat_infos.seat_name, seat_infos.seat_price
				FROM (
					SELECT transport_code, duration_date, start_station_code, start_station_name, end_station_code
						, end_station_name
						, array(
                            named_struct('seat_name', '硬座', 'seat_price', hard_seat_price), 
                            named_struct('seat_name', '无座', 'seat_price', no_seat_price), 
                            named_struct('seat_name', '软卧上', 'seat_price', soft_sleeper_up_price), 
                            named_struct('seat_name', '软卧下', 'seat_price', soft_sleeper_down_price), 
                            named_struct('seat_name', '硬卧上', 'seat_price', hard_sleeper_up_price), 
                            named_struct('seat_name', '硬卧中', 'seat_price', hard_sleeper_mid_price), 
                            named_struct('seat_name', '硬卧下', 'seat_price', hard_sleeper_down_price), 
                            named_struct('seat_name', '二等座', 'seat_price', second_seat_price), 
                            named_struct('seat_name', '一等座', 'seat_price', first_seat_price), 
                            named_struct('seat_name', '商务座', 'seat_price', business_seat_price)) AS seat_info
					FROM mid_trafficwisdom.ticket_train_log
					WHERE duration_date >= '{today,yyyy-MM-dd}'
				) M
					LATERAL VIEW explode(seat_info) seat_info AS seat_infos
				WHERE seat_infos.seat_price IS NOT NULL	AND seat_infos.seat_price > 0;
  • 数据倾斜设置
set hive.groupby.mapaggr.checkinterval=100000 ;--这个是group的键对应的记录条数超过这个值则会进行分拆,值根据具体数据量设置
set hive.groupby.skewindata=true; --如果是group by过程出现倾斜 应该设置为true
set hive.skewjoin.key=100000; --这个是join的键对应的记录条数超过这个值则会进行分拆,值根据具体数据量设置
set hive.optimize.skewjoin=true;--如果是join 过程出现倾斜 应该设置为true
  • split拆分
    例如某个字段名value为:
    o498X0bh9ySoH2h3ijQWMDzYSrjk_852_010002_2_319527329_南通_商丘_2019-01-29_1_367347825_780_1小时50分钟"
    抽取最后一个字段"1小时50分钟:
 split(value,'_')[size(split(value, "_" )) - 1 ] as min_time,
  • regexp_extract抽取字段
    如果想抽取1小时50分钟中的"1"和"50"转换成秒
    原数据min_time可能为:1小时50分钟,2时50分钟,2时,35分钟。如下:
if(regexp_extract(min_time,'^([0-9]*?)小?时.*$',1)>0,regexp_extract(min_time,'^([0-9]*?)小?时.*$',1),0)*3600+if(regexp_extract(min_time,'([0-9]*?)分钟.*$',1)>0 ,regexp_extract(min_time,'([0-9]*?)分钟.*$',1),0)*60 as min_time
  • explode用法
SELECT from_city_name, to_city_name, search_num, traffic_type
FROM (
	SELECT from_city_name, to_city_name, COUNT(*) AS search_num
		, Array('TT', 'TB', 'TF', 'FB', 'FT', 'BF', 'BT') AS traffic_types
	FROM mid_trafficwisdom.search_direct_log
	WHERE log_date = '2019-02-26'
		AND from_city_name IS NOT NULL
		AND to_city_name IS NOT NULL
	GROUP BY from_city_name, to_city_name
) d
	LATERAL VIEW explode(traffic_types) adTable AS traffic_type
  • 时间转换
from_unixtime(cast (time as int) , 'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(cast(substring(time,1,10) as int),'yyyy-MM-dd HH:mm:ss') as creat_time
from_unixtime(unix_timestamp(date, 'yyyyMMdd'), 'yyyy-MM-dd') as create_date
to_date(string timestamp)   //日期时间转日期函数
select date_sub('2012-12-08',10) from dual;  //2012-11-28
  • 字符串拼接
CONCAT_WS('+',M.wtboscheduleno,M.wtbobusno) AS bus_code
  • json解析字段
    wtodatasourcenew字段内容为:
{
	"abTest": [{
		"expNo": "20180716_JLH201807",
		"expGroup": "Z"
	}, {
		"expNo": "20181210_BI",
		"expGroup": "Z"
	}, {
		"expNo": "20190114_guolv",
		"expGroup": "Z"
	}, {
		"expNo": "20181022_PaiXu",
		"expGroup": "Z"
	}],
	"requestId": "c6e8bf08a64ffee900988b3124ee4442"
}
SELECT *,GET_JSON_OBJECT(wtodatasourcenew,'$.requestId')
FROM base_tcdctrafficwisdomtravelorder.wisdomtravelorder where wtocreatedate  BETWEEN '2019-03-17 00:00:00' AND '2019-03-17 23:59:59'
LIMIT 100 
  • 取小数点
CASE 
	WHEN round(b.order_num * 1.0 / a.search_num, 6) IS NULL THEN 0 
	ELSE round(b.order_num * 1.0 / a.search_num, 6) 
END AS allpercent
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值