1、创建一张外部分区表(分区需要另外添加)
-- 创建方式1(hive不能用多个字符作为列分隔符)
drop table if exists stg.stg_dev_location_info_d;
CREATE TABLE `stg.stg_dev_location_info_d`
(`device_id` string COMMENT '用户唯一识别ID',`device_brand` string COMMENT '设备品牌',`device_model` string COMMENT '设备型号',`system_version` string COMMENT '系统版本',`system_name` string COMMENT '操作系统名称')
COMMENT 'stg-设备基站信息'
PARTITIONED BY(day string,source string)
row format delimited
fields terminated by '\t' collection items terminated by ',' map keys terminated by ':' lines terminated by '\n'
stored as orc;
-- 创建方式2
CREATE EXTERNAL TABLE `ext_base_click`(
`reqid` string,
`adslotid` string,
`ip` string,
`timestamp` string,
`adgroupid` string,
`lbs` string)
PARTITIONED BY (
`date` string,
`hour` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='|',
'line.delim'='\n',
'serialization.format'='|')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
2、创建一张外表(需写好分区位置)
CREATE EXTERNAL TABLE `b_base_adx`(
`id` string,
`enname` string,
`zhname` string,
`description` string,
`supportdeal` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='|',
'line.delim'='\n',
'serialization.format'='|')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://tercel/user/mapred2/hive_basetable/b_base_adx'
3、表字段的修改(注意锁表问题、ORC文件表字段的修改、矢量查询)
--hive 新增一列
ALTER TABLE dsp.int_optimize_bidrequest ADD COLUMNS (app_id String);
--hive 修改列名
ALTER TABLE ext_preprocess_click CHANGE col_old_name col_new_name String AFTER column_name;
-- 修改列类型
Alter table stg.xh_temp_netinfo_d change column st st BIGINT
--hive 修改列注释
alter table stg.stg_dev_xxx_info_d CHANGE COLUMN is_root is_root string comment '是否获取root权限: 1表示获取, 0表示没获取';
--hive 删除替换全部列(禁止删除列会有问题)
ALTER TABLE ext_base_conversion REPLACE COLUMNS (reqid string, adslotid string, adx string, deviceid string);
--解锁hive表
show locks your_table
unlock table your_table
4、hive常用语句
--新增外部表分区(数据不会移动)
alter table media_stats add partition (date ="2018-02-02",hour=5) location 'hdfs://tercel/user/DAILY/$day/media-stats';
--外部分区表load数据(数据会移动到表下面)
load data local inpath '/tmp/pck_num.txt' OVERWRITE into table stg.stg_app_net_info_d partition (day='20200916');
--判断字符串是否都为数字
select * from base_media_baidu where '123456' rlike '^\\d+$';
--删除表分区
ALTER TABLE dsp.ext_preprocess_bidrequest DROP IF EXISTS PARTITION (date ="1018-01-05",hour=5);
--添加hive_udf
add jar hdfs://tercel//user/mapred2/myudf/hive/hiveudf.jar;
create temporary function md5 as 'com.Md5';
5、分割字符串’,’ (行转列 | 列转行)
-- 行转列
select adslotallowedsize_
from dsp_media.b_report_flow_daily lateral view explode(split(adslotallowedsize,'_')) b AS adslotallowedsize_
where date='2018-03-07'
-- 列转行
select
device_id,
concat_ws(',',collect_set(package_name)) AS package_name
from tmp.stg_dev_user_bhv_d_old
where day=20170301
group by device_id
6、select除了某些字段之外的剩余所有字段
set hive.support.quoted.identifiers=None;
select `(name|id|pwd)?+.+` from tableName;
7、不同的adx中的曝光top3的机型
select *
from
(
select adx,brandmodelid,c,row_number() over (partition by adx order by adx,c desc) ord
from
(
select adx,brandmodelid,count(1) c from dsp.ext_base_impression WHERE date='2017-07-03' AND hour=4
group by adx,brandmodelid
) t1
) t2
where ord<=3;
8、设备信息的组内占比
select a.source AS source,
a.sdk_version AS sdk_version,
a.device_info_count AS device_info_count,
sum(a.device_info_count) over(partition by a.source) as group_device_info_count
from
(select source, split(sdk_version,'\\|')[0] AS sdk_version, count(1) AS device_info_count
from stg.stg_dev_device_info_d
where day=20180426
group by source, split(sdk_version,'\\|')[0]
order by cast(source AS int),sdk_version) AS a
9、动态分区插入数据
INSERT OVERWRITE TABLE tmp.tmp_stg_dev_user_bhv_d
PARTITION (day = '20181115', source = '200')
SELECT device_id,package_name,app_key,open_time,close_time,sample_package_name,device_brand,device_model,system_version,system_name,sdk_version,ip,sample_app_version,app_channel,tmp_id,count(1) AS cou
FROM stg.stg_dev_user_bhv_d
where day=20181115 and source=200
group by device_id,package_name,app_key,open_time,close_time,sample_package_name,device_brand,device_model,system_version,system_name,sdk_version,ip,sample_app_version,app_channel,tmp_id
order by cou desc
10、hive常用函数
-- json 查询
select get_json_object("{\"imei\":\"368173410356449\",\"mac\":\"008B2F377231\"}",'$.mac')
-- map查询
select map('k1','v1')['k1']
11、创建视图
-- 创建分区
create view test_view as select * from test