一、错误
1.运行到一半任务断掉:空指针
2.资源分配与map数(任务较大时建议考虑其他方法)
3.row_number_old时行出现空值(udf函数)
4.暂不支持update和delete操作
5.注意hive锁表
SET mapreduce.job.queuename=highlevel;
SET hive.support.concurrency=false;--此功能(这两句不用语句解释,否则无法运行)
二、主要语句
--建表
create table hue_10086_url_output(url string, Login_UserNumber string ,mailKey string ,Max_Age string ,provCode string ,areaCode string ,time_consume string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
location '/user/hue_10086_url_output'
--其中若有中文需用notepad++转化为utf8无bom编码格式
--更新数据(--hbase是key/value类型数据)insert overwrite 覆盖功能,insert into简单插入功能
insert into table hbase_package_programname
select apk_name as package_name,program_name from hue_zouyanmei_packageprogram_buchong0915
--建list表(collect_set(named_struct))
create table hue_apt_attack0206
location '/user/hue_apt_attack0206'
as
select time , source , downloadurl ,APT , title ,
collect_set(named_struct("hash", hash, "hashtag", hashtag)) hash_info,
collect_set(named_struct("ip", ip, "iptag", iptag)) ip_info,
collect_set(named_struct("domain", domain, "domaintag", domaintag)) domain_info,
collect_set(named_struct("url", url, "urltag", urltag)) url_info,
collect_set(named_struct("reference", reference)) reference_info
from hue_apt_attack
group by time , source , downloadurl ,APT , title
--udf函数使用
create table hue_10086_url_output_city as
select * ,callerlocation(substr(login_usernumber,3,11),1) city ,callerlocation(substr(login_usernumber,3,11),2) province,
callerlocation(substr(login_usernumber,3,11),3) card_type from hue_10086_url_output where login_usernumber!='err'
--is not NULL
select max(url),imei_p,login_usernumber from hue_url1207 where login_usernumber !='err' and login_usernumber is not NULL
group by imei_p,login_usernumber
--string转化为数组array
creat table hue_imei1208
location '/user/hue_imei1208'
select imei_p, collect_set(login_usernumber) from hue_url1207 where imei_p != 'ERR_N' and login_usernumber !='err' and login_usernumber is not NULL group by imei_p
--array转化成多个string
select * from (select *,adid from ods_url_dns lateral view explode(us) ad as adid where dt='20171211') a where adid like '1%' limit 10
--在一个表中不在另一表中(类似于不等值连接)
select a.url,a.dt from
(select a.url,a.dt,b.url url1 from
(select a.* from mds_engine_url_dt a where a.dt>='20171201' and a.dt<='20171231')a
left outer join
(select url from ods_url_dwz group by url) b
on a.url=b.url)a
where a.url1 is null
--udf函数(目的挑选出数目最多的一个)
select a.domain,a.url,a.a from
(
select row_number_old(b.domain) seq_no,b.domain,b.url,b.a from
(
select * from
(
select case when instr(url,'www.')>0 and instr(url,'http://')<=0 and instr(url,'https://')<=0 then split(split(url,'\\/')[0],'www.')[1]
when (instr(url,'http://')>0 or instr(url,'https://')>0) and instr(url,'www.')<=0 then split(url,'/')[2]
when (instr(url,'http://')>0 or instr(url,'https://')>0) and instr(url,'www.')>0 then split(split(url,'\\/')[2],'www.')[1]
when instr(url,'http://')<=0 and instr(url,'https://')<=0 and instr(url,'www.')<=0 then split(url,'/')[0] end as
domain,url,count(*) as a
from hue_domain_url0105 where dt>='20171201' and dt<='20171231'
group by case when instr(url,'www.')>0 and instr(url,'http://')<=0 and instr(url,'https://')<=0 then split(split(url,'\\/')[0],'www.')[1]
when (instr(url,'http://')>0 or instr(url,'https://')>0) and instr(url,'www.')<=0 then split(url,'/')[2]
when (instr(url,'http://')>0 or instr(url,'https://')>0) and instr(url,'www.')>0 then split(split(url,'\\/')[2],'www.')[1]
when instr(url,'http://')<=0 and instr(url,'https://')<=0 and instr(url,'www.')<=0 then split(url,'/')[0] end
,url
)a distribute by a.domain sort by a.domain,a.a desc)b
)a where seq_no=1 limit 500
-修改某列数据,并增加一列dt(一般先备份一个原表)
create table tds_package_systembk11 as
select package_name,mau,(case when package_name='com.aliyun.pwmob.h_izz_cc'
then '微语录'
when package_name='com.gionee.livewallpaper' then 'Ami天气动态壁纸-随变'
when package_name='com.gionee.livewallpaper.v2' then '天气动态壁纸'
when package_name='com.bbk.stickdefence' then 'Stickman Warriors: Cartoon Wars'
when package_name='com.letv.kttvControl' then '万能遥控器'
else program_name end) program_name,tag,
'' as dt
from tds_package_system
--改表名
alter table tds_package_systembk11 rename to tds_package_system
--转换成日期格式
from_unixtime(cast(receive_time as int ),'yyyyMMdd')
SQL语法可参考网址:http://www.w3school.com.cn/