-2、外连接:right outer join
-1、加路径
hive <<EEOOFF
use lilu;
add file 60_sms_dun_telandphone/dun_org_name_map.txt;
add file 60_sms_dun_telandphone/rule_zhengye_orgcode.py;
0、byobu程序在服务器运行;本地hive能运行的程序,放到linux环境中用sh脚本运行,不一定能成功!
cd /home/lilu/tmp_0505
bash run_prepare.sh 20160531 340
bash round2_rule_1.sh 20170506
1、hive判断是否为NULL
用 where 字段a is null
用 where length(a)>0
2、hive建表、导入txt文件:
USE databasename;
CREATE TABLE if not exists temp_test(
masterhotel int COMMENT '酒店ID',
ciiquantity double COMMENT '间夜量'
)
row format delimited
fields terminated by '\t'
stored as textfile;
hive> load data local inpath 'city1.txt' into table temp_test;
分区表:create table partition_test
(member_id string,
name string
)
partitioned by (
stat_date string,
province string)
row format delimited fields terminated by ',';
alter table partition_test add partition (stat_date='20110728',province='zhejiang'); -- 创建好了一个分区
insert overwrite table partition_test partition(stat_date='20110728',province='henan') select member_id,name from partition_test_input where stat_date='20110728' and province='henan'; -- 向partition_test的分区中插入数据
set hive.exec.dynamic.partition=true; -- 动态分区
insert overwrite table partition_test partition(stat_date='20110728',province) -- 主分区采用静态列,副分区采用动态
set hive.exec.dynamic.partition.mode=nonstrict; -- 允许分区列全部是动态的
LOAD DATA local INPATH '/user/hua/*' INTO TABLE day_hour partition(dt='2010-07-07');
3、hive求差集:
hive> select * from zz0;
111111
222222
888888
hive> select * from zz1;
111111
333333
444444
888888
select * from zz0 full outer join zz1 on zz0.uid = zz1.uid;
NULL
111111 111111
222222 NULL
NULL 333333
NULL 444444
888888 888888
求结果的并集:
create table lilu.dun_all_0428_2 as
select distinct collection_number from tmp1
union all select distinct collection_number from tmp2
union all select distinct collection_number from tmp3;
4、join:
hive> select * from zz0 join zz1 on zz0.uid = zz1.uid;
111111 111111
888888 888888
5、数据表导出为文件
insert overwrite local directory '/home/lilu/data_for_lilu_0316_1' row format delimited fields terminated by '\t'
select * from databasename.temp_test;
6、两个表join得到一张新表
set mapreduce.reduce.memory.mb=4096;
create table temp0 as
select a.*,b.* from dun_0310 as a join tb_v4 as b on a.tel_1 = b.other_cell_phone; -- dun_0310 放小表
7、合并文件
cd /home/lilu/data2
cat 0* > temp.txt
8、随机取数据
select * from temp_test order by rand() limit 1000;
9、日期格式转换:将20160101转化成2016-01-01
yyyymmdd格式:计算日期最大值、最小值:
create table temp_ll1 as select tel,max(pt_dt) as max_dt,min(pt_dt) as min_dt from temp_details_123m_3 group by tel;
yyyymmdd格式转化为yyyy-mm-dd格式,计算年月日:
create table tb1 as
select from_unixtime(unix_timestamp(time1,'yyyyMMdd'),'yyyy-MM-dd') as t1 from temp_tb0;
select year(t1) as year,month(t1) as month,day(t1) as day from tb1;
yyyy-mm-dd格式:计算两个日期相差几天
create table temp_total_span as select tel as tel1,datediff(max_dt,min_dt) as total_span from temp_ll2;
日期时间转日期:
select to_date('2011-12-08 10:03:01') from temp_tb1;
10、将一张表中的数据插入到另一张表
insert into table temp_details_5m2 select * from temp_details_7m1;
11、查看、删除job
hadoop job -list
hadoop job -kill job_id
12、建数据库
create database lilu;
13、
select tel,year,month,if(imsis>10,1,0) as imsis10 from tb_323210_imsis;
select case when 1=1 then 'tom' when 2=2 then 'mary' else 'tim' end from lxw_dual;
14、根据key_number列去重
create table tb_all_tel as
select *,row_number() over(distribute by key_number) rn from zhangwj.lilu_all_tel;
create table tb_tel1 as
select * from tb_all_tel where rn=1;
15、streaming理解为将hive表里的数据按行交到python程序中执行,最终将结果还按行写入到一张hive表里面。
16、修改列名:
ALTER TABLE name CHANGE column_name new_name new_type
17、需要中间结果命名一下
select distinct collection_number
from (select distinct collection_number from tmp1
union all select distinct collection_number from tmp2_1
union all select distinct collection_number from tmp3
union all select distinct collection_number from tmp5) tt;
18、select *,
case when sms_splitter_str like "%|催收企业名%" then 1
else 0 end as have_org
from tb_sms_sample_dun_telandphone_rule_4;
19、no score
1 2
1 3
1 3
2 2
2 4
select no,collect_set(score) as score_set from tablss group by no;
结果:no score_set
1 [2,3]
2 [2,4]
select no,collect_set(score)[0] from tablss group by no;
no score
1 2
2 2
20、group by放在where后面,向hive表中插入数据。insert overwrite table tb_result 若表中本来存在数据,覆盖表中的数据。
insert overwrite table tb_result_rule_6_all
select collection_number,min(pt_dt) as min_date,max(pt_dt) as max_date
from tb_sms_sample_dun_telandphone
where (rule_tag=3 or come_from=1) and length(collection_number)>3
group by collection_number;
21、hive表直接导入到mysql表
sqoop export \
--connect 'jdbc:mysql://192.168.109.200:3306/db_tel_manage?useUnicode=true&characterEncoding=utf8' \
--username root --password root123 \
--table tb_result_rule_union_all \
--export-dir /user/hive/warehouse/qihu_sms_4.db/tb_result_rule_union_all \
--fields-terminated-by '\001' \
--lines-terminated-by '\n'
22、字符串截取函数:substr
语法: substr(string A, int start, int len)
hive> select substr('abcde',3,2) from lxw_dual;
cd
23、 正则表达式替换函数: regexp_replace语法: regexp_replace(string A, string B, string C) 说明:将字符串A中的符合java正则表达式B的部分替换为C。
hive> select regexp_replace('foobar', 'oo|ar', '') from lxw_dual;
fb
24、正则表达式解析函数:regexp_extract
hive> select regexp_extract('foothebar', 'foo(.*?)(bar)', 0) from lxw_dual;
foothebar
25、hive、java匹配手机号码
select regexp_extract('8618750930899', '1[34578]\\d{9}', 0) ;
移动:134、135、136、137、138、139、150、151、157(TD)、158、159、187、188
联通:130、131、132、152、155、156、185、186
电信:133、153、180、189
"^((13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$"
"((17[0-9])|(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}"
regexp_extract(sms_sender,'(1[34578]\\d{9}|0(10|2[0-5789]|\\d{3})\\d{7,8})',0) as sms_sender