案例1:
10001 700 2019-03-01
10002 720 2019-03-02
10003 740 2019-03-03
10004 760 2019-04-01
10005 780 2019-04-02
10006 800 2019-04-03
10007 820 2019-05-01
10008 840 2019-05-02
10009 860 2019-06-01
10010 880 2019-06-02
10011 900 2019-06-03
把以上数据利用动态分区导入表中(按照年、月进行多级分区)
(1)先创建一个动态分区表
create table order_partition(
order_number string,
order_price double
)
partitioned BY(year string,month string)
row format delimited fields terminated by '\t';
(2)创建一个普通的内部表
create table tt_order(
order_number string,
order_price double,
order_time
)
row format delimited fields terminated by '\t';
(3)加载数据到普通表
load data local inpath '/opt/bigdata2.7/hive/tempdata/order.txt' into table tt_order;
(4)加载数据到动态分区表中
--想要进行动态分区,首先要设置参数
set hive.exec.dynamic.partition = true;//使用动态分区
set hive.exec.dynamic.partition.mode = nonstrict;//非严格模式
–语句
insert into table order_partition partition(year,month) select
order_number,order_price,substring(order_time,0,4) as year,substring(order_time,6,2) as month
from tt_order;
案列2:
(1)创建user_trade(用户交易表)
create table if not exists user_trade(
user_name string,
piece int,
price double,
pay_amount double,
goods_category string,
pay_time bigint
)partitioned by (dt string)
row format delimited fields terminated by '\t';
(2)先设置动态分区:
set hive.exec.dynamic.partition = true;
set hive.exec.dynamic.partition.mode = nonstrict;
set hive.exec.max.dynamic.partitions = 10000;
set hive.exec.max.dynamic.partitions.pernode =10000;
(3)上传数据到HDFS:
hdfs dfs -put /opt/bigdata2.7/hive/tempdata/user/* /user/hive/warehouse/db_hive.db/user_trade;
(4)进行修复
msck repair table user_trade;
(5)查出2019年一月到四月,每个品类有多少人购买,累计金额是是多少
select goods_category,count(distinct user_name) as user_num,
sum(pay_amount) as total_amount
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by goods_category;
(6)查出2019年4月,支付金额超过5万元的用户
select user_name,sum(pay_count) as total_amount
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by user_name having sum(pay_count)>50000;
(7)查出2019年4月,支付金额最多的top5用户
select user_name,
sum(pay_amount) as total_amount
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by user_trade
order by total_amount desc
limit 5;
案例3:
(1)创建user_info(用户信息表)
``create table if not exists user_info(user_id string,
user_name string,
sex string.
age int,
city string,
firststactivetime string,
level int,
extra1 string,
extra2 map<string,string>
)
row format delimited fields terminated by '\t’
collection items terminated by ','
map keys terminatede by ':'
lines terminated by ‘\n’
stored as textfile
(2)用户首次激活时间,与2019-05-01的日期间隔
select user_name,
datadiff('2019-05-01',to_date(firststactivetime))
from user_info
limit 10;
(3)统计一下四个年龄段,20岁一下,2020,3040,40以上的用户数
select case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else '40岁以上' end,count(distinct user_id) use_num
from user_info
group by case
when age<20 then '20岁以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else '40岁以上' end;
(4)统计每个性别用户等级高低的分布情况(level大于5为高级)
select sex,
if(level>5,'高','低'),
count(distinct user_id) user_num
from user_info
group by sex,if(level>5,'高','低');
(5)统计每一月新激活的用户数
select substr(firststactivetime,1,7) as month,
count(distinct user_id) user_num
form user_info
group by substr(firststactivetime,1,7);
(6)统计不同手机品牌的用户数
方案一:
select get_json_object(extra1,$.phonebrand) as phone_brand,
count(distinct user_id) as user_num
from user_info
group by get_json_object(extra1,$.phonebrand);
方案二:
select extra2['phonebrand'] as phone_brand,
count(distinct user_id) as user_num
from user_info
group by extra2['phonebrand'];
案例4:
根据案例2的表和案列3的表
再创建user_refund(用户退款信息表)
(1)创建user_refount表
create table if not exists user_refund()
user_name string,
refund_piece int,
refund_amount double,
refund_time string)
partition by(dt string)
row format delimited fields terminated by '\t';
(2)在2019年购买后又退款的用户
select a.user_name
from
(select distinct user_name
from user_trade
where year(dt)=2019) a
join
(select distinct user_name
from user_refund
where year(dt)=2019)b
on a.user_name = b.user_name;
(3)在2019年购买,但是没有退款的用户
select a.user_name
from
(select distinct user_name
from user_trade
where year(dt)=2019) a
left join
(select distinct user_name
from user_refund
where year(dt)=2019)b
on a.user_name = b.user_name;