一、hive创建表并加载数据
1、在 hive 里创建表 dept
在数据库default下,创建表dept
hive > create table dept(deptno string, dname string, loc string)
row format delimited fields terminated by '\t';
2、加载数据到表 dept
在linux的目录/root/下,创建dept.txt
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
方式一:
用 local inpath 本地路径的方法加载
hive> load data local inpath '/root/dept.txt' into table default.dept;
方式二:
再用 hdfs 路径的方法加载数据覆盖到 dept
上传文件到hdfs
hive > dfs -put /root/dept.txt /user/hive;
加载hdfs上的数据覆盖到hive
hive > load data inpath '/user/hive/dept.txt' overwrite into table default.dept;
二、编写sql语句
1、customer-deposit-dept
客户信息表customer
字段 | 数据类型 | 描述 |
---|---|---|
c_id | varchar2 | 主键,客户号 |
c_name | varchar2 | 客户姓名 |
存款交易表deposit
字段 | 数据类型 | 描述 |
---|---|---|
trade_date | date | 存款日期 |
deposit_id | varchar2 | 流水号 |
c_id | varchar2 | 客户号 |
amount | number | 存款金额 |
dept_id | varchar2 | 营业网点代码 |
营业网点信息表dept
字段 | 数据类型 | 描述 |
---|---|---|
dept_id | varchar2 | 营业网点代码 |
dept_name | varchar2 | 营业网点名称 |
存款日期的数据类型默认为 date 类型,
若存款日期的数据类型为 varchar2类型的,对应的过滤数据的方法不一样
1)查找客户号“88888888”的2021全年的存款流水,按日期升序排列
select t1.trade_date
,t1.deposit_id
,t1.c_id
,t1.amount
,t1.dept_id
from deposit t1
where t1.c_id = '88888888'
and to_char(t1.trade_date, 'yyyy') = '2021'
order by t1.trade_date asc;
2)针对上面的查询要求,为deposit表创建一个索引idx_deposit
create unique index index_c_id on deposit(c_id);
3)输出整个银行2021年全年各网点名称及对应的存款总额。
select t1.dept_name
,sum(t2.amount) as sum_amount
from dept t1
left join deposit t2
on t1.dept_id = t2.dept_id
and to_char(t2.trade_date, 'yyyy') = '2021'
group by t1.dept_id, t1.dept_name;
4)列出曾经有多笔(2笔或者大于2笔)存款的客户号。
select t1.c_id
from deposit t1
group by t1.c_id
having count(1) > 1;
- 列出客户信息表中没有存款交易的客户,输出客户号和客户姓名。
--表关联
select t1.c_id
,t1.c_name
from customer t1
left join deposit t2
on t1.c_id = t2.c_id
where t2.c_id is null;
--not in
select t1.c_id
,t1.c_name
from customer t1
where t1.c_id not in (
select distinct t2.c_id
from deposit t2
);
--not exists
select t1.c_id
,t1.c_name
from customer t1
where not exists(
select t.c_id
from deposit t
where t.c_id = t1.c_id
);
2、t_tran
表t_tran
card_nbr | trans_type | desc_nar | inp_date | bill_amt |
---|---|---|---|---|
6223184011 | 1040 | 批量代扣 | 20220426 | 4850 |
6223184011 | 1040 | 批量代扣 | 20220426 | 4850 |
6222828235 | 1022 | 网联支付 | 20220426 | 12100 |
6222828235 | 1022 | 网联支付 | 20220426 | 899400 |
6222828234 | 1022 | 网联支付 | 20220426 | 400 |
6222828234 | 1022 | 网联支付 | 20220426 | 400 |
查询出发生重复(每个字段的内容都一样)记录的数据
--理解1:查询出重复数据中的一条
select t1.card_nbr
,t1.trans_type
,t1.desc_nar
,t1.inp_date
,t1.billamt
from t_tran t1
group by t1.card_nbr
,t1.trans_type
,t1.desc_nar
,t1.inp_date
,t1.billamt
having count(1) > 1;
--理解2:查出所有重复数据
select t1.card_nbr
,t1.trans_type
,t1.desc_nar
,t1.inp_date
,t1.bill_amt
from (
select t.card_nbr
,t.trans_type
,t.desc_nar
,t.inp_date
,t.bill_amt
,count(1)over(partition by t.card_nbr
,t.trans_type
,t.desc_nar
,t.inp_date
,t.bill_amt
) as count_row
from t_tran t
) t1
where t1.count_row > 1;
3、a-b
表a
id | product | type |
---|---|---|
1000100001 | 2016070500000001 | 1 |
1000100001 | 2014080100000001 | 2 |
1000100001 | 2017033000000001 | 1 |
表b
pc_id | id | level |
---|---|---|
1001 | 1000100001 | a |
1002 | 1000100001 | a |
1003 | 1000100001 | c |
1004 | 1000100002 | g |
1005 | 1000100002 | b |
1006 | 1000100003 | d |
1、请写出运行结果:select count(*) from a left join b on a.id=b.id;
9
a.id | product | type | pc_id | b.id | level |
---|---|---|---|---|---|
1000100001 | 2016070500000001 | 1 | 1001 | 1000100001 | a |
1000100001 | 2016070500000001 | 1 | 1002 | 1000100001 | a |
1000100001 | 2016070500000001 | 1 | 1003 | 1000100001 | c |
1000100001 | 2014080100000001 | 2 | 1001 | 1000100001 | a |
1000100001 | 2014080100000001 | 2 | 1002 | 1000100001 | a |
1000100001 | 2014080100000001 | 2 | 1003 | 1000100001 | c |
1000100001 | 2017033000000001 | 1 | 1001 | 1000100001 | a |
1000100001 | 2017033000000001 | 1 | 1002 | 1000100001 | a |
1000100001 | 2017033000000001 | 1 | 1003 | 1000100001 | c |
2、请写出运行结果:select count(*) from a right join b on a.id=b.id;
12
pc_id | b.id | level | a.id | product | type |
---|---|---|---|---|---|
1001 | 1000100001 | a | 1000100001 | 2016070500000001 | 1 |
1001 | 1000100001 | a | 1000100001 | 2014080100000001 | 2 |
1001 | 1000100001 | a | 1000100001 | 2017033000000001 | 1 |
1002 | 1000100001 | a | 1000100001 | 2016070500000001 | 1 |
1002 | 1000100001 | a | 1000100001 | 2014080100000001 | 2 |
1002 | 1000100001 | a | 1000100001 | 2017033000000001 | 1 |
1003 | 1000100001 | c | 1000100001 | 2016070500000001 | 1 |
1003 | 1000100001 | c | 1000100001 | 2014080100000001 | 2 |
1003 | 1000100001 | c | 1000100001 | 2017033000000001 | 1 |
1004 | 1000100002 | g | |||
1005 | 1000100002 | b | |||
1006 | 1000100003 | d |
3、请写出运行结果:select count(*) from a inner join b on a.id=b.id
9
a.id | product | type | pc_id | b.id | level |
---|---|---|---|---|---|
1000100001 | 2016070500000001 | 1 | 1001 | 1000100001 | a |
1000100001 | 2016070500000001 | 1 | 1002 | 1000100001 | a |
1000100001 | 2016070500000001 | 1 | 1003 | 1000100001 | c |
1000100001 | 2014080100000001 | 2 | 1001 | 1000100001 | a |
1000100001 | 2014080100000001 | 2 | 1002 | 1000100001 | a |
1000100001 | 2014080100000001 | 2 | 1003 | 1000100001 | c |
1000100001 | 2017033000000001 | 1 | 1001 | 1000100001 | a |
1000100001 | 2017033000000001 | 1 | 1002 | 1000100001 | a |
1000100001 | 2017033000000001 | 1 | 1003 | 1000100001 | c |
4、请写出运行结果:select count(*) from a full join b on a.id=b.id
12
pc_id | b.id | level | a.id | product | type |
---|---|---|---|---|---|
1001 | 1000100001 | a | 1000100001 | 2016070500000001 | 1 |
1001 | 1000100001 | a | 1000100001 | 2014080100000001 | 2 |
1001 | 1000100001 | a | 1000100001 | 2017033000000001 | 1 |
1002 | 1000100001 | a | 1000100001 | 2016070500000001 | 1 |
1002 | 1000100001 | a | 1000100001 | 2014080100000001 | 2 |
1002 | 1000100001 | a | 1000100001 | 2017033000000001 | 1 |
1003 | 1000100001 | c | 1000100001 | 2016070500000001 | 1 |
1003 | 1000100001 | c | 1000100001 | 2014080100000001 | 2 |
1003 | 1000100001 | c | 1000100001 | 2017033000000001 | 1 |
1004 | 1000100002 | g | |||
1005 | 1000100002 | b | |||
1006 | 1000100003 | d |
参考建表sql语句
create table a (id number ,product number,type number);
insert into a values(1000100001, 2016070500000001, 1);
insert into a values(1000100001, 2014080100000001, 2);
insert into a values(1000100001, 2017033000000001, 1);
commit;
create table b (pc_id number, id number, level_1 varchar2 (20));
insert into b values (1001, 1000100001, 'a');
insert into b values (1002, 1000100001, 'a');
insert into b values (1003, 1000100001, 'c');
insert into b values (1004, 1000100002, 'g');
insert into b values (1005, 1000100002, 'b');
insert into b values (1006, 1000100003, 'd');
commit;
4、t_prod
t_prod
cust_id | prod | amt | cnt |
---|---|---|---|
客户号 | 产品 | 金额 | 个数 |
1000100001 | a | 100 | 2 |
1000100001 | b | 200 | 1 |
1000100002 | b | 200 | 4 |
1000100003 | a | 100 | 1 |
1000100003 | b | 200 | 2 |
1、查询出同时持有a产品和b产品的客户
--表关联
select t1.cust_id
from t_prod t1 --'a'
inner join t_prod t2 --'b'
on t1.cust_id = t2.cust_id
and t1.prod = 'a'
and t2.prod = 'b';
--exists
select t1.cust_id
from t_prod t1 --'a'
where t1.prod = 'a'
and exists(select 1
from t_prod t2 --'b'
where t2.prod = 'b'
and t1.cust_id = t2.cust_id
);
--in
select t1.cust_id
from t_prod t1
where t1.prod = 'a'
and t1.cust_id in (
select t2.cust_id
from t_prod t2
where t2.prod = 'b'
);
--count()
select t1.cust_id
from t_prod t1
where t1.prod in('a','b')
group by t1.cust_id
having count(1) =2;
2、查询出只持有b产品的客户
--not in
select t1.cust_id
from t_prod t1
where t1.prod = 'b'
and t1.cust_id not in (
select t2.cust_id
from t_prod t2
where t2.prod = 'a'
);
--not exists
select *
from t_prod t
where t.prod = 'b'
and not exists(select t1.cust_id
from t_prod t1
where t1.prod <> 'b'
and t.cust_id = t1.cust_id
);
--先把只有一个产品的客户找到
--再看看他有没有 b 产品
select t1.cust_id
from (
select t.cust_id,
t.prod,
t.amt,
t.cnt
,count(t.prod)over(partition by t.cust_id) as c_p
from t_prod t
) t1
where t1.prod = 'b'
and t1.c_p = 1;
3、通过sql将表1转化为如下的表
cust_id | a | b |
---|---|---|
1000100001 | 200 | 200 |
1000100002 | 800 | |
1000100003 | 100 | 400 |
-- sum-case-when
select t.cust_id
,sum(case when t.prod='a' then t.amt*t.cnt end) as a
,sum(case when t.prod='b' then t.amt*t.cnt end) as b
from t_prod t
group by t.cust_id;
-- pivot
select *
from (
select t1.cust_id
,t1.prod
,t1.amt * t1.cnt as spend
from t_prod t1
) t2
pivot(sum(spend)
for prod in(
'a' as "a"
,'b' as "b"
)
);
参考建表语句
create table t_prod(cust_id number(20)
,prod varchar2(50)
,amt number(20)
,cnt number(20)
);
insert into t_prod values (1000100001 , 'a' , 100 , 2 ) ;
insert into t_prod values (1000100001 , 'b' , 200 , 1 ) ;
insert into t_prod values (1000100002 , 'b' , 200 , 4 ) ;
insert into t_prod values (1000100003 , 'a' , 100 , 1 ) ;
insert into t_prod values (1000100003 , 'b' , 200 , 2 ) ;
insert into t_prod values (1000100004 , 'c' , 100 , 1 ) ;
insert into t_prod values (1000100004 , 'b' , 200 , 2 ) ;
insert into t_prod values (1000100005 , 'c' , 200 , 2 ) ;
commit;