HiveQL: 查询

最新推荐文章于 2022-09-15 22:20:17 发布

hxhh

最新推荐文章于 2022-09-15 22:20:17 发布

阅读量991

点赞数 1

分类专栏： Hive 文章标签： Hive 查询

本文链接：https://blog.csdn.net/qq403580298/article/details/84976364

版权

Hive 专栏收录该内容

15 篇文章 1 订阅

订阅专栏

–创表语句

create table employees (
	name		string,
	salary		float,
	subordinates array<string>,
	deductions 	map<string,float>,
	address		struct<street:string, city:string ,statee:string, zip:int>
)
partitioned by (country string, state string);

–查询

select name, salary from employees;

–查询2(带命名)

select e.name, e.salary from employees e;

–查询带array的列

select name, subordinates from employees;

-->输出
John Doe	["mary Smith","Todd Jones"]
Mary Smith	["Bill King"]
Todd Jones	[]
Bill King 	[]

–查询带Map的列 (JSON格式)

select name, deductions from employees;
--> 输出
John Doe	["Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1]
Mary Smith	["Federal Taxes":0.2,"State Taxes":0.05,"Insurance":0.1]
Todd Jones	["Federal Taxes":0.15,"State Taxes":0.03,"Insurance":0.1]
Bill King 	["Federal Taxes":0.15,"State Taxes":0.03,"Insurance":0.1]

–查询带struct的列 (JSON格式)

select name, address from employees;
--> 输出
John Doe {"street":"1Michgan Ave","city":"Chicago","state":"1L","zip":60600}
Mary Smith {"street":"12Michgan Ave","city":"Chicago","state":"1L","zip":60601}
Todd Jones {"street":"13Michgan Ave","city":"Chicago","state":"1L","zip":60700}
Bill King {"street":"14Michgan Ave","city":"Chicago","state":"1L","zip":60100}

–查询列为数组，数组中的第一个元素（不存在的返回为NULL）

select name, deductions[0] from employees;
--> 输出
John Doe	mary Smith
Mary Smith	Bill King
Todd Jones	NULL
Bill King 	NULL

–查询列为Map，取Map某key的value值

select name, deductions["Federal Taxes"] from employees;
-->输出
John Doe	0.05
Mary Smith	0.05
Todd Jones	0.03
Bill King 	0.03

–查询列为stuct,使用. 来取值

select name, address.city from employees;
-->输出
John Doe	Chicago
Mary Smith	Chicago
Todd Jones	Oak Park
Bill King 	Obscuria

–使用正则表达式来制定列 (下面的查询会从表stocks中选择,symbol和所有列名以price作为前缀的列；)

select symbol , `price.*` from stocks;
-->输出
AAPL	195.69  179.14 ..略
AAPL	195.69  179.14 ..略
AAPL	195.69  179.14 ..略

–使用列值进行计算(可使用函数)

select upper(name), salary, deductions["Federal Taxes"],
round(salary * (1 - deductions["Federal Taxes"])) from employees;
-->输出
John Doe	100000.0	0.2		80000
Mary Smith	 80000.0	0.2		64000
Todd Jones	 70000.0	0.15	59500
Bill King 	 60000.0	0.15	51000

–算术运算符(略)
–使用函数(略)

–limit语句，限制返回行数

select upper(name), salary, deductions["Federal Taxes"],
round(salary * (1 - deductions["Federal Taxes"])) from employees
limit 2;
-->输出
John Doe	100000.0	0.2		80000
Mary Smith	 80000.0	0.2		64000

–列别名(新产生的列需要用别名)

select upper(name), salary, deductions["Federal Taxes"] as fed_taxes,
round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes
from employees limit 2;
-->输出
John Doe	100000.0	0.2		80000
Mary Smith	 80000.0	0.2		64000

–嵌套select语句的查询

from (
	select upper(name), salary, deductions["Federal Taxes"] as fed_taxes,
	round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes
	from employees 
)e
select e.name, e.salary_minus_fed_taxes
where e.salary_minus_fed_taxes > 700000;
-->输出
John Doe	100000.0	0.2		80000

–CASE … WHEN … THEN 语句(处理单个列的查询结果,结果会多一列)

select name, salary,
	case
		when salary < 50000.0 then 'low'
		when salary >= 50000.0 and salary < 70000.0 then 'middle'
		when salary >= 70000.0 and salary < 100000.0 then 'high' 
		else 'very high'
	end as bracket
from employees;
-->输出
John Doe	100000.0 very high
Mary Smith   80000.0 high
Todd Jones   70000.0 high
Bill King    60000.0 middel
....

–开启本地模式，系统会尝试使用本地模式(效率↑)

set hive.exec.mode.local.auto=true;

–where语句(selec筛选字段，where过滤字段)
–选出美国的加利佛尼亚的：

select * from employees
where country = 'US' and state = 'CA';

–薪资大于70000的

select upper(name), salary, deductions["Federal Taxes"],
round(salary * (1 - deductions["Federal Taxes"])) from employees
where round(salary * (1 - deductions["Federal Taxes"])) > 70000;

**–薪资大于70000的(使用别名) 但是where不可以使用列别用，要使用一个"表.列别名"来调用

select e.* from 
(
	select upper(name), salary, deductions["Federal Taxes"] as ded,
	round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes
	from employees
)e
where round(e.salary_minus_fed_taxes) > 70000;

–谓词操作符 A<>B
A或者B为null则返回NULL;

–浮点数比较(避免精度问题造成错误)

select name, salary, deductions['Federal Taxes'] 
from employees
where deductions['Federal Taxes'] > cast(0.2 as float);

–like的谓词使用 (开头)

select name, address.street 
from employees
where address.street like '%Ave.';

–like的谓词使用 (结尾)

select name, address.street 
from employees
where address.street like 'O%';

–like的谓词使用 (中间包含)

select name, address.street 
from employees
where address.street like '%Chi%';

–Rlike + 正则表达式1 (街道含有Chicago或者Ontario的)

select name, address.street
from employees
where address.street RLIKE '.*(Chicago|Ontario).*';

–Rlike + 正则表达式2 (街道含有Chicago或者Ontario的)

select name, address.street
from employees
where address.street like '%Chicago%' or address.street like '%Ontario%';

–group by (通常会和聚合函数一起使用)
–(按照年份对股票进行分组，然后计算每年的平均收盘价)

select year(ymd), avg(price_close) from stocks
where exchange = 'NASDAQ' and symbol='AAPL'
group by year(ymd);

–having (对group by语句产生的分组进行条件过滤)

select year(ymd), avg(price_close) from stocks
where exchange = 'NASDAQ' and symbol='AAPL'
group by year(ymd)
having avg(price_close) > 50.0;

--↑ 如果不使用having 则需要嵌套一个select去完成(不建议使用)

select s2.year s2.avg from
( 
	select year(ymd) as year,avg(price_close) as avg 
	from stocks where exchange = 'NASDAQ' and symbol='APPL'
	group by year(ymd)
) s2
where s2.avg > 50.0;

–join
–inner join = join 内连接 (只有进行连接的两个表中都存在于连接标准相匹配的数据才会保留下来)

–查询对苹果公司的股价和IBM公司的股价进行比较。(自连接)

select a.ymd, a.price_close, b.price_close
from stocks a join stocks b on a.ymd = b.ymd
where a.symbol = 'AAPL' and b.symbol = 'IBM'
-->输出
2010-01-04	214.01	132.45
...

–非自连接的inner join (多表join)

select a.ymd, a.price_close, b.price_close, c.price_close
from stocks a join stocks b on a.ymd = b.ymd 
			  join stocks c on a.ymd = c.ymd
where a.symbol = 'AAPL' and b.symbol = 'IBM' and c.symbol = 'GE';

–join优化
1.小表放在连接的左侧，依次增大。

–left outer join 左外连接
–(join操作左边表中符合where子句的所有记录将会被返回，join操作符右边表中如果没有符合on后面连接条件的记录时，那么从右边表指定的选择的列的值，将会是NULL)

select s.ymd, s.symbol, s.price_close, d.dividend
from stocks s left outer join dividends d on s.ymd = d.ymd and s.symbol = d.symbol
where s.symbol = 'AAPL';
-->输出
1987-05-01	AAPL	80.0	NULL
1987-05-05	AAPL	79.75	NULL
1987-05-11	AAPL	80.25	0.015
......

–outer join

select s.ymd, s.symbol, s.price_close, d.dividend
from stocks s left outer join dividends d on s.ymd = d.ymd and s.symbol = y.symbol
where s.symbol = 'AAPL' and s.exchange = 'NASDAQ';

–left outer join 外连接会忽略掉分区过滤条件 (使用嵌套select来实现 outer join) ?

select s.ymd, s.symbol, s.price_close, d.dividend from
(select * from stocks where symbol = 'AAPL' and exchange = 'NASDAQ') s
left outer join 
(select * from dividends where symbol = 'AAPL' and exchange = 'NASDAQ') d
on s.ymd = d.ymd;

**–知识点: SQL会先执行join，再执行where

–Right outer join (右外链接，会返回右边表所有符合WHERE语的记录，左表中撇配不上的字段值用NULL代替)

select s.ymd, s.symbol, s.price_close, d.dividend 
from dividends d RIGHT OUTER JOIN stocks s On d.ymd = s.ymd AND d.symbol = s.symbol
where s.symbol = 'AAPL' ;

–full outer join (完全外链接：讲会返回表中符合where语句条件的所有记录)

select s.ymd, s.symbol, s.price_close, d.dividend 
from dividends FULL OUTER JOIN stocks s on d.ymd = s.ymd and d.symbol = s.symbol
where s.symbol = 'AAPL';

–返回限定的股息支付日内的股票交易记录，不过这个查询hive是不支持的
–hive不支持这种IN的查询(X)，需要使用下面的代替

select s.ymd, s.symbol, s.price_close 
from stocks s
where s.ymd = s.symbol in
(select d.ymd, d.symbol from dividends d);

–left semi-join (与left join效果一样，只是更加高效)
–(左半开连接：会返回左表的记录，前提是其记录对于右边表满足ON语句中的判定条件)

select s.ymd, s.symbol, s.price_close, d.dividend
from stocks s left SEMI JOIN dividends d on s.ymd = d.ymd and s.symbol = d.symbol;

–笛卡尔积JOIN (是一种连接，表示左边的行数乘以右表的行数等于笛卡尔积的结果大小)
–就是没有条件(即没有on)的内连接

select * from stocks join dividends;

–map-side Join 略
–order by (对查询的结果进行一个全局的排序)

select s.ymd, s.symbol, s.price_close
from stocks s
order by s.ymd ASC, s.symbol desc;

–sort by

select s.ymd, s.symbol, s.price_close
from stocks s
order by s.ymd asc, s.symbol desc;

–sort by 和 distribute by

select s.ymd, s.symbol, s.price_close
from stocks s
distrubute by s.symbol  -- 用来保证具有相同股票交易码的记录会分发到同一个reducer
sort by s.symbol ASC, s.ymd ASC;

–cluster by

select s.ymd, s.symbol, s.price_close
from stocks s
cluster by s.symbol;  --相当于合并distrubute by 和 sort by

–类型转换 (强烈推荐，round()和floor()函数)

select name, salary, from employees
where cast(salary as float) < 1000000;

–值类型转换binary值

select (2.0*cast(cast(b as string) as double)) from src;

–抽样查询(略？)
–数据块抽样(略？)
–分桶表的输入剪裁(略？)

–union all(可以将2个或者多个表合并)(每个子查询必须具有相同的列，而且对应的每个字段的字段类型必须是一致的)

select log.ymd, log.level, log.message
from(
	select l1.ymd, l1.level, l1.message, 'Log1' as source
	from log1 l1
	UNION ALL 
	select l2.ymd, l2.level, l2.message, 'Log2' as source
	from log1 l2
) log
sort by log.ymd asc;

–参考《Hive编程指南》