SELECT … FROM Clauses
- hive> SELECT name, salary FROM employees;
表别名
- hive> SELECT name, salary FROM employees;
- hive> SELECT e.name, e.salary FROM employees e;
Specify Columns with Regular Expressions
- hive> SELECT symbol, `price.*` FROM stocks;
- AAPL 195.69 197.88 194.0 194.12 194.12
- AAPL 192.63 196.0 190.85 195.46 195.46
- AAPL 196.73 198.37 191.57 192.05 192.05
- AAPL 195.17 200.2 194.42 199.23 199.23
- AAPL 195.91 196.32 193.38 195.86 195.86
Computing with Column Values
- hive> SELECT upper(name), salary, deductions["Federal Taxes"],
- > round(salary * (1 - deductions["Federal Taxes"])) FROM employees;
- JOHN DOE 100000.0 0.2 80000
- MARY SMITH 80000.0 0.2 64000
- TODD JONES 70000.0 0.15 59500
- BILL KING 60000.0 0.15 51000
Arithmetic Operators --算数运算
Mathematical functions
Aggregate functions -- 集合函数
- hive> SET hive.map.aggr=true;
- hive> SELECT count(*), avg(salary) FROM employees;
Table generating functions
- hive> SELECT explode(subordinates) AS sub FROM employees;
- Mary Smith
- Todd Jones
- Bill King
Other built-in functions
LIMIT Clause --限制行数
- hive> SELECT upper(name), salary, deductions["Federal Taxes"],
- > round(salary * (1 - deductions["Federal Taxes"])) FROM employees
- > LIMIT 2;
- JOHN DOE 100000.0 0.2 80000
- MARY SMITH 80000.0 0.2 64000
Column Aliases --列别名
- hive> SELECT upper(name), salary, deductions["Federal Taxes"] as fed_taxes,
- > round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes
- > FROM employees LIMIT 2;
- JOHN DOE 100000.0 0.2 80000
- MARY SMITH 80000.0 0.2 64000
Nested SELECT Statements --子查询
- hive> FROM (
- > SELECT upper(name), salary, deductions["Federal Taxes"] as fed_taxes,
- > round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes
- > FROM employees
- > ) e
- > SELECT e.name, e.salary_minus_fed_taxes
- > WHERE e.salary_minus_fed_taxes > 70000;
- JOHN DOE 100000.0 0.2 80000
CASE … WHEN … THEN Statements --case 关键字
- hive> SELECT name, salary,
- > CASE
- > WHEN salary < 50000.0 THEN 'low'
- > WHEN salary >= 50000.0 AND salary < 70000.0 THEN 'middle'
- > WHEN salary >= 70000.0 AND salary < 100000.0 THEN 'high'
- > ELSE 'very high'
- > END AS bracket FROM employees;
- John Doe 100000.0 very high
- Mary Smith 80000.0 high
- Todd Jones 70000.0 high
- Bill King 60000.0 middle
- Boss Man 200000.0 very high
- Fred Finance 150000.0 very high
- Stacy Accountant 60000.0 middle
WHERE Clauses -- 筛选
- SELECT * FROM employees
- WHERE country = 'US' AND state = 'CA';
- hive> SELECT name, salary, deductions["Federal Taxes"],
- > salary * (1 - deductions["Federal Taxes"])
- > FROM employees
- > WHERE round(salary * (1 - deductions["Federal Taxes"])) > 70000;
- John Doe 100000.0 0.2 80000.0
- hive> SELECT name, salary, deductions["Federal Taxes"],
- > salary * (1 - deductions["Federal Taxes"]) as salary_minus_fed_taxes
- > FROM employees
- > WHERE round(salary_minus_fed_taxes) > 70000;
- FAILED: Error in semantic analysis: Line 4:13 Invalid table alias or
- column reference 'salary_minus_fed_taxes': (possible column names are:
- name, salary, subordinates, deductions, address)
- hive> SELECT e.* FROM
- > (SELECT name, salary, deductions["Federal Taxes"] as ded,
- > salary * (1 - deductions["Federal Taxes"]) as salary_minus_fed_taxes
- > FROM employees) e
- > WHERE round(e.salary_minus_fed_taxes) > 70000;
- John Doe 100000.0 0.2 80000.0
- Boss Man 200000.0 0.3 140000.0
- Fred Finance 150000.0 0.3 105000.0
Predicate Operators
LIKE and RLIKE
- hive> SELECT name, address.street FROM employees WHERE address.street LIKE '%Ave.';
- John Doe 1 Michigan Ave.
- Todd Jones 200 Chicago Ave.
- hive> SELECT name, address.city FROM employees WHERE address.city LIKE 'O%';
- Todd Jones Oak Park
- Bill King Obscuria
- hive> SELECT name, address.street FROM employees WHERE address.street LIKE '%Chi%';
- Todd Jones 200 Chicago Ave.
- hive> SELECT name, address.street
- > FROM employees WHERE address.street RLIKE '.*(Chicago|Ontario).*';
- Mary Smith 100 Ontario St.
- Todd Jones 200 Chicago Ave.
- SELECT name, address FROM employees
- WHERE address.street LIKE '%Chicago%' OR address.street LIKE '%Ontario%';
GROUP BY Clauses
- hive> SELECT year(ymd), avg(price_close) FROM stocks
- > WHERE exchange = 'NASDAQ' AND symbol = 'AAPL'
- > GROUP BY year(ymd);
- 1984 25.578625440597534
- 1985 20.193676221040867
- 1986 32.46102808021274
- 1987 53.88968399108163
- 1988 41.540079275138766
- 1989 41.65976212516664
- 1990 37.56268799823263
- 1991 52.49553383386182
- 1992 54.80338610251119
- 1993 41.02671956450572
- 1994 34.0813495847914
HAVING Clauses
- hive> SELECT year(ymd), avg(price_close) FROM stocks
- > WHERE exchange = 'NASDAQ' AND symbol = 'AAPL'
- > GROUP BY year(ymd)
- > HAVING avg(price_close) > 50.0;
- 1987 53.88968399108163
- 1991 52.49553383386182
- 1992 54.80338610251119
- 1999 57.77071460844979
- 2000 71.74892876261757
- 2005 52.401745992993554
Inner JOIN
- hive> SELECT a.ymd, a.price_close, b.price_close
- > FROM stocks a JOIN stocks b ON a.ymd = b.ymd
- > WHERE a.symbol = 'AAPL' AND b.symbol = 'IBM';
- 2010-01-04 214.01 132.45
- 2010-01-05 214.38 130.85
- 2010-01-06 210.97 130.0
- 2010-01-07 210.58 129.55
- 2010-01-08 211.98 130.85
- 2010-01-11 210.11 129.48
Example 6-1. Query that will not work in Hive
- SELECT a.ymd, a.price_close, b.price_close
- FROM stocks a JOIN stocks b
- ON a.ymd <= b.ymd
- WHERE a.symbol = 'AAPL' AND b.symbol = 'IBM';
LEFT OUTER JOIN
- hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
- > FROM stocks s LEFT OUTER JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol
- > WHERE s.symbol = 'AAPL';
- ...
- 1987-05-01 AAPL 80.0 NULL
- 1987-05-04 AAPL 79.75 NULL
- 1987-05-05 AAPL 80.25 NULL
- 1987-05-06 AAPL 80.0 NULL
- 1987-05-07 AAPL 80.25 NULL
- 1987-05-08 AAPL 79.0 NULL
- 1987-05-11 AAPL 77.0 0.015
- 1987-05-12 AAPL 75.5 NULL
- 1987-05-13 AAPL 78.5 NULL
- 1987-05-14 AAPL 79.25 NULL
- 1987-05-15 AAPL 78.25 NULL
- 1987-05-18 AAPL 75.75 NULL
- 1987-05-19 AAPL 73.25 NULL
- 1987-05-20 AAPL 74.5 NULL
- ...
OUTER JOIN Gotcha
- hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
- > FROM stocks s LEFT OUTER JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol
- > WHERE s.symbol = 'AAPL'
- > AND s.exchange = 'NASDAQ' AND d.exchange = 'NASDAQ';
- 1987-05-11 AAPL 77.0 0.015
- 1987-08-10 AAPL 48.25 0.015
- 1987-11-17 AAPL 35.0 0.02
- 1988-02-12 AAPL 41.0 0.02
- 1988-05-16 AAPL 41.25 0.02
- ...
- hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend FROM
- > (SELECT * FROM stocks WHERE symbol = 'AAPL' AND exchange = 'NASDAQ') s
- > LEFT OUTER JOIN
- > (SELECT * FROM dividends WHERE symbol = 'AAPL' AND exchange = 'NASDAQ') d
- > ON s.ymd = d.ymd;
- ...
- 1988-02-10 AAPL 41.0 NULL
- 1988-02-11 AAPL 40.63 NULL
- 1988-02-12 AAPL 41.0 0.02
- 1988-02-16 AAPL 41.25 NULL
- 1988-02-17 AAPL 41.88 NULL
RIGHT OUTER JOIN
- hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
- > FROM dividends d RIGHT OUTER JOIN stocks s ON d.ymd = s.ymd AND d.symbol = s.symbol
- > WHERE s.symbol = 'AAPL';
- ...
- 1987-05-07 AAPL 80.25 NULL
- 1987-05-08 AAPL 79.0 NULL
- 1987-05-11 AAPL 77.0 0.015
- 1987-05-12 AAPL 75.5 NULL
- 1987-05-13 AAPL 78.5 NULL
FULL OUTER JOIN
- hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
- > FROM dividends d FULL OUTER JOIN stocks s ON d.ymd = s.ymd AND d.symbol = s.symbol
- > WHERE s.symbol = 'AAPL';
- ...
- 1987-05-07 AAPL 80.25 NULL
- 1987-05-08 AAPL 79.0 NULL
- 1987-05-11 AAPL 77.0 0.015
- 1987-05-12 AAPL 75.5 NULL
- 1987-05-13 AAPL 78.5 NULL
- ...
LEFT SEMI-JOIN
Example 6-2. Query that will not work in Hive
- SELECT s.ymd, s.symbol, s.price_close FROM stocks s
- WHERE s.ymd, s.symbol IN
- (SELECT d.ymd, d.symbol FROM dividends d);
Instead, you use the following LEFT SEMI JOIN syntax:
- hive> SELECT s.ymd, s.symbol, s.price_close
- > FROM stocks s LEFT SEMI JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol;
- ...
- 1962-11-05 IBM 361.5
- 1962-08-07 IBM 373.25
- 1962-05-08 IBM 459.5
- 1962-02-06 IBM 551.5
ORDER BY and SORT BY
Here is an example using ORDER BY:
- SELECT s.ymd, s.symbol, s.price_close
- FROM stocks s
- ORDER BY s.ymd ASC, s.symbol DESC;
Here is the same example using SORT BY instead:
- SELECT s.ymd, s.symbol, s.price_cl
- FROM stocks s
- SORT BY s.ymd ASC, s.symbol DESC;
Casting
- SELECT name, salary FROM employees
- WHERE cast(salary AS FLOAT) < 100000.0;
Queries that Sample Data -- 抽样
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
- 2
- 4
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
- 7
- 10
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON number) s;
- 2
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 5 OUT OF 10 ON number) s;
- 4
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON number) s;
- 2
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 1 OUT OF 2 ON number) s;
- 2
- 4
- 6
- 8
- 10
- hive> SELECT * from numbers TABLESAMPLE(BUCKET 2 OUT OF 2 ON number) s;
- 1
- 3
- 5
- 7
- 9
UNION ALL
- SELECT log.ymd, log.level, log.message
- FROM (
- SELECT l1.ymd, l1.level,
- l1.message, 'Log1' AS source
- FROM log1 l1
- UNION ALL
- SELECT l2.ymd, l2.level,
- l2.message, 'Log2' AS source
- FROM log1 l2
- ) log
- SORT BY log.ymd ASC;
- FROM (
- FROM src SELECT src.key, src.value WHERE src.key < 100
- UNION ALL
- FROM src SELECT src.* WHERE src.key > 110
- ) unioninput
- INSERT OVERWRITE DIRECTORY '/tmp/union.out' SELECT unioninput.*