HiveSQL(Hive SQL)是Apache Hive所使用的SQL方言,专门用于在Hadoop上进行大规模数据处理。以下是一些常见的HiveSQL函数及其使用方法:
1. 聚合函数
COUNT
统计记录数。
SELECT COUNT(*) FROM employees;
SUM
计算数值列的总和。
SELECT SUM(salary) FROM employees;
AVG
计算数值列的平均值。
SELECT AVG(salary) FROM employees;
MAX
返回数值列的最大值。
SELECT MAX(salary) FROM employees;
MIN
返回数值列的最小值。
SELECT MIN(salary) FROM employees;
2. 字符串函数
CONCAT
连接两个或多个字符串。
SELECT CONCAT(first_name, ' ', last_name) AS full_name FROM employees;
SUBSTR
截取字符串的子串。
SELECT SUBSTR(first_name, 1, 3) AS short_name FROM employees;
LENGTH
返回字符串的长度。
SELECT LENGTH(first_name) AS name_length FROM employees;
UPPER
将字符串转换为大写。
SELECT UPPER(first_name) AS upper_name FROM employees;
LOWER
将字符串转换为小写。
SELECT LOWER(first_name) AS lower_name FROM employees;
3. 数学函数
ABS
返回数值的绝对值。
SELECT ABS(salary) AS abs_salary FROM employees;
CEIL
返回不小于指定数值的最小整数。
SELECT CEIL(salary) AS ceil_salary FROM employees;
FLOOR
返回不大于指定数值的最大整数。
SELECT FLOOR(salary) AS floor_salary FROM employees;
ROUND
将数值四舍五入到指定的小数位数。
SELECT ROUND(salary, 2) AS rounded_salary FROM employees;
4. 日期函数
CURRENT_DATE
返回当前日期。
SELECT CURRENT_DATE AS today FROM employees;
CURRENT_TIMESTAMP
返回当前时间戳。
SELECT CURRENT_TIMESTAMP AS now FROM employees;
DATEDIFF
计算两个日期之间的天数差。
SELECT DATEDIFF('2024-12-31', '2024-01-01') AS days_diff FROM employees;
DATE_ADD
在指定日期上加上指定的天数。
SELECT DATE_ADD('2024-01-01', 10) AS new_date FROM employees;
DATE_SUB
在指定日期上减去指定的天数。
SELECT DATE_SUB('2024-01-01', 10) AS new_date FROM employees;
YEAR, MONTH, DAY
分别提取日期的年份、月份和天。
SELECT YEAR(hire_date) AS hire_year, MONTH(hire_date) AS hire_month, DAY(hire_date) AS hire_day FROM employees;
5. 条件函数
IF
根据条件返回不同的值。
SELECT IF(salary > 5000, 'High', 'Low') AS salary_level FROM employees;
CASE
多条件判断,类似于SQL中的CASE WHEN。
SELECT
CASE
WHEN salary > 7000 THEN 'Very High'
WHEN salary > 5000 THEN 'High'
ELSE 'Low'
END AS salary_level
FROM employees;
6. 复杂类型函数
SIZE
返回数组或映射的大小。
SELECT SIZE(skill_set) AS skill_count FROM employees;
MAP_KEYS
返回映射的所有键。
SELECT MAP_KEYS(employee_info) AS keys FROM employees;
MAP_VALUES
返回映射的所有值。
SELECT MAP_VALUES(employee_info) AS values FROM employees;
ARRAY_CONTAINS
检查数组中是否包含指定元素。
SELECT ARRAY_CONTAINS(skill_set, 'Java') AS has_java_skill FROM employees;
7. JSON函数
GET_JSON_OBJECT
从JSON字符串中提取指定的字段。
SELECT GET_JSON_OBJECT(employee_json, '$.name') AS employee_name FROM employees;
JSON_TUPLE
将JSON字符串解析为多个列。
SELECT
JSON_TUPLE(employee_json, 'name', 'age', 'department')
FROM employees;
8. 窗口函数
ROW_NUMBER
为结果集中的每一行分配唯一的行号。
SELECT
employee_id,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS row_num
FROM employees;
RANK
为结果集中的每一行分配排名,排名相等的行具有相同的排名。
SELECT
employee_id,
RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS rank
FROM employees;
DENSE_RANK
为结果集中的每一行分配密集排名,排名相等的行具有相同的排名,但没有空缺排名。
SELECT
employee_id,
DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) AS dense_rank
FROM employees;
9. 其他常用函数
COALESCE
返回第一个非NULL的值。
SELECT COALESCE(middle_name, first_name, 'N/A') AS name FROM employees;
NVL
如果第一个参数为NULL,则返回第二个参数,否则返回第一个参数。
SELECT NVL(middle_name, 'N/A') AS middle_name FROM employees;
REGEXP_EXTRACT
使用正则表达式从字符串中提取子字符串。
SELECT REGEXP_EXTRACT(email, '([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+)', 1) AS email_address FROM employees;
SPLIT
将字符串按指定分隔符分割成数组。
SELECT SPLIT(full_name, ' ') AS name_parts FROM employees;