Hive常用函数大全（一）（关系/数学/逻辑/数值/日期/条件/字符串/集合统计/复杂类型）

最新推荐文章于 2024-04-11 22:45:44 发布

sinat_36710456

最新推荐文章于 2024-04-11 22:45:44 发布

阅读量403

点赞数

分类专栏：大数据文章标签： hive

大数据专栏收录该内容

53 篇文章 1 订阅

订阅专栏

测试数据 data.txt 文件

zhangsa   dfsadsa323   new   67.1   2
lisi   543gfd   old   43.32   1
wanger   65ghf   new   88.88 | 3
liiu   fdsfagwe   new   66.0   1
qibaqi   fds   new   54.32   1
wangshi   f332   old   77.77   2
liwei   hfd   old   88.44   3
wutong   543gdfsd   new   56.55   6
lilisi   dsfgg   new   88.88   5
qishili   fds    new   66.66   5

create external table if not exists order_detail(
user_id string,
device_id string,
user_type string,
price double,
sales int
)
row format delimited fields terminated by '\t'
lines terminated by '\n' 
stored as textfile;

# 在hive cli 命令窗口中上传文件到hdfs文件系统 
hive (default)> dfs -put /home/liguodong/data/data.txt /temp/lgd

# 加载文件
hive (default)> load data inpath '/temp/lgd/data.txt'  overwrite into table order_detail;

select * from order_detail;
+----------+-------------+------------+--------+--------+--+
| user_id  |  device_id  | user_type  | price  | sales  |
+----------+-------------+------------+--------+--------+--+
| zhangsa  | dfsadsa323  | new        | 67.1   | 2      |
| lisi     | 543gfd      | old        | 43.32  | 1      |
| wanger   | 65ghf       | new        | 88.88  | 3      |
| liiu     | fdsfagwe    | new        | 66.0   | 1      |
| qibaqiu  | fds         | new        | 54.32  | 1      |
| wangshi  | f332        | old        | 77.77  | 2      |
| liwei    | hfd         | old        | 88.44  | 3      |
| wutong   | 543gdfsd    | new        | 56.55  | 6      |
| lilisi   | dsfgg       | new        | 88.88  | 5      |
| qishili  | fds         | new        | 66.66  | 5      |
+----------+-------------+------------+--------+--------+--+

关系运算

## > < =
##注意： String 的比较要注意(常用的时间比较可以先 to_date 之后再比较)
select long_time>short_time, long_time<short_time,long_time=short_time, to_date(long_time)=to_date(short_time)
from 
(
    select '2017-01-11 00:00:00' as long_time, '2017-01-11' as short_time
    from 
    order_detail limit 1
)bb;

result:
true    false   false   true

## 空值判断
select 1 from order_detail where NULL is NULL limit 1;

## 非空判断
select 1 from order_detail where 1 is not NULL limit 1;

## LIKE
语法: A LIKE B
描述: 字符串A符合表达式B的正则语法,则为TRUE;否则为FALSE. B中字符”_”表示任意单个字符，而字符”%”表示任意数量的字符。

select user_id from order_detail where user_id like 'li%';

+----------+--+
| user_id  |
+----------+--+
| lisi     |
| liiu     |
| liwei    |
| lilisi   |
+----------+--+

select user_id from order_detail where user_id like 'li__';

+----------+--+
| user_id  |
+----------+--+
| lisi     |
| liiu     |
+----------+--+


## RLIKE
语法: A RLIKE B
描述: 字符串A符合JAVA正则表达式 B 的正则语法，则为 TRUE；否则为 FALSE。


select user_id from order_detail where user_id rlike '^l.*i$';

+----------+--+
| user_id  |
+----------+--+
| lisi     |
| liwei    |
| lilisi   |
+----------+--+


select device_id from order_detail where device_id rlike '^\\w+$';

+-------------+--+
|  device_id  |
+-------------+--+
| dfsadsa323  |
| 543gfd      |
| 65ghf       |
| fdsfagwe    |
| fds         |
| f332        |
| hfd         |
| 543gdfsd    |
| dsfgg       |
| fds         |
+-------------+--+

select device_id from order_detail where device_id rlike '^[a-zA-Z]+$';

+------------+--+
| device_id  |
+------------+--+
| fdsfagwe   |
| fds        |
| hfd        |
| dsfgg      |
| fds        |
+------------+--+

select device_id from order_detail where device_id rlike '^[a-zA-Z]{4,}$';

+------------+--+
| device_id  |
+------------+--+
| fdsfagwe   |
| dsfgg      |
+------------+--+


## REGEXP
语法: A REGEXP B
描述: 功能与 RLIKE 相同
select device_id from order_detail where device_id REGEXP '^[a-zA-Z]{4,}$';

+------------+--+
| device_id  |
+------------+--+
| fdsfagwe   |
| dsfgg      |
+------------+--+

数学运算

## + - * / 
注意：
hive中最高精度的数据类型是 double,只精确到小数点后16位，在做除法运算的时候要特别注意

select user_id,100-price,sales-1,price*sales,price/sales,
ceil(28.0/6.999999999999999999999),ceil(28.0/6.99999999999999),
price%sales from order_detail;

+----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+
| user_id  |         _c1         | _c2  |         _c3         |         _c4         | _c5  | _c6  |         _c7         |
+----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+
| zhangsa  | 32.900000000000006  | 1    | 134.2               | 33.55               | 4    | 5    | 1.0999999999999943  |
| lisi     | 56.68               | 0    | 43.32               | 43.32               | 4    | 5    | 0.3200000000000003  |
| wanger   | 11.120000000000005  | 2    | 266.64              | 29.626666666666665  | 4    | 5    | 1.8799999999999955  |
| liiu     | 34.0                | 0    | 66.0                | 66.0                | 4    | 5    | 0.0                 |
| qibaqiu  | 45.68               | 0    | 54.32               | 54.32               | 4    | 5    | 0.3200000000000003  |
| wangshi  | 22.230000000000004  | 1    | 155.54              | 38.885              | 4    | 5    | 1.769999999999996   |
| liwei    | 11.560000000000002  | 2    | 265.32              | 29.48               | 4    | 5    | 1.4399999999999977  |
| wutong   | 43.45               | 5    | 339.29999999999995  | 9.424999999999999   | 4    | 5    | 2.549999999999997   |
| lilisi   | 11.120000000000005  | 4    | 444.4               | 17.776              | 4    | 5    | 3.8799999999999955  |
| qishili  | 33.34               | 4    | 333.29999999999995  | 13.331999999999999  | 4    | 5    | 1.6599999999999966  |
+----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+

## %
注意：
精度在 hive 中是个很大的问题，类似这样的操作最好通过 round 指定精度
select 8.4 % 4,round(8.4 % 4 , 2) from order_detail limit 1;

+----------------------+------+--+
|         _c0          | _c1  |
+----------------------+------+--+
| 0.40000000000000036  | 0.4  |
+----------------------+------+--+


## 位与& 位或| 位异或^ 位取反~

select 4&6, 8&4, 4|8,6|8,4^8,6^4,~6,~3 from order_detail limit 1;

逻辑运算

## 逻辑与AND 逻辑或OR 逻辑非NOT
注意：优先级一次为NOT AND OR

## 以下两条SQL互斥
select user_id from order_detail where not ((user_id='wanger' or user_id like 'li%') and user_type='old');

+----------+--+
| user_id  |
+----------+--+
| zhangsa  |
| wanger   |
| liiu     |
| qibaqiu  |
| wangshi  |
| wutong   |
| lilisi   |
| qishili  |
+----------+--+

select user_id from order_detail where ((user_id='wanger' or user_id like 'li%') and user_type='old');

+----------+--+
| user_id  |
+----------+--+
| lisi     |
| liwei    |
+----------+--+

数值计算函数

## 取整: round
语法: round(double a)
说明: 遵循四舍五入

## 指定精度取整: round
语法: round(double a, int d)

## 向下取整: floor
说明: 返回等于或者小于该 double 变量的最大的整数

## 向上取整: ceil
说明: 返回等于或者大于该 double 变量的最小的整数

## 向上取整: ceiling
说明: 与ceil功能相同

select user_id,price,round(price),round(price,0),round(price,1),floor(price),ceil(price),ceiling(price) from order_detail;

+----------+--------+-------+-------+-------+------+------+------+--+
| user_id  | price  |  _c2  |  _c3  |  _c4  | _c5  | _c6  | _c7  |
+----------+--------+-------+-------+-------+------+------+------+--+
| zhangsa  | 67.1   | 67.0  | 67.0  | 67.1  | 67   | 68   | 68   |
| lisi     | 43.32  | 43.0  | 43.0  | 43.3  | 43   | 44   | 44   |
| wanger   | 88.88  | 89.0  | 89.0  | 88.9  | 88   | 89   | 89   |
| liiu     | 66.0   | 66.0  | 66.0  | 66.0  | 66   | 66   | 66   |
| qibaqiu  | 54.32  | 54.0  | 54.0  | 54.3  | 54   | 55   | 55   |
| wangshi  | 77.77  | 78.0  | 78.0  | 77.8  | 77   | 78   | 78   |
| liwei    | 88.44  | 88.0  | 88.0  | 88.4  | 88   | 89   | 89   |
| wutong   | 56.55  | 57.0  | 57.0  | 56.6  | 56   | 57   | 57   |
| lilisi   | 88.88  | 89.0  | 89.0  | 88.9  | 88   | 89   | 89   |
| qishili  | 66.66  | 67.0  | 67.0  | 66.7  | 66   | 67   | 67   |
+----------+--------+-------+-------+-------+------+------+------+--+


## 取随机数: rand
说明: 返回一个 0 到 1 范围内的随机数。如果指定种子 seed(整数)，则会得到一个稳定的随机数序列。

## 自然指数: exp  自然对数: ln

select user_id,sales,price,rand(),rand(sales),exp(sales),ln(price) from order_detail;

+----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+
| user_id  | sales  | price  |         _c3          |         _c4          |         _c5         |         _c6         |
+----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+
| zhangsa  | 2      | 67.1   | 0.708066911383928    | 0.7311469360199058   | 7.38905609893065    | 4.206184043977636   |
| lisi     | 1      | 43.32  | 0.509552420396184    | 0.9014476240300544   | 2.718281828459045   | 3.76861442213279    |
| wanger   | 3      | 88.88  | 0.2462642074705902   | 0.49682259343089075  | 20.085536923187668  | 4.487287145331375   |
| liiu     | 1      | 66.0   | 0.720800913818751    | 0.9858769332362016   | 2.718281828459045   | 4.189654742026425   |
| qibaqiu  | 1      | 54.32  | 0.7358273250797408   | 0.8571240443456863   | 2.718281828459045   | 3.9948924832504407  |
| wangshi  | 2      | 77.77  | 0.11092554405907218  | 0.9874208338984266   | 7.38905609893065    | 4.353755752706852   |
| liwei    | 3      | 88.44  | 0.5162574691353392   | 0.2281579303734177   | 20.085536923187668  | 4.482324355989245   |
| wutong   | 6      | 56.55  | 0.2753658209591686   | 0.07479382813444624  | 403.4287934927351   | 4.03512520256213    |
| lilisi   | 5      | 88.88  | 0.25777632824045826  | 0.7431577182910525   | 148.4131591025766   | 4.487287145331375   |
| qishili  | 5      | 66.66  | 0.06419187859857822  | 0.9495832704567262   | 148.4131591025766   | 4.199605072879594   |
+----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+


## 以10为底对数: log10  以2为底对数: log2 

## 对数: log
语法: log(double base, double a)
说明: 返回以 base 为底的 a 的对数

select log10(100),log2(8),log(4,256) from order_detail limit 1;

+------+------+------+--+
| _c0  | _c1  | _c2  |
+------+------+------+--+
| 2.0  | 3.0  | 4.0  |
+------+------+------+--+

## 幂运算: pow power  开平方: sqrt

select pow(2,4), power(2,4),sqrt(16) from order_detail limit 1;

+-------+-------+------+--+
|  _c0  |  _c1  | _c2  |
+-------+-------+------+--+
| 16.0  | 16.0  | 4.0  |
+-------+-------+------+--+

## 二进制: bin  十六进制: hex  反转十六进制: unhex  

## 进制转换: conv
语法: conv(BIGINT num, int from_base, int to_base)
说明: 将数值 num 从 from_base 进制转化到 to_base 进制

select bin(7),hex('19'),hex('abc'),unhex('616263'),unhex('41'),conv(17,10,16),conv(17,10,2) from order_detail limit 1;

+------+-------+---------+------+------+------+--------+--+
| _c0  |  _c1  |   _c2   | _c3  | _c4  | _c5  |  _c6   |
+------+-------+---------+------+------+------+--------+--+
| 111  | 3139  | 616263  | abc  | A    | 11   | 10001  |
+------+-------+---------+------+------+------+--------+--+

## 绝对值:abs  正取余:pmod  正弦:sin  反正弦:asin  余弦:cos  反余弦:acos  返回A的值:positive  返回A的相反数:negative

select abs(-13),abs(10.10),pmod(9,4),pmod(-9,4),
sin(0.8),asin(0.7173560908995228),cos(0.9), acos(0.6216099682706644),
positive(-10),negative(-10)
from order_detail limit 1;

+------+-------+------+------+---------------------+------+---------------------+------+------+------+--+
| _c0  |  _c1  | _c2  | _c3  |         _c4         | _c5  |         _c6         | _c7  | _c8  | _c9  |
+------+-------+------+------+---------------------+------+---------------------+------+------+------+--+
| 13   | 10.1  | 1    | 3    | 0.7173560908995228  | 0.8  | 0.6216099682706644  | 0.9  | -10  | 10   |
+------+-------+------+------+---------------------+------+---------------------+------+------+------+--+

日期函数

## UNIX时间戳转日期: from_unixtime

## 日期转UNIX时间戳,指定格式日期转UNIX 时间戳,获取当前UNIX时间戳: unix_timestamp
说明: 转换格式为"yyyy-MM-dd HH:mm:ss"的日期到 UNIX 时间戳。如果转化失败，则返回 0。

select 
    from_unixtime(1323308943),
    from_unixtime(1323308943,'yyyyMMdd'),
    unix_timestamp(),
    unix_timestamp('2017-12-07 16:01:03'),
    unix_timestamp('20171207 16-01-03','yyyyMMdd HH-mm-ss')
from 
order_detail limit 1;
+----------------------+-----------+-------------+-------------+-------------+--+
|         _c0          |    _c1    |     _c2     |     _c3     |     _c4     |
+----------------------+-----------+-------------+-------------+-------------+--+
| 2011-12-08 09:49:03  | 20111208  | 1489029488  | 1512633663  | 1512633663  |
+----------------------+-----------+-------------+-------------+-------------+--+

## 日期时间转日期:to_date  日期转年:year 日期转月:month 日期转天:day 日期转小时:hour  日期转分钟:minute  日期转秒:second

select
to_date('2016-12-08 10:03:01'),
year('2016-12-08 10:03:01'),
month('2016-12-08'),
day('2016-12-08 10:03:01'),
hour('2016-12-08 10:03:01'),
minute('2016-12-08 10:03:01'),
second('2016-12-08 10:03:01')
from 
order_detail limit 1;
+-------------+-------+------+------+------+------+------+--+
|     _c0     |  _c1  | _c2  | _c3  | _c4  | _c5  | _c6  |
+-------------+-------+------+------+------+------+------+--+
| 2016-12-08  | 2016  | 12   | 8    | 10   | 3    | 1    |
+-------------+-------+------+------+------+------+------+--+


## 日期转周:weekofyear  日期比较:datediff
select 
weekofyear('2016-12-08 10:03:01'),
datediff('2016-12-08','2016-11-27') 
from order_detail limit 1;

+------+------+--+
| _c0  | _c1  |
+------+------+--+
| 49   | 11   |
+------+------+--+


## 日期增加: date_add 日期减少: date_sub
select date_add('2016-12-08',10),date_add('2016-12-08',-10),
date_sub('2016-12-08',-10),date_sub('2016-12-08',10) from order_detail limit 1;

+-------------+-------------+-------------+-------------+--+
|     _c0     |     _c1     |     _c2     |     _c3     |
+-------------+-------------+-------------+-------------+--+
| 2016-12-18  | 2016-11-28  | 2016-12-18  | 2016-11-28  |
+-------------+-------------+-------------+-------------+--+

select 
date_add('20161208',10),
from_unixtime(unix_timestamp(date_add('2016-12-08',10)),'yyyyMMdd'),
from_unixtime(unix_timestamp(date_add('2016-12-08',10),'yyyy-MM-dd'),'yyyyMMdd') 
from order_detail limit 1;

+-------+-------+-----------+--+
|  _c0  |  _c1  |    _c2    |
+-------+-------+-----------+--+
| NULL  | NULL  | 20161218  |
+-------+-------+-----------+--+

条件函数

## IF CASE COALESCE
说明: COALESCE返回参数中的第一个非空值；如果所有值都为 NULL，那么返回 NULL

select user_id,device_id,user_type,sales,
if(user_type='new',user_id,'***'), 
COALESCE(null,user_id,device_id,user_type),
COALESCE(null,null,device_id,user_type),
case user_type when 'new' then 'new_user' when 'old' then 'old_user' 
else 'others' end,
case when user_type='new' and sales>=5 then 'gold_user' when user_type='old' and sales<3 then 'bronze_user' 
else 'silver_user' end
from order_detail;


+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| user_id  |  device_id  | user_type  | sales  |   _c4    |   _c5    |     _c6     |    _c7    |     _c8      |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| zhangsa  | dfsadsa323  | new        | 2      | zhangsa  | zhangsa  | dfsadsa323  | new_user  | silver_user  |
| lisi     | 543gfd      | old        | 1      | ***      | lisi     | 543gfd      | old_user  | bronze_user  |
| wanger   | 65ghf       | new        | 3      | wanger   | wanger   | 65ghf       | new_user  | silver_user  |
| liiu     | fdsfagwe    | new        | 1      | liiu     | liiu     | fdsfagwe    | new_user  | silver_user  |
| qibaqiu  | fds         | new        | 1      | qibaqiu  | qibaqiu  | fds         | new_user  | silver_user  |
| wangshi  | f332        | old        | 2      | ***      | wangshi  | f332        | old_user  | bronze_user  |
| liwei    | hfd         | old        | 3      | ***      | liwei    | hfd         | old_user  | silver_user  |
| wutong   | 543gdfsd    | new        | 6      | wutong   | wutong   | 543gdfsd    | new_user  | gold_user    |
| lilisi   | dsfgg       | new        | 5      | lilisi   | lilisi   | dsfgg       | new_user  | gold_user    |
| qishili  | fds         | new        | 5      | qishili  | qishili  | fds         | new_user  | gold_user    |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+

字符串函数


## 字符串长度:length 字符串反转:reverse 字符串连接:concat 带分隔符字符串连接:concat_ws

select 
user_id,device_id,user_type,length(user_id),reverse(user_id),
concat(user_id,device_id,user_type),concat_ws('_',user_id,device_id,user_type)
from order_detail;


+----------+-------------+------------+------+----------+-----------------------+-------------------------+--+
| user_id  |  device_id  | user_type  | _c3  |   _c4    |          _c5          |           _c6           |
+----------+-------------+------------+------+----------+-----------------------+-------------------------+--+
| zhangsa  | dfsadsa323  | new        | 7    | asgnahz  | zhangsadfsadsa323new  | zhangsa_dfsadsa323_new  |
| lisi     | 543gfd      | old        | 4    | isil     | lisi543gfdold         | lisi_543gfd_old         |
| wanger   | 65ghf       | new        | 6    | regnaw   | wanger65ghfnew        | wanger_65ghf_new        |
| liiu     | fdsfagwe    | new        | 4    | uiil     | liiufdsfagwenew       | liiu_fdsfagwe_new       |
| qibaqiu  | fds         | new        | 7    | uiqabiq  | qibaqiufdsnew         | qibaqiu_fds_new         |
| wangshi  | f332        | old        | 7    | ihsgnaw  | wangshif332old        | wangshi_f332_old        |
| liwei    | hfd         | old        | 5    | iewil    | liweihfdold           | liwei_hfd_old           |
| wutong   | 543gdfsd    | new        | 6    | gnotuw   | wutong543gdfsdnew     | wutong_543gdfsd_new     |
| lilisi   | dsfgg       | new        | 6    | isilil   | lilisidsfggnew        | lilisi_dsfgg_new        |
| qishili  | fds         | new        | 7    | ilihsiq  | qishilifdsnew         | qishili_fds_new         |


## 字符串截取函数： substr,substring
语法: substr(string A, int start),substring(string A, int start)
说明：返回字符串 A 从 start 位置到结尾的字符串

语法: substr(string A, int start, int len),substring(string A, int start, int len)
说明：返回字符串A从start位置开始，长度为len的字符串

select 
user_id,substr(user_id,3),substr(user_id,-2),
substring(user_id,1,2),substr(user_id,-2,2)
from order_detail;

+----------+--------+------+------+------+--+
| user_id  |  _c1   | _c2  | _c3  | _c4  |
+----------+--------+------+------+------+--+
| zhangsa  | angsa  | sa   | zh   | sa   |
| lisi     | si     | si   | li   | si   |
| wanger   | nger   | er   | wa   | er   |
| liiu     | iu     | iu   | li   | iu   |
| qibaqiu  | baqiu  | iu   | qi   | iu   |
| wangshi  | ngshi  | hi   | wa   | hi   |
| liwei    | wei    | ei   | li   | ei   |
| wutong   | tong   | ng   | wu   | ng   |
| lilisi   | lisi   | si   | li   | si   |
| qishili  | shili  | li   | qi   | li   |
+----------+--------+------+------+------+--+


## 字符串转大写:upper,ucase  字符串转小写:lower,lcase

select user_id,upper(user_id),ucase(user_id),
lower(upper(user_id)),lcase(ucase(user_id)),lower(ucase(user_id))
from order_detail;

+----------+----------+----------+----------+----------+----------+--+
| user_id  |   _c1    |   _c2    |   _c3    |   _c4    |   _c5    |
+----------+----------+----------+----------+----------+----------+--+
| zhangsa  | ZHANGSA  | ZHANGSA  | zhangsa  | zhangsa  | zhangsa  |
| lisi     | LISI     | LISI     | lisi     | lisi     | lisi     |
| wanger   | WANGER   | WANGER   | wanger   | wanger   | wanger   |
| liiu     | LIIU     | LIIU     | liiu     | liiu     | liiu     |
| qibaqiu  | QIBAQIU  | QIBAQIU  | qibaqiu  | qibaqiu  | qibaqiu  |
| wangshi  | WANGSHI  | WANGSHI  | wangshi  | wangshi  | wangshi  |
| liwei    | LIWEI    | LIWEI    | liwei    | liwei    | liwei    |
| wutong   | WUTONG   | WUTONG   | wutong   | wutong   | wutong   |
| lilisi   | LILISI   | LILISI   | lilisi   | lilisi   | lilisi   |
| qishili  | QISHILI  | QISHILI  | qishili  | qishili  | qishili  |
+----------+----------+----------+----------+----------+----------+--+

## 去两边的空格:trim  左边去空格:ltrim  右边去空格:rtrim

select trim(' abc '),ltrim(' abc'),rtrim('abc ') from order_detail limit 1;

+------+------+------+--+
| _c0  | _c1  | _c2  |
+------+------+------+--+
| abc  | abc  | abc  |
+------+------+------+--+


## 正则表达式替换: regexp_replace 
说明：将字符串 A 中的符合 java 正则表达式 B 的部分替换为 C。注意，在有些情况下要使用转义字符,
类似 oracle 中的 regexp_replace 函数。

## 正则表达式解析: regexp_extract
将字符串 subject 按照 pattern 正则表达式的规则拆分，返回 index 指定的字符。
注意，在有些情况下要使用转义字符，如等号要用双竖线转义，这是java正则表达式的规则。


select user_id,regexp_replace(user_id, 'li|ng', '**'),
regexp_extract(user_id,'li(.*?)(si)',1),
regexp_extract(user_id,'li(.*?)(si)',2),
regexp_extract(user_id,'li(.*?)(si)',0)
from order_detail;
+----------+----------+------+------+---------+--+
| user_id  |   _c1    | _c2  | _c3  |   _c4   |
+----------+----------+------+------+---------+--+
| zhangsa  | zha**sa  |      |      |         |
| lisi     | **si     |      | si   | lisi    |
| wanger   | wa**er   |      |      |         |
| liiu     | **iu     |      |      |         |
| qibaqiu  | qibaqiu  |      |      |         |
| wangshi  | wa**shi  |      |      |         |
| liwei    | **wei    |      |      |         |
| wutong   | wuto**   |      |      |         |
| lilisi   | ****si   | li   | si   | lilisi  |
| qishili  | qishi**  |      |      |         |
+----------+----------+------+------+---------+--+


select 
regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k1\\=([^&]+)', 1),
regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k2\\=([^#]+)', 1)
from order_detail limit 1;

+-----------+--------+--+
|    _c0    |  _c1   |
+-----------+--------+--+
| v1543643  | v3245  |
+-----------+--------+--+


## URL解析:parse_url
语法: parse_url(string urlString, string partToExtract [, string keyToExtract])
说明：返回 URL 中指定的部分。
partToExtract 的有效值为： HOST, PATH, QUERY, REF,PROTOCOL, AUTHORITY, FILE, and USERINFO.
举例：

select 
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY','k2'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE')
from order_detail limit 1;

+---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+
|      _c0      |      _c1      |     _c2      | _c3  |  _c4  |  _c5  |      _c6      |            _c7            |
+---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+
| facebook.com  | /path1/p.php  | k1=v1&k2=v2  | v2   | Ref1  | http  | facebook.com  | /path1/p.php?k1=v1&k2=v2  |
+---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+


## json解析:
get_json_object
语法: get_json_object(string json_string, string path)
说明：解析 json 的字符串 json_string,返回 path 指定的内容。如果输入的 json 字符串无效，那么返回 NULL。

select 
get_json_object(
'{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}',
'$.owner'),
get_json_object(
'{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}',
'$.store.fruit[0].type')
from order_detail limit 1;

+------+--------+--+
| _c0  |  _c1   |
+------+--------+--+
| amy  | apple  |
+------+--------+--+


## json_tuple
语法: json_tuple(string jsonStr,string k1,string k2, ...)
参数为一组键k1，k2……和JSON字符串，返回值的元组。该方法比 get_json_object 高效，因为可以在一次调用中输入多个键.

select a.user_id, b.*
from order_detail a 
lateral view 
json_tuple('{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}', 'email', 'owner') b as email, owner limit 1;

+----------+---------------------------------+--------+--+
| user_id  |              email              | owner  |
+----------+---------------------------------+--------+--+
| zhangsa  | amy@only_for_json_udf_test.net  | amy    |
+----------+---------------------------------+--------+--+

## parse_url_tuple

SELECT b.*
from 
(
select 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' as urlstr
from
order_detail 
limit 1
)a 
LATERAL VIEW 
parse_url_tuple(a.urlstr, 'HOST', 'PATH', 'QUERY', 'QUERY:k1') b as host, path, query, query_k1 LIMIT 1;

+---------------+---------------+--------------+-----------+--+
|     host      |     path      |    query     | query_k1  |
+---------------+---------------+--------------+-----------+--+
| facebook.com  | /path1/p.php  | k1=v1&k2=v2  | v1        |
+---------------+---------------+--------------+-----------+--+


空格字符串:space   重复字符串:repeat 首字符ascii:ascii 

select space(10), length(space(10)), repeat('abc',5), ascii('abcde')
from order_detail 
limit 1;

+-------------+------+------------------+------+--+
|     _c0     | _c1  |       _c2        | _c3  |
+-------------+------+------------------+------+--+
|             | 10   | abcabcabcabcabc  | 97   |
+-------------+------+------------------+------+--+


左补足函数:lpad  右补足函数:rpad 
语法: lpad(string str, int len, string pad)
说明：lpad将 str 进行用 pad 进行左补足到 len 位, rpad将 str 进行用 pad 进行右补足到 len 位
注意：与 GP,ORACLE 不同; pad不能默认

select lpad('abc',10,'td'),rpad('abc',10,'td') 
from order_detail 
limit 1;

+-------------+-------------+--+
|     _c0     |     _c1     |
+-------------+-------------+--+
| tdtdtdtabc  | abctdtdtdt  |
+-------------+-------------+--+

分割字符串函数: split

集合查找函数: find_in_set
语法: find_in_set(string str, string strList)
说明: 返回 str 在 strlist 第一次出现的位置， strlist 是用逗号分割的字符串。如果没有找该 str 字符，则返回 0


select 
split('abtcdtef','t'),
find_in_set('ab','ef,ab,de'),
find_in_set('at','ef,ab,de') 
from order_detail 
limit 1;
+-------------------+------+------+--+
|        _c0        | _c1  | _c2  |
+-------------------+------+------+--+
| ["ab","cd","ef"]  | 2    | 0    |
+-------------------+------+------+--+

## string转map：str_to_map 
语法：str_to_map(text[, delimiter1, delimiter2])
说明：使用两个分隔符将文本拆分为键值对。 Delimiter1将文本分成K-V对，Delimiter2分割每个K-V对。
对于delimiter1默认分隔符是'，'，对于delimiter2默认分隔符是':'。
select str_to_map('aaa:11&bbb:22', '&', ':')
from order_detail limit 1;
+--------------------------+--+
|           _c0            |
+--------------------------+--+
| {"bbb":"22","aaa":"11"}  |
+--------------------------+--+

select str_to_map('aaa:11&bbb:22', '&', ':')['aaa']
from order_detail limit 1;

+------+--+
| _c0  |
+------+--+
| 11   |
+------+--+

select str_to_map('aaa:11,bbb:22')
from person limit 1;

{"bbb":"22","aaa":"11"}

集合统计函数

## 个数统计:count  总和统计:sum
语法: count(*), count(expr), count(DISTINCT expr[, expr_.])
说明: 
count(*)统计检索出的行的个数，包括 NULL 值的行； 
count(expr)返回指定字段的非空值的个数； 
count(DISTINCT expr[, expr_.])返回指定字段的不同的非空值的个数

语法: sum(col), sum(DISTINCT col)
说明: 
sum(col)统计结果集中 col 的相加的结果； sum(DISTINCT col)统计结果中 col 不同值

select 
count(*),count(user_type),count(distinct user_type),
sum(sales),sum(distinct sales)
from order_detail; 

+------+------+------+------+------+--+
| _c0  | _c1  | _c2  | _c3  | _c4  |
+------+------+------+------+------+--+
| 10   | 10   | 2    | 29   | 17   |
+------+------+------+------+------+--+

## 平均值统计:avg  最小值统计:min  最大值统计:max

select 
avg(sales),avg(distinct sales),min(sales),max(distinct sales)
from order_detail;

+------+------+------+------+--+
| _c0  | _c1  | _c2  | _c3  |
+------+------+------+------+--+
| 2.9  | 3.4  | 1    | 6    |
+------+------+------+------+--+


## 标准差：stddev_samp, stddev, stddev_pop
stddev_pop <==> stddev

## 方差：var_samp, var_pop

当我们需要真实的标准差/方差的时候最好是使用： stddev stddev_pop  var_pop
而只是需要得到少量数据的标准差/方差的近似值可以选用： stddev_samp var_samp

select 
var_pop(sales),var_samp(sales),
stddev_pop(sales),stddev(sales),stddev_samp(sales), 
pow(stddev_pop(sales),2),pow(stddev_samp(sales),2)
from order_detail;

+-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+
|  _c0  |        _c1         |         _c2         |         _c3         |         _c4         |         _c5         |         _c6         |
+-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+
| 3.09  | 3.433333333333333  | 1.7578395831246945  | 1.7578395831246945  | 1.8529256146249728  | 3.0899999999999994  | 3.4333333333333336  |
+-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+


## 百分位数: percentile 近似百分位数: percentile_approx 直方图: histogram_numeric

语法: percentile_approx(DOUBLE col, p [, B])
返回值: double
说明: 求近似的第 pth 个百分位数， p 必须介于 0 和 1 之间，返回类型为 double，但是col 字段支持浮点类型。
参数 B 控制内存消耗的近似精度， B越大，结果的准确度越高。默认为 10,000。
当 col 字段中的 distinct 值的个数小于 B 时，结果为准确的百分位数


select 
percentile(sales,0.2),
percentile_approx(sales,0.5,10000),
histogram_numeric(sales,5)
from order_detail;

+------+------+----------------------------------------------------------------------------------------------+--+
| _c0  | _c1  |                                             _c2                                              |
+------+------+----------------------------------------------------------------------------------------------+--+
| 1.0  | 2.0  | [{"x":1.0,"y":3.0},{"x":2.0,"y":2.0},{"x":3.0,"y":2.0},{"x":5.0,"y":2.0},{"x":6.0,"y":1.0}]  |
+------+------+----------------------------------------------------------------------------------------------+--+

select 
inline(histogram_numeric(sales,10))
from order_detail;

+------+------+--+
|  x   |  y   |
+------+------+--+
| 1.0  | 3.0  |
| 2.0  | 2.0  |
| 3.0  | 2.0  |
| 5.0  | 2.0  |
| 6.0  | 1.0  |
+------+------+--+

### 后面可以输入多个百分位数，返回类型也为 array<double>，其中为对应的百分位数。
select 
percentile(sales,array(0.2,0.4,0.6)),
percentile_approx(sales,array(0.2,0.4,0.6),10000)
from order_detail;

+----------------+----------------+--+
|      _c0       |      _c1       |
+----------------+----------------+--+
| [1.0,2.0,3.0]  | [1.0,1.5,2.5]  |
+----------------+----------------+--+

复杂类型访问操作及统计函数

测试数据集：
tony    1338    hello,woddd     1,2     a1,a2,a3        k1:1.0,k2:2.0,k3:3.0    s1,s2,s3,4
mark    5453    kke,ladyg       2,3     a4,a5,a6        k4:4.0,k5:5.0,k2:6.0    s4,s5,s6,6
ivyfd   4323    aa,thq,dsx      3,6     a7,a8,a9        k7:7.0,k8:8.0,k2:9.0    s7,s8,s9,9
drop table employees;
create external table if not exists employees(
name string,
salary string,
happy_word string,
happy_num array<int>,
subordinates array<string>,
deductions map<string,float>,
address struct<street:string,city:string,state:string,zip:int>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile;

hdfs dfs -put /home/liguodong/data/muldata.txt /temp/lgd

load data inpath '/temp/lgd/muldata.txt' overwrite into table employees;

select * from employees;

Getting log thread is interrupted, since query is done!
+--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+
|  name  | salary  |  happy_word  | happy_num  |   subordinates    |          deductions           |                      address                      |
+--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+
| tony   | 1338    | hello,woddd  | [1,2]      | ["a1","a2","a3"]  | {"k1":1.0,"k2":2.0,"k3":3.0}  | {"street":"s1","city":"s2","state":"s3","zip":4}  |
| mark   | 5453    | kke,ladyg    | [2,3]      | ["a4","a5","a6"]  | {"k4":4.0,"k5":5.0,"k2":6.0}  | {"street":"s4","city":"s5","state":"s6","zip":6}  |
| ivyfd  | 4323    | aa,thq,dsx   | [3,6]      | ["a7","a8","a9"]  | {"k7":7.0,"k8":8.0,"k2":9.0}  | {"street":"s7","city":"s8","state":"s9","zip":9}  |
+--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+

## 访问数组 Map 结构体

select 
name,salary,
subordinates[1],deductions['k2'],deductions['k3'],address.city 
from employees;
+--------+---------+------+------+-------+-------+--+
|  name  | salary  | _c2  | _c3  |  _c4  | city  |
+--------+---------+------+------+-------+-------+--+
| tony   | 1338    | a2   | 2.0  | 3.0   | s2    |
| mark   | 5453    | a5   | 6.0  | NULL  | s5    |
| ivyfd  | 4323    | a8   | 9.0  | NULL  | s8    |
+--------+---------+------+------+-------+-------+--+

## Map类型长度  Array类型长度

select size(deductions),size(subordinates) from employees limit 1;

+------+------+--+
| _c0  | _c1  |
+------+------+--+
| 3    | 3    |
+------+------+--+

## 类型转换: cast

select cast(salary as int),cast(deductions['k2'] as bigint) from employees;

+---------+------+--+
| salary  | _c1  |
+---------+------+--+
| 1338    | 2    |
| 5453    | 6    |
| 4323    | 9    |
+---------+------+--+

### LATERAL VIEW 行转列
SELECT 
name, ad_subordinate
FROM employees 
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate;
+--------+-----------------+--+
|  name  | ad_subordinate  |
+--------+-----------------+--+
| tony   | a1              |
| tony   | a2              |
| tony   | a3              |
| mark   | a4              |
| mark   | a5              |
| mark   | a6              |
| ivyfd  | a7              |
| ivyfd  | a8              |
| ivyfd  | a9              |
+--------+-----------------+--+

SELECT 
name, count(1)
FROM employees 
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate
group by name;
+--------+------+--+
|  name  | _c1  |
+--------+------+--+
| ivyfd  | 3    |
| mark   | 3    |
| tony   | 3    |
+--------+------+--+

SELECT ad_subordinate, ad_num 
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate
LATERAL VIEW explode(happy_num) addTable2 AS ad_num;
+-----------------+---------+--+
| ad_subordinate  | ad_num  |
+-----------------+---------+--+
| a1              | 1       |
| a1              | 2       |
| a2              | 1       |
| a2              | 2       |
| a3              | 1       |
| a3              | 2       |
| a4              | 2       |
| a4              | 3       |
| a5              | 2       |
| a5              | 3       |
| a6              | 2       |
| a6              | 3       |
| a7              | 3       |
| a7              | 6       |
| a8              | 3       |
| a8              | 6       |
| a9              | 3       |
| a9              | 6       |
+-----------------+---------+--+

### 多个LATERAL VIEW
SELECT 
name, count(1) 
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate
LATERAL VIEW explode(happy_num) addTable2 AS ad_num
group by name;

+--------+------+--+
|  name  | _c1  |
+--------+------+--+
| ivyfd  | 6    |
| mark   | 6    |
| tony   | 6    |
+--------+------+--+

### 不满足条件产生空行
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW 
explode(array()) BB AS a limit 10;
+-------+----+--+
| name  | a  |
+-------+----+--+
+-------+----+--+

### OUTER 避免永远不产生结果，无满足条件的行，在该列会产生NULL值。
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW 
OUTER explode(array()) BB AS a limit 10;
+--------+-------+--+
|  name  |   a   |
+--------+-------+--+
| tony   | NULL  |
| mark   | NULL  |
| ivyfd  | NULL  |
+--------+-------+--+

### 字符串切分成多列
SELECT 
name, word
FROM employees
LATERAL VIEW explode(split(happy_word,',')) addTable AS word;

+--------+--------+--+
|  name  |  word  |
+--------+--------+--+
| tony   | hello  |
| tony   | woddd  |
| mark   | kke    |
| mark   | ladyg  |
| ivyfd  | aa     |
| ivyfd  | thq    |
| ivyfd  | dsx    |
+--------+--------+--+

转载：https://blog.csdn.net/scgaliguodong123_/article/details/60881166