测试数据 data.txt 文件
zhangsa dfsadsa323 new 67.1 2 lisi 543gfd old 43.32 1 wanger 65ghf new 88.88 | 3 liiu fdsfagwe new 66.0 1 qibaqi fds new 54.32 1 wangshi f332 old 77.77 2 liwei hfd old 88.44 3 wutong 543gdfsd new 56.55 6 lilisi dsfgg new 88.88 5 qishili fds new 66.66 5 |
create external table if not exists order_detail(
user_id string,
device_id string,
user_type string,
price double,
sales int
)
row format delimited fields terminated by '\t'
lines terminated by '\n'
stored as textfile;
# 在hive cli 命令窗口中上传文件到hdfs文件系统
hive (default)> dfs -put /home/liguodong/data/data.txt /temp/lgd
# 加载文件
hive (default)> load data inpath '/temp/lgd/data.txt' overwrite into table order_detail;
select * from order_detail;
+----------+-------------+------------+--------+--------+--+
| user_id | device_id | user_type | price | sales |
+----------+-------------+------------+--------+--------+--+
| zhangsa | dfsadsa323 | new | 67.1 | 2 |
| lisi | 543gfd | old | 43.32 | 1 |
| wanger | 65ghf | new | 88.88 | 3 |
| liiu | fdsfagwe | new | 66.0 | 1 |
| qibaqiu | fds | new | 54.32 | 1 |
| wangshi | f332 | old | 77.77 | 2 |
| liwei | hfd | old | 88.44 | 3 |
| wutong | 543gdfsd | new | 56.55 | 6 |
| lilisi | dsfgg | new | 88.88 | 5 |
| qishili | fds | new | 66.66 | 5 |
+----------+-------------+------------+--------+--------+--+
关系运算
## > < =
##注意: String 的比较要注意(常用的时间比较可以先 to_date 之后再比较)
select long_time>short_time, long_time<short_time,long_time=short_time, to_date(long_time)=to_date(short_time)
from
(
select '2017-01-11 00:00:00' as long_time, '2017-01-11' as short_time
from
order_detail limit 1
)bb;
result:
true false false true
## 空值判断
select 1 from order_detail where NULL is NULL limit 1;
## 非空判断
select 1 from order_detail where 1 is not NULL limit 1;
## LIKE
语法: A LIKE B
描述: 字符串A符合表达式B的正则语法,则为TRUE;否则为FALSE. B中字符”_”表示任意单个字符,而字符”%”表示任意数量的字符。
select user_id from order_detail where user_id like 'li%';
+----------+--+
| user_id |
+----------+--+
| lisi |
| liiu |
| liwei |
| lilisi |
+----------+--+
select user_id from order_detail where user_id like 'li__';
+----------+--+
| user_id |
+----------+--+
| lisi |
| liiu |
+----------+--+
## RLIKE
语法: A RLIKE B
描述: 字符串A符合JAVA正则表达式 B 的正则语法,则为 TRUE;否则为 FALSE。
select user_id from order_detail where user_id rlike '^l.*i$';
+----------+--+
| user_id |
+----------+--+
| lisi |
| liwei |
| lilisi |
+----------+--+
select device_id from order_detail where device_id rlike '^\\w+$';
+-------------+--+
| device_id |
+-------------+--+
| dfsadsa323 |
| 543gfd |
| 65ghf |
| fdsfagwe |
| fds |
| f332 |
| hfd |
| 543gdfsd |
| dsfgg |
| fds |
+-------------+--+
select device_id from order_detail where device_id rlike '^[a-zA-Z]+$';
+------------+--+
| device_id |
+------------+--+
| fdsfagwe |
| fds |
| hfd |
| dsfgg |
| fds |
+------------+--+
select device_id from order_detail where device_id rlike '^[a-zA-Z]{4,}$';
+------------+--+
| device_id |
+------------+--+
| fdsfagwe |
| dsfgg |
+------------+--+
## REGEXP
语法: A REGEXP B
描述: 功能与 RLIKE 相同
select device_id from order_detail where device_id REGEXP '^[a-zA-Z]{4,}$';
+------------+--+
| device_id |
+------------+--+
| fdsfagwe |
| dsfgg |
+------------+--+
数学运算
## + - * /
注意:
hive中最高精度的数据类型是 double,只精确到小数点后16位,在做除法运算的时候要特别注意
select user_id,100-price,sales-1,price*sales,price/sales,
ceil(28.0/6.999999999999999999999),ceil(28.0/6.99999999999999),
price%sales from order_detail;
+----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+
| user_id | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 |
+----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+
| zhangsa | 32.900000000000006 | 1 | 134.2 | 33.55 | 4 | 5 | 1.0999999999999943 |
| lisi | 56.68 | 0 | 43.32 | 43.32 | 4 | 5 | 0.3200000000000003 |
| wanger | 11.120000000000005 | 2 | 266.64 | 29.626666666666665 | 4 | 5 | 1.8799999999999955 |
| liiu | 34.0 | 0 | 66.0 | 66.0 | 4 | 5 | 0.0 |
| qibaqiu | 45.68 | 0 | 54.32 | 54.32 | 4 | 5 | 0.3200000000000003 |
| wangshi | 22.230000000000004 | 1 | 155.54 | 38.885 | 4 | 5 | 1.769999999999996 |
| liwei | 11.560000000000002 | 2 | 265.32 | 29.48 | 4 | 5 | 1.4399999999999977 |
| wutong | 43.45 | 5 | 339.29999999999995 | 9.424999999999999 | 4 | 5 | 2.549999999999997 |
| lilisi | 11.120000000000005 | 4 | 444.4 | 17.776 | 4 | 5 | 3.8799999999999955 |
| qishili | 33.34 | 4 | 333.29999999999995 | 13.331999999999999 | 4 | 5 | 1.6599999999999966 |
+----------+---------------------+------+---------------------+---------------------+------+------+---------------------+--+
## %
注意:
精度在 hive 中是个很大的问题,类似这样的操作最好通过 round 指定精度
select 8.4 % 4,round(8.4 % 4 , 2) from order_detail limit 1;
+----------------------+------+--+
| _c0 | _c1 |
+----------------------+------+--+
| 0.40000000000000036 | 0.4 |
+----------------------+------+--+
## 位与& 位或| 位异或^ 位取反~
select 4&6, 8&4, 4|8,6|8,4^8,6^4,~6,~3 from order_detail limit 1;
逻辑运算
## 逻辑与AND 逻辑或OR 逻辑非NOT
注意:优先级一次为NOT AND OR
## 以下两条SQL互斥
select user_id from order_detail where not ((user_id='wanger' or user_id like 'li%') and user_type='old');
+----------+--+
| user_id |
+----------+--+
| zhangsa |
| wanger |
| liiu |
| qibaqiu |
| wangshi |
| wutong |
| lilisi |
| qishili |
+----------+--+
select user_id from order_detail where ((user_id='wanger' or user_id like 'li%') and user_type='old');
+----------+--+
| user_id |
+----------+--+
| lisi |
| liwei |
+----------+--+
数值计算函数
## 取整: round
语法: round(double a)
说明: 遵循四舍五入
## 指定精度取整: round
语法: round(double a, int d)
## 向下取整: floor
说明: 返回等于或者小于该 double 变量的最大的整数
## 向上取整: ceil
说明: 返回等于或者大于该 double 变量的最小的整数
## 向上取整: ceiling
说明: 与ceil功能相同
select user_id,price,round(price),round(price,0),round(price,1),floor(price),ceil(price),ceiling(price) from order_detail;
+----------+--------+-------+-------+-------+------+------+------+--+
| user_id | price | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 |
+----------+--------+-------+-------+-------+------+------+------+--+
| zhangsa | 67.1 | 67.0 | 67.0 | 67.1 | 67 | 68 | 68 |
| lisi | 43.32 | 43.0 | 43.0 | 43.3 | 43 | 44 | 44 |
| wanger | 88.88 | 89.0 | 89.0 | 88.9 | 88 | 89 | 89 |
| liiu | 66.0 | 66.0 | 66.0 | 66.0 | 66 | 66 | 66 |
| qibaqiu | 54.32 | 54.0 | 54.0 | 54.3 | 54 | 55 | 55 |
| wangshi | 77.77 | 78.0 | 78.0 | 77.8 | 77 | 78 | 78 |
| liwei | 88.44 | 88.0 | 88.0 | 88.4 | 88 | 89 | 89 |
| wutong | 56.55 | 57.0 | 57.0 | 56.6 | 56 | 57 | 57 |
| lilisi | 88.88 | 89.0 | 89.0 | 88.9 | 88 | 89 | 89 |
| qishili | 66.66 | 67.0 | 67.0 | 66.7 | 66 | 67 | 67 |
+----------+--------+-------+-------+-------+------+------+------+--+
## 取随机数: rand
说明: 返回一个 0 到 1 范围内的随机数。如果指定种子 seed(整数),则会得到一个稳定的随机数序列。
## 自然指数: exp 自然对数: ln
select user_id,sales,price,rand(),rand(sales),exp(sales),ln(price) from order_detail;
+----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+
| user_id | sales | price | _c3 | _c4 | _c5 | _c6 |
+----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+
| zhangsa | 2 | 67.1 | 0.708066911383928 | 0.7311469360199058 | 7.38905609893065 | 4.206184043977636 |
| lisi | 1 | 43.32 | 0.509552420396184 | 0.9014476240300544 | 2.718281828459045 | 3.76861442213279 |
| wanger | 3 | 88.88 | 0.2462642074705902 | 0.49682259343089075 | 20.085536923187668 | 4.487287145331375 |
| liiu | 1 | 66.0 | 0.720800913818751 | 0.9858769332362016 | 2.718281828459045 | 4.189654742026425 |
| qibaqiu | 1 | 54.32 | 0.7358273250797408 | 0.8571240443456863 | 2.718281828459045 | 3.9948924832504407 |
| wangshi | 2 | 77.77 | 0.11092554405907218 | 0.9874208338984266 | 7.38905609893065 | 4.353755752706852 |
| liwei | 3 | 88.44 | 0.5162574691353392 | 0.2281579303734177 | 20.085536923187668 | 4.482324355989245 |
| wutong | 6 | 56.55 | 0.2753658209591686 | 0.07479382813444624 | 403.4287934927351 | 4.03512520256213 |
| lilisi | 5 | 88.88 | 0.25777632824045826 | 0.7431577182910525 | 148.4131591025766 | 4.487287145331375 |
| qishili | 5 | 66.66 | 0.06419187859857822 | 0.9495832704567262 | 148.4131591025766 | 4.199605072879594 |
+----------+--------+--------+----------------------+----------------------+---------------------+---------------------+--+
## 以10为底对数: log10 以2为底对数: log2
## 对数: log
语法: log(double base, double a)
说明: 返回以 base 为底的 a 的对数
select log10(100),log2(8),log(4,256) from order_detail limit 1;
+------+------+------+--+
| _c0 | _c1 | _c2 |
+------+------+------+--+
| 2.0 | 3.0 | 4.0 |
+------+------+------+--+
## 幂运算: pow power 开平方: sqrt
select pow(2,4), power(2,4),sqrt(16) from order_detail limit 1;
+-------+-------+------+--+
| _c0 | _c1 | _c2 |
+-------+-------+------+--+
| 16.0 | 16.0 | 4.0 |
+-------+-------+------+--+
## 二进制: bin 十六进制: hex 反转十六进制: unhex
## 进制转换: conv
语法: conv(BIGINT num, int from_base, int to_base)
说明: 将数值 num 从 from_base 进制转化到 to_base 进制
select bin(7),hex('19'),hex('abc'),unhex('616263'),unhex('41'),conv(17,10,16),conv(17,10,2) from order_detail limit 1;
+------+-------+---------+------+------+------+--------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 |
+------+-------+---------+------+------+------+--------+--+
| 111 | 3139 | 616263 | abc | A | 11 | 10001 |
+------+-------+---------+------+------+------+--------+--+
## 绝对值:abs 正取余:pmod 正弦:sin 反正弦:asin 余弦:cos 反余弦:acos 返回A的值:positive 返回A的相反数:negative
select abs(-13),abs(10.10),pmod(9,4),pmod(-9,4),
sin(0.8),asin(0.7173560908995228),cos(0.9), acos(0.6216099682706644),
positive(-10),negative(-10)
from order_detail limit 1;
+------+-------+------+------+---------------------+------+---------------------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 | _c8 | _c9 |
+------+-------+------+------+---------------------+------+---------------------+------+------+------+--+
| 13 | 10.1 | 1 | 3 | 0.7173560908995228 | 0.8 | 0.6216099682706644 | 0.9 | -10 | 10 |
+------+-------+------+------+---------------------+------+---------------------+------+------+------+--+
日期函数
## UNIX时间戳转日期: from_unixtime
## 日期转UNIX时间戳,指定格式日期转UNIX 时间戳,获取当前UNIX时间戳: unix_timestamp
说明: 转换格式为"yyyy-MM-dd HH:mm:ss"的日期到 UNIX 时间戳。如果转化失败,则返回 0。
select
from_unixtime(1323308943),
from_unixtime(1323308943,'yyyyMMdd'),
unix_timestamp(),
unix_timestamp('2017-12-07 16:01:03'),
unix_timestamp('20171207 16-01-03','yyyyMMdd HH-mm-ss')
from
order_detail limit 1;
+----------------------+-----------+-------------+-------------+-------------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 |
+----------------------+-----------+-------------+-------------+-------------+--+
| 2011-12-08 09:49:03 | 20111208 | 1489029488 | 1512633663 | 1512633663 |
+----------------------+-----------+-------------+-------------+-------------+--+
## 日期时间转日期:to_date 日期转年:year 日期转月:month 日期转天:day 日期转小时:hour 日期转分钟:minute 日期转秒:second
select
to_date('2016-12-08 10:03:01'),
year('2016-12-08 10:03:01'),
month('2016-12-08'),
day('2016-12-08 10:03:01'),
hour('2016-12-08 10:03:01'),
minute('2016-12-08 10:03:01'),
second('2016-12-08 10:03:01')
from
order_detail limit 1;
+-------------+-------+------+------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 |
+-------------+-------+------+------+------+------+------+--+
| 2016-12-08 | 2016 | 12 | 8 | 10 | 3 | 1 |
+-------------+-------+------+------+------+------+------+--+
## 日期转周:weekofyear 日期比较:datediff
select
weekofyear('2016-12-08 10:03:01'),
datediff('2016-12-08','2016-11-27')
from order_detail limit 1;
+------+------+--+
| _c0 | _c1 |
+------+------+--+
| 49 | 11 |
+------+------+--+
## 日期增加: date_add 日期减少: date_sub
select date_add('2016-12-08',10),date_add('2016-12-08',-10),
date_sub('2016-12-08',-10),date_sub('2016-12-08',10) from order_detail limit 1;
+-------------+-------------+-------------+-------------+--+
| _c0 | _c1 | _c2 | _c3 |
+-------------+-------------+-------------+-------------+--+
| 2016-12-18 | 2016-11-28 | 2016-12-18 | 2016-11-28 |
+-------------+-------------+-------------+-------------+--+
select
date_add('20161208',10),
from_unixtime(unix_timestamp(date_add('2016-12-08',10)),'yyyyMMdd'),
from_unixtime(unix_timestamp(date_add('2016-12-08',10),'yyyy-MM-dd'),'yyyyMMdd')
from order_detail limit 1;
+-------+-------+-----------+--+
| _c0 | _c1 | _c2 |
+-------+-------+-----------+--+
| NULL | NULL | 20161218 |
+-------+-------+-----------+--+
条件函数
## IF CASE COALESCE
说明: COALESCE返回参数中的第一个非空值;如果所有值都为 NULL,那么返回 NULL
select user_id,device_id,user_type,sales,
if(user_type='new',user_id,'***'),
COALESCE(null,user_id,device_id,user_type),
COALESCE(null,null,device_id,user_type),
case user_type when 'new' then 'new_user' when 'old' then 'old_user'
else 'others' end,
case when user_type='new' and sales>=5 then 'gold_user' when user_type='old' and sales<3 then 'bronze_user'
else 'silver_user' end
from order_detail;
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| user_id | device_id | user_type | sales | _c4 | _c5 | _c6 | _c7 | _c8 |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
| zhangsa | dfsadsa323 | new | 2 | zhangsa | zhangsa | dfsadsa323 | new_user | silver_user |
| lisi | 543gfd | old | 1 | *** | lisi | 543gfd | old_user | bronze_user |
| wanger | 65ghf | new | 3 | wanger | wanger | 65ghf | new_user | silver_user |
| liiu | fdsfagwe | new | 1 | liiu | liiu | fdsfagwe | new_user | silver_user |
| qibaqiu | fds | new | 1 | qibaqiu | qibaqiu | fds | new_user | silver_user |
| wangshi | f332 | old | 2 | *** | wangshi | f332 | old_user | bronze_user |
| liwei | hfd | old | 3 | *** | liwei | hfd | old_user | silver_user |
| wutong | 543gdfsd | new | 6 | wutong | wutong | 543gdfsd | new_user | gold_user |
| lilisi | dsfgg | new | 5 | lilisi | lilisi | dsfgg | new_user | gold_user |
| qishili | fds | new | 5 | qishili | qishili | fds | new_user | gold_user |
+----------+-------------+------------+--------+----------+----------+-------------+-----------+--------------+--+
字符串函数
## 字符串长度:length 字符串反转:reverse 字符串连接:concat 带分隔符字符串连接:concat_ws
select
user_id,device_id,user_type,length(user_id),reverse(user_id),
concat(user_id,device_id,user_type),concat_ws('_',user_id,device_id,user_type)
from order_detail;
+----------+-------------+------------+------+----------+-----------------------+-------------------------+--+
| user_id | device_id | user_type | _c3 | _c4 | _c5 | _c6 |
+----------+-------------+------------+------+----------+-----------------------+-------------------------+--+
| zhangsa | dfsadsa323 | new | 7 | asgnahz | zhangsadfsadsa323new | zhangsa_dfsadsa323_new |
| lisi | 543gfd | old | 4 | isil | lisi543gfdold | lisi_543gfd_old |
| wanger | 65ghf | new | 6 | regnaw | wanger65ghfnew | wanger_65ghf_new |
| liiu | fdsfagwe | new | 4 | uiil | liiufdsfagwenew | liiu_fdsfagwe_new |
| qibaqiu | fds | new | 7 | uiqabiq | qibaqiufdsnew | qibaqiu_fds_new |
| wangshi | f332 | old | 7 | ihsgnaw | wangshif332old | wangshi_f332_old |
| liwei | hfd | old | 5 | iewil | liweihfdold | liwei_hfd_old |
| wutong | 543gdfsd | new | 6 | gnotuw | wutong543gdfsdnew | wutong_543gdfsd_new |
| lilisi | dsfgg | new | 6 | isilil | lilisidsfggnew | lilisi_dsfgg_new |
| qishili | fds | new | 7 | ilihsiq | qishilifdsnew | qishili_fds_new |
## 字符串截取函数: substr,substring
语法: substr(string A, int start),substring(string A, int start)
说明:返回字符串 A 从 start 位置到结尾的字符串
语法: substr(string A, int start, int len),substring(string A, int start, int len)
说明:返回字符串A从start位置开始,长度为len的字符串
select
user_id,substr(user_id,3),substr(user_id,-2),
substring(user_id,1,2),substr(user_id,-2,2)
from order_detail;
+----------+--------+------+------+------+--+
| user_id | _c1 | _c2 | _c3 | _c4 |
+----------+--------+------+------+------+--+
| zhangsa | angsa | sa | zh | sa |
| lisi | si | si | li | si |
| wanger | nger | er | wa | er |
| liiu | iu | iu | li | iu |
| qibaqiu | baqiu | iu | qi | iu |
| wangshi | ngshi | hi | wa | hi |
| liwei | wei | ei | li | ei |
| wutong | tong | ng | wu | ng |
| lilisi | lisi | si | li | si |
| qishili | shili | li | qi | li |
+----------+--------+------+------+------+--+
## 字符串转大写:upper,ucase 字符串转小写:lower,lcase
select user_id,upper(user_id),ucase(user_id),
lower(upper(user_id)),lcase(ucase(user_id)),lower(ucase(user_id))
from order_detail;
+----------+----------+----------+----------+----------+----------+--+
| user_id | _c1 | _c2 | _c3 | _c4 | _c5 |
+----------+----------+----------+----------+----------+----------+--+
| zhangsa | ZHANGSA | ZHANGSA | zhangsa | zhangsa | zhangsa |
| lisi | LISI | LISI | lisi | lisi | lisi |
| wanger | WANGER | WANGER | wanger | wanger | wanger |
| liiu | LIIU | LIIU | liiu | liiu | liiu |
| qibaqiu | QIBAQIU | QIBAQIU | qibaqiu | qibaqiu | qibaqiu |
| wangshi | WANGSHI | WANGSHI | wangshi | wangshi | wangshi |
| liwei | LIWEI | LIWEI | liwei | liwei | liwei |
| wutong | WUTONG | WUTONG | wutong | wutong | wutong |
| lilisi | LILISI | LILISI | lilisi | lilisi | lilisi |
| qishili | QISHILI | QISHILI | qishili | qishili | qishili |
+----------+----------+----------+----------+----------+----------+--+
## 去两边的空格:trim 左边去空格:ltrim 右边去空格:rtrim
select trim(' abc '),ltrim(' abc'),rtrim('abc ') from order_detail limit 1;
+------+------+------+--+
| _c0 | _c1 | _c2 |
+------+------+------+--+
| abc | abc | abc |
+------+------+------+--+
## 正则表达式替换: regexp_replace
说明:将字符串 A 中的符合 java 正则表达式 B 的部分替换为 C。注意,在有些情况下要使用转义字符,
类似 oracle 中的 regexp_replace 函数。
## 正则表达式解析: regexp_extract
将字符串 subject 按照 pattern 正则表达式的规则拆分,返回 index 指定的字符。
注意,在有些情况下要使用转义字符,如等号要用双竖线转义,这是java正则表达式的规则。
select user_id,regexp_replace(user_id, 'li|ng', '**'),
regexp_extract(user_id,'li(.*?)(si)',1),
regexp_extract(user_id,'li(.*?)(si)',2),
regexp_extract(user_id,'li(.*?)(si)',0)
from order_detail;
+----------+----------+------+------+---------+--+
| user_id | _c1 | _c2 | _c3 | _c4 |
+----------+----------+------+------+---------+--+
| zhangsa | zha**sa | | | |
| lisi | **si | | si | lisi |
| wanger | wa**er | | | |
| liiu | **iu | | | |
| qibaqiu | qibaqiu | | | |
| wangshi | wa**shi | | | |
| liwei | **wei | | | |
| wutong | wuto** | | | |
| lilisi | ****si | li | si | lilisi |
| qishili | qishi** | | | |
+----------+----------+------+------+---------+--+
select
regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k1\\=([^&]+)', 1),
regexp_extract('http://facebook.com/path1/p.php?k1=v1543643&k2=v3245#Ref1', '.*?k2\\=([^#]+)', 1)
from order_detail limit 1;
+-----------+--------+--+
| _c0 | _c1 |
+-----------+--------+--+
| v1543643 | v3245 |
+-----------+--------+--+
## URL解析:parse_url
语法: parse_url(string urlString, string partToExtract [, string keyToExtract])
说明:返回 URL 中指定的部分。
partToExtract 的有效值为: HOST, PATH, QUERY, REF,PROTOCOL, AUTHORITY, FILE, and USERINFO.
举例:
select
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PATH'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY','k2'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'REF'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'PROTOCOL'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'AUTHORITY'),
parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'FILE')
from order_detail limit 1;
+---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 | _c7 |
+---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+
| facebook.com | /path1/p.php | k1=v1&k2=v2 | v2 | Ref1 | http | facebook.com | /path1/p.php?k1=v1&k2=v2 |
+---------------+---------------+--------------+------+-------+-------+---------------+---------------------------+--+
## json解析:
get_json_object
语法: get_json_object(string json_string, string path)
说明:解析 json 的字符串 json_string,返回 path 指定的内容。如果输入的 json 字符串无效,那么返回 NULL。
select
get_json_object(
'{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}',
'$.owner'),
get_json_object(
'{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}',
'$.store.fruit[0].type')
from order_detail limit 1;
+------+--------+--+
| _c0 | _c1 |
+------+--------+--+
| amy | apple |
+------+--------+--+
## json_tuple
语法: json_tuple(string jsonStr,string k1,string k2, ...)
参数为一组键k1,k2……和JSON字符串,返回值的元组。该方法比 get_json_object 高效,因为可以在一次调用中输入多个键.
select a.user_id, b.*
from order_detail a
lateral view
json_tuple('{"store":
{"fruit":\[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
"bicycle":{"price":19.95,"color":"red"}
},
"email":"amy@only_for_json_udf_test.net",
"owner":"amy"
}', 'email', 'owner') b as email, owner limit 1;
+----------+---------------------------------+--------+--+
| user_id | email | owner |
+----------+---------------------------------+--------+--+
| zhangsa | amy@only_for_json_udf_test.net | amy |
+----------+---------------------------------+--------+--+
## parse_url_tuple
SELECT b.*
from
(
select 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1' as urlstr
from
order_detail
limit 1
)a
LATERAL VIEW
parse_url_tuple(a.urlstr, 'HOST', 'PATH', 'QUERY', 'QUERY:k1') b as host, path, query, query_k1 LIMIT 1;
+---------------+---------------+--------------+-----------+--+
| host | path | query | query_k1 |
+---------------+---------------+--------------+-----------+--+
| facebook.com | /path1/p.php | k1=v1&k2=v2 | v1 |
+---------------+---------------+--------------+-----------+--+
空格字符串:space 重复字符串:repeat 首字符ascii:ascii
select space(10), length(space(10)), repeat('abc',5), ascii('abcde')
from order_detail
limit 1;
+-------------+------+------------------+------+--+
| _c0 | _c1 | _c2 | _c3 |
+-------------+------+------------------+------+--+
| | 10 | abcabcabcabcabc | 97 |
+-------------+------+------------------+------+--+
左补足函数:lpad 右补足函数:rpad
语法: lpad(string str, int len, string pad)
说明:lpad将 str 进行用 pad 进行左补足到 len 位, rpad将 str 进行用 pad 进行右补足到 len 位
注意:与 GP,ORACLE 不同; pad不能默认
select lpad('abc',10,'td'),rpad('abc',10,'td')
from order_detail
limit 1;
+-------------+-------------+--+
| _c0 | _c1 |
+-------------+-------------+--+
| tdtdtdtabc | abctdtdtdt |
+-------------+-------------+--+
分割字符串函数: split
集合查找函数: find_in_set
语法: find_in_set(string str, string strList)
说明: 返回 str 在 strlist 第一次出现的位置, strlist 是用逗号分割的字符串。如果没有找该 str 字符,则返回 0
select
split('abtcdtef','t'),
find_in_set('ab','ef,ab,de'),
find_in_set('at','ef,ab,de')
from order_detail
limit 1;
+-------------------+------+------+--+
| _c0 | _c1 | _c2 |
+-------------------+------+------+--+
| ["ab","cd","ef"] | 2 | 0 |
+-------------------+------+------+--+
## string转map:str_to_map
语法:str_to_map(text[, delimiter1, delimiter2])
说明:使用两个分隔符将文本拆分为键值对。 Delimiter1将文本分成K-V对,Delimiter2分割每个K-V对。
对于delimiter1默认分隔符是',',对于delimiter2默认分隔符是':'。
select str_to_map('aaa:11&bbb:22', '&', ':')
from order_detail limit 1;
+--------------------------+--+
| _c0 |
+--------------------------+--+
| {"bbb":"22","aaa":"11"} |
+--------------------------+--+
select str_to_map('aaa:11&bbb:22', '&', ':')['aaa']
from order_detail limit 1;
+------+--+
| _c0 |
+------+--+
| 11 |
+------+--+
select str_to_map('aaa:11,bbb:22')
from person limit 1;
{"bbb":"22","aaa":"11"}
集合统计函数
## 个数统计:count 总和统计:sum
语法: count(*), count(expr), count(DISTINCT expr[, expr_.])
说明:
count(*)统计检索出的行的个数,包括 NULL 值的行;
count(expr)返回指定字段的非空值的个数;
count(DISTINCT expr[, expr_.])返回指定字段的不同的非空值的个数
语法: sum(col), sum(DISTINCT col)
说明:
sum(col)统计结果集中 col 的相加的结果; sum(DISTINCT col)统计结果中 col 不同值
select
count(*),count(user_type),count(distinct user_type),
sum(sales),sum(distinct sales)
from order_detail;
+------+------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 |
+------+------+------+------+------+--+
| 10 | 10 | 2 | 29 | 17 |
+------+------+------+------+------+--+
## 平均值统计:avg 最小值统计:min 最大值统计:max
select
avg(sales),avg(distinct sales),min(sales),max(distinct sales)
from order_detail;
+------+------+------+------+--+
| _c0 | _c1 | _c2 | _c3 |
+------+------+------+------+--+
| 2.9 | 3.4 | 1 | 6 |
+------+------+------+------+--+
## 标准差:stddev_samp, stddev, stddev_pop
stddev_pop <==> stddev
## 方差:var_samp, var_pop
当我们需要真实的标准差/方差的时候最好是使用: stddev stddev_pop var_pop
而只是需要得到少量数据的标准差/方差的近似值可以选用: stddev_samp var_samp
select
var_pop(sales),var_samp(sales),
stddev_pop(sales),stddev(sales),stddev_samp(sales),
pow(stddev_pop(sales),2),pow(stddev_samp(sales),2)
from order_detail;
+-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+
| _c0 | _c1 | _c2 | _c3 | _c4 | _c5 | _c6 |
+-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+
| 3.09 | 3.433333333333333 | 1.7578395831246945 | 1.7578395831246945 | 1.8529256146249728 | 3.0899999999999994 | 3.4333333333333336 |
+-------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--+
## 百分位数: percentile 近似百分位数: percentile_approx 直方图: histogram_numeric
语法: percentile_approx(DOUBLE col, p [, B])
返回值: double
说明: 求近似的第 pth 个百分位数, p 必须介于 0 和 1 之间,返回类型为 double,但是col 字段支持浮点类型。
参数 B 控制内存消耗的近似精度, B越大,结果的准确度越高。默认为 10,000。
当 col 字段中的 distinct 值的个数小于 B 时,结果为准确的百分位数
select
percentile(sales,0.2),
percentile_approx(sales,0.5,10000),
histogram_numeric(sales,5)
from order_detail;
+------+------+----------------------------------------------------------------------------------------------+--+
| _c0 | _c1 | _c2 |
+------+------+----------------------------------------------------------------------------------------------+--+
| 1.0 | 2.0 | [{"x":1.0,"y":3.0},{"x":2.0,"y":2.0},{"x":3.0,"y":2.0},{"x":5.0,"y":2.0},{"x":6.0,"y":1.0}] |
+------+------+----------------------------------------------------------------------------------------------+--+
select
inline(histogram_numeric(sales,10))
from order_detail;
+------+------+--+
| x | y |
+------+------+--+
| 1.0 | 3.0 |
| 2.0 | 2.0 |
| 3.0 | 2.0 |
| 5.0 | 2.0 |
| 6.0 | 1.0 |
+------+------+--+
### 后面可以输入多个百分位数,返回类型也为 array<double>,其中为对应的百分位数。
select
percentile(sales,array(0.2,0.4,0.6)),
percentile_approx(sales,array(0.2,0.4,0.6),10000)
from order_detail;
+----------------+----------------+--+
| _c0 | _c1 |
+----------------+----------------+--+
| [1.0,2.0,3.0] | [1.0,1.5,2.5] |
+----------------+----------------+--+
复杂类型访问操作及统计函数
测试数据集:
tony 1338 hello,woddd 1,2 a1,a2,a3 k1:1.0,k2:2.0,k3:3.0 s1,s2,s3,4
mark 5453 kke,ladyg 2,3 a4,a5,a6 k4:4.0,k5:5.0,k2:6.0 s4,s5,s6,6
ivyfd 4323 aa,thq,dsx 3,6 a7,a8,a9 k7:7.0,k8:8.0,k2:9.0 s7,s8,s9,9
drop table employees;
create external table if not exists employees(
name string,
salary string,
happy_word string,
happy_num array<int>,
subordinates array<string>,
deductions map<string,float>,
address struct<street:string,city:string,state:string,zip:int>
)
row format delimited fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':'
lines terminated by '\n'
stored as textfile;
hdfs dfs -put /home/liguodong/data/muldata.txt /temp/lgd
load data inpath '/temp/lgd/muldata.txt' overwrite into table employees;
select * from employees;
Getting log thread is interrupted, since query is done!
+--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+
| name | salary | happy_word | happy_num | subordinates | deductions | address |
+--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+
| tony | 1338 | hello,woddd | [1,2] | ["a1","a2","a3"] | {"k1":1.0,"k2":2.0,"k3":3.0} | {"street":"s1","city":"s2","state":"s3","zip":4} |
| mark | 5453 | kke,ladyg | [2,3] | ["a4","a5","a6"] | {"k4":4.0,"k5":5.0,"k2":6.0} | {"street":"s4","city":"s5","state":"s6","zip":6} |
| ivyfd | 4323 | aa,thq,dsx | [3,6] | ["a7","a8","a9"] | {"k7":7.0,"k8":8.0,"k2":9.0} | {"street":"s7","city":"s8","state":"s9","zip":9} |
+--------+---------+--------------+------------+-------------------+-------------------------------+---------------------------------------------------+--+
## 访问数组 Map 结构体
select
name,salary,
subordinates[1],deductions['k2'],deductions['k3'],address.city
from employees;
+--------+---------+------+------+-------+-------+--+
| name | salary | _c2 | _c3 | _c4 | city |
+--------+---------+------+------+-------+-------+--+
| tony | 1338 | a2 | 2.0 | 3.0 | s2 |
| mark | 5453 | a5 | 6.0 | NULL | s5 |
| ivyfd | 4323 | a8 | 9.0 | NULL | s8 |
+--------+---------+------+------+-------+-------+--+
## Map类型长度 Array类型长度
select size(deductions),size(subordinates) from employees limit 1;
+------+------+--+
| _c0 | _c1 |
+------+------+--+
| 3 | 3 |
+------+------+--+
## 类型转换: cast
select cast(salary as int),cast(deductions['k2'] as bigint) from employees;
+---------+------+--+
| salary | _c1 |
+---------+------+--+
| 1338 | 2 |
| 5453 | 6 |
| 4323 | 9 |
+---------+------+--+
### LATERAL VIEW 行转列
SELECT
name, ad_subordinate
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate;
+--------+-----------------+--+
| name | ad_subordinate |
+--------+-----------------+--+
| tony | a1 |
| tony | a2 |
| tony | a3 |
| mark | a4 |
| mark | a5 |
| mark | a6 |
| ivyfd | a7 |
| ivyfd | a8 |
| ivyfd | a9 |
+--------+-----------------+--+
SELECT
name, count(1)
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate
group by name;
+--------+------+--+
| name | _c1 |
+--------+------+--+
| ivyfd | 3 |
| mark | 3 |
| tony | 3 |
+--------+------+--+
SELECT ad_subordinate, ad_num
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate
LATERAL VIEW explode(happy_num) addTable2 AS ad_num;
+-----------------+---------+--+
| ad_subordinate | ad_num |
+-----------------+---------+--+
| a1 | 1 |
| a1 | 2 |
| a2 | 1 |
| a2 | 2 |
| a3 | 1 |
| a3 | 2 |
| a4 | 2 |
| a4 | 3 |
| a5 | 2 |
| a5 | 3 |
| a6 | 2 |
| a6 | 3 |
| a7 | 3 |
| a7 | 6 |
| a8 | 3 |
| a8 | 6 |
| a9 | 3 |
| a9 | 6 |
+-----------------+---------+--+
### 多个LATERAL VIEW
SELECT
name, count(1)
FROM employees
LATERAL VIEW explode(subordinates) addTable AS ad_subordinate
LATERAL VIEW explode(happy_num) addTable2 AS ad_num
group by name;
+--------+------+--+
| name | _c1 |
+--------+------+--+
| ivyfd | 6 |
| mark | 6 |
| tony | 6 |
+--------+------+--+
### 不满足条件产生空行
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW
explode(array()) BB AS a limit 10;
+-------+----+--+
| name | a |
+-------+----+--+
+-------+----+--+
### OUTER 避免永远不产生结果,无满足条件的行,在该列会产生NULL值。
SELECT AA.name, BB.* FROM employees AA
LATERAL VIEW
OUTER explode(array()) BB AS a limit 10;
+--------+-------+--+
| name | a |
+--------+-------+--+
| tony | NULL |
| mark | NULL |
| ivyfd | NULL |
+--------+-------+--+
### 字符串切分成多列
SELECT
name, word
FROM employees
LATERAL VIEW explode(split(happy_word,',')) addTable AS word;
+--------+--------+--+
| name | word |
+--------+--------+--+
| tony | hello |
| tony | woddd |
| mark | kke |
| mark | ladyg |
| ivyfd | aa |
| ivyfd | thq |
| ivyfd | dsx |
+--------+--------+--+
转载:https://blog.csdn.net/scgaliguodong123_/article/details/60881166