hive函数

最新推荐文章于 2024-08-07 23:49:36 发布

SXF2410

最新推荐文章于 2024-08-07 23:49:36 发布

阅读量223

点赞数

文章标签： hive

本文链接：https://blog.csdn.net/Shixuefeng2410/article/details/124446959

版权

hive函数

#UDF(单进单出)
#UDTF聚合函数（多进单出）
#UDAF（多进多出）

wc 命令让我们可以计算文件的 Byte 数、字数、或是列数
wc [-clw][--help][--version][文件...]
参数
-c , --bytes , --chars 只显示Bytes数
-l , --lines 只显示列数
-w , --words 只显示字数
--help 在线帮助
--version 显示版本信息

[root@localhost ~]# wc demo1.txt
7 92 607 demo1.txt # demo1.txt 文件的行数为3、单词数92、字节数598
wc -l hive-default.xml

#数学函数
   https://blog.csdn.net/weixin_47391932/article/details/107307452
   double log(double base ,double a)                #底数为base的a的对数double
   pow(double base, double p)                        #底数为base的p的幂值double
   conv(bigint/string num,int from_base,int to_base)#将数值num从from_base禁制转为to_base禁制string
       long E=0xabcd;
   pmod(int/double a,int/double b) #求a余b的结果int/double
   [a]sin(double/decimal a) #求a的正反正弦值double
   [a]cos(double/decimal a ) #求a的正反余弦值double
   [a]tan(double/decimal a ) #求a的正反正切值double
   degree(double/decimal a ) #将弧度a转为角度double
   radians(double/decimal a) #将角度a转为弧度double
   positive(int/double a) #返回a int/double
   negative(int/double a) #返回a的相反数 int/double
   sign(double/deciaml a ) #返回a 的符号
   e()/pi()                   #E PI double
   greatest(T...vs)               #返回最大值（参数只能列举，不支持集合）
   least(T...vs)               #返回最小值
   bround(double a, int b)       #double 财务舍入法四舍六入五考虑，五后非零就进一，五后为零看奇偶，五前为偶应舍去，五前为奇要进一
   factorial(int a )        #int 20 以内阶乘最大只能到20
   shiftright(int a,int b)       #位右移
   shiftleft(int a, int b)       #位左移

#类型转换 1+14+6+31+13+3+5+6
   #hive原子类型支持向上隐式转换（类似于多态）
       #如实际类型为tinyint。实际需要int。会自动转换
       #但实际类型为int，实际需要为tinyint，则会报错
   type cast(strVal as <type>) #将字符串转换为其他类型
       #select cast('12.7'as int);=》12
#集合函数
   int           size(Map<K,V>/Array<T>)
       #select size(stuhobby),size(stuscore)from student_ext;

   Array<K>   map_keys(Map<K,V>)
       #select map_keys(stuscore) from student_ext;
   Array<K>   map_values(Map<K,V>)
       #select map_values(stuscore) from student_ext;
   boolean       array_contains(Array<T>,T)   #contains_key contains_value
       #select array_contains(array('eat','sing'),'eat');
       #select array_contains(stuhobby,'eat') from student_ext;
       #select * from student_ext where array_contains(stuhobby,'sing');
   Array<T>    sort_array(Array<T>) #升序       #sort_map_key sort_map_value
       #select stuname, sort_array(map_values(stuscore))[size(stuscore)-1] from student_ext;求分数最大值，也就是最后一位下标
   Array<T>    array(T...t)               #where...int
       #select array('s','d','f');
   Map<K,V>   map(K,k1,V v1...)
   #select map('henery',18,'pola',20,'ariel',7);
   Struct<...>   struct(p1,v1,p2,v2...)
   #select struct('province','js','city','nj').co12=》js
   #select B.col2 from(select struct('province','js','city','nj') B) A;=》js

#日期函数
   string        from_unixtime(bigint time, string time_format)
       #select from_unixtime(164566778,'yyyy-MM-dd');
   string        date_format(date/timestamp/string date.string format)
       #select date_format('2022-1-22 3:44:12 444','yyyy/MM/dd');
   string       to_date(string timestamp)
       #select to_date('2021-01-22');
   string        current_date()
       #select current_date();
   string        current_timestamp()
       #select current_timestamp();
   bigint        unix_timestamp()
       #1675687897
   bigint       unix_timestamp(string datetime,string format_pattern)
       #select unix_timestamp('2021-6-12','yyyy-MM-dd');
   string        date_add(string date,int days) date_sub(string date,int days)
       #select date_add(current_date(),2);
   string        add_months(string date,int numberofmonths,string output_date_format)
       #select add_months(current_date(),1);
       #select add_months(current_date(),1,'yyyy=MM');
   string        next_day(string date, string dayOfWeek)
       #select next_day(current_date(),'mo');       下一个星期几 MO,TU,WE,TH,FR,SA,SU
   string       last_day(string date)
       #select last_day('2020-2-12');       当前日期月份的最后一天
   string        trunc(string date,string format)
       #select trunc(current_date(),'YY');               本年第一天
       #select trunc(current_date(),'Q');               本季第一天
       #select trunc(current_date(),'MM');               本月第一天
       #select date_add(trunc(current_date(),'mo'),-7);本周第一天
   int           datediff(string datefrom,string dateto)
       #select datediff(current_date(),'2021-5-5');
   double        months_between(string datefrom,string dateto)
       #select months_between('2021-6-12','2020-10-10')；
   year(date/string dt)....
   dayofweek(date/string dt)

   #条件函数
   T        if(boolean,T vtrue,T vfalse)
   T        nvl(T value,T default)
           #select nvl('HELLO','DEFAULT');
           #SELECT NVL(null,'DEFAULT');
   T       coalesce(T...vs)
           #select coalesce(null,aa,null,33); 返回第一个非null值，单行中多列操作
   T       case a when b then... when c then ...else...end
   T       case when a b then... when c then ...else...end
   boolean isnull(a)
   boolean   isnotnull(a)

#字符串函数
   int    ascii(string a)
           #select ascii('hello');       返回字符串首字母ASCII码
   string    concat_ws(string sep,array<string>/string...array)
           #select concat_ws(',',hobbies) from tabtype;
           -----------------------------------------------------


           -------------------------------------------------------
   array<array<string>> sentences(string sentences)
           #select sentences('hello there . How are you?');
   #按连续N个单词出现的频次，倒序取Top K
   array<struct<string,double>> ngrams(array<array<string>> arr,int n,int k)
       #select ngram(sentences('Hello are there. Hello are Hello are you?'),1,2); @1
       #select B[0].ngram,B[0].estfrequency from(
       #select ngrams(sentences('hello. how are you? fine! thank you! and you?'),1,2) B
       #) A
+----------------------------------------------------+
| _c0 |
+----------------------------------------------------+
| [{"ngram":["you"],"estfrequency":3.0},{"ngram":["how"],"estfrequency":1.0}] |
+----------------------------------------------------+

   array<struct<string,double>> context_ngrams(array<array<string>> arr,array<string>,int k)
       #select context_ngram(sentences('Hello are there. Hello are Hello are you?'),array(null,null),2); 等同@1
       #select context_ngram(sentences('Hello are there. Hello are Hello are you?'),array('hello',null),2);   #和hello一起出现，而且在hello之后出现的词频排名前2的词

   #'US-ASCII' 'ISO-8859-1','UTF-8','UTF-16BE','UTF-16LE','UTF-16'
   binary   encode(string source,string charset)
       #select encode('我爱中华001abc你们','utf-8');
       #select encode('我爱中华001abc你们','ISO-8859-1');
       #select decode(encode('我爱中华001abc你们','ISO-8859-1'),'UTF-8');

   string    format_number(decimal num,int d)
       #类似bround
   string get_json_object(string json,string path)
       #select get_json_object(
       '{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','$.name');
       #select get_json_object(
       '{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','$.hobbies[0]');
       #select get_json_object(
       '{"name":"henry","hobbies":["s","a","c"],"address":{"province":"js","city":"nj"}}','$.address.city');

   boolean    in_file(string line,string path)   #path指向的文件中是否包含line内容
   int           instr(string,substring)               #从string中找到substring中首字母位置（1~）
       #select instr('hello world are','hello');
       #select instr('hello world are','world');
   int        locate(subString, string[,pos])       #从string[pos位置开始]中找到substring中首字母位置（1~）
       #select locate('world','hello world are');
       #select locate('world','hello world are',8);

   #HOST,PATH,QUERY,REF,PROTOCOL,AUTHORITY,FILE,and USERINFO
   string parse_url(string urlString,string part[,string key])   #parse_url(url,'QUERY','key')
       #select parse_url('https://pages.tmall.com/wow/z/lianmeng/default/route-lianmeng-index?wh_alimama=true&disableNav=YES&es=JtY2%2FNeCVhQN%2BoQUE6FNzAGXmIayN1PAVLsT5%2B%2BeDotjSfgTdn9YoQXyHRE9zKSgs%2FhBCF9ZJgI%3D&ali_trackid=2:mm_1183900030_1813100136_110529050082:1641898323_171_1844465833&union_lens=lensId:OPT@1641614531@21043e97_07e1_17e37d9d9fd_4fb6@01;eventPageId:20150318020008790;recoveryid:1641898323_171_1844465833&bxsign=tbkvUIpLIPktSBlxRjR2gprwXMFz0XmIBxC1AkF8HTRFcDeteGHHToCvdrS2b2uf6veUQ6RSH4w2EQux7YBqOlKHEr6ttssunXt5MReoJI8hnA=','HOST');

   # %s %d %.nf
   string    printf(string format,T...t)
   boolean like # %
   boolean rlike # [] {} ? + * +d \w ...
           #rlike '^regexp$'
   string    regexp_replace(string src,string regex,string replacement)
       #select regexp_replace('a12bbcad12abcdf','[a-z]\\d{2}','XXX');
   string   regex_extract(string src,string regex,int index)
       #select regex_extract('namehenryokdalingduck','name:(.*?),gender(.*?)$',3);
   array<string> split(string src,string regex) #分割产生以为数组
       #select split("ash,ous,dhfj,ks,ksjghk!kh,ds",',');=》["ash","ous","dhfj","ks","ksjghk!kh","ds"]
   sting translate(string src, string chars,string dchars)
       #select translate('axbyc','xy','#Y'); =>a#bYc
   string initcap(string src)
       #select initcap('henry'); =>Heney 首字母大写
   string   substr(string src,int begin[,int len])
   string substring_index(string src,string sep,int count)
       #select substr('henry.chen@qq.com',7);=chen@qq.com
       #select substr('henry.chen@qq.com',12,2);=>qq
       # select sentences('henry.chen@qq.com',7)[0][1];
       #select split(sentences('henry.chen@qq.com',7)[0][1],'\\.')[0];
       #select substring_index(substring_index('henry.chen@qq.com','@',-1),'.',1);
       #select substring('henry.chen@qq.com',locate('@','henry.chen@qq.com')+1,2);
       #select substring_index(substring( 'henry.chen@qq.com',locate('@','henry.chen@qq.com')+1),'.',1);

   #非对称加密
   srting md5(string src)
       #select md5('kb16'); // => 2ed0188588ab9a11c2ba86e2ba1f25dd
       #select md5(concat('abc',md5('kb16'),'abc'));

   #对称加密base64
   string   base64(binary b)
   string   unbase64(string pass)
       #select base64(cast('kb16'as binary));=>a2IxNg==
       #select unbase64('a2IxNg==');=>kb16
   binary aes_decrypt               #16为加密
       aes_encrypt
       #select base64(aes_encrypt('kb16','0123456789abcdef'));=>5veJ4u8pJ+yteijnArp1Qg==
       #select aes_decrypt(unbase64('5veJ4u8pJ+yteijnArp1Qg=='),'0123456789abcdef');=>kb16

   #字符串a的正反16进制转化
   string   hex(string a)
   string   unhex(string a)
       #select hex('kb16');=》6B623136
       #select unhex('6B623136');=》kb16
   string    soundex(string src) #首字母+3个数字       比较两个字符串之间的差异
       #select soundex('henry');=》H560
       #select soundex('heny');=》H500

   #聚合函数
   int    count(*) #all rows
   int    count(expr) #rows where expr is not null
   bigint   count(distinct expr[,expr...])#rows where all the expr is not null and unique
           /*create table hive_test(
           name string,
           age int
           );
           drop table hive_test;
           show tables;
           insert into hive_test(name,age)
           select 'henry',18 union all
           select 'henry',18 union all
           select 'henry',null union all
           select 'henry',18 union all
           select 'henry',null;
           count(*):计算每一行，count(字段)，null不计入。distinct记一个
           select count(1) from hive_test;=>5
           select count(age) from hive_test;=>3
           select count(distinct name) from hive_test;=>1*/

   double    sum(distinct expr)/avg(distinct expr)/min(col)/max(col)
   double   var_pop   #方差（离散程度）
   double    var_sample(col) #样本方差
   double   stddev_pop(col)#标椎偏差（是否存在坏值）
   double    stddev_sample(col)#样本标准偏差
   double   covar_pop(col1,col2)#协方差
   double   covar_sample(col1,col2)#样本协方差
   double   corr(col1,col2)#两列数值的相关关系
   double   percentile(bigint col,int p) #返回col的百分位数

   collect_list(col)   #和mysql中group_concat()类似，聚合，产生数组
   collect_set(col)   #自动去重，聚合，产生数组
   concat_ws(',',collect_list|set(field))# <=>group_concat in mysql
   select collect_set(stugender)
   from student_ext;
   #------------------------
   ["m","f"]
   #--------------------------
   select stugender,collect_list(stuhobby) hobbies
   from student_ext
   group by stugender;       #collect_list产生数组，在原有数组上生成二位数组
   #-----------------------------------------------------------
   m   [["sing","dance","read"],["carton","dance","read"]]
   f   [["sing","eat","read"]]
   #----------------------------------------------------------------
   select stugender,concat_ws(',', collect_list(concat_ws(',',stuhobby))) hobbies
   from student_ext group by stugender; #concat_ws去掉数组形式
   #------------------------------------------------------
   f   sing,eat,read
   m   sing,dance,read,carton,dance,read
   #---------------------------------------------------------------------------
   select stugender,split(concat_ws(',', collect_list(concat_ws(',',stuhobby))),',') hobbies
   from student_ext group by stugender;   #用split分割，生成一维数组
   #--------------------------------------------------------------------------------------------
   f   ["sing","eat","read"]
   m   ["sing","dance","read","carton","dance","read"]
   #--------------------------------------------------------------------------------------------
#表生成函数
   #侧视图 select ...from TABLE_NAME lateral view func(...) V_ALIAS as f1,...
   group by +collect_list|set=>（多行转单列）
   group by +case when =>（多行转多列）

   T/row explode(array<T>/Map<K,V>)   （单列转多行）   # 一列里多个元素炸裂成多行（炸裂array，炸裂map）
   #select stuname,stuage,stugender,hobby
   #from student_ext lateral view explode(stuhobby) T as hobby;

   #select stugender,collect_set(hobby) dist_hobby
   #from student_ext lateral view explode(stuhobby) T as hobby
   #group by stugender;       #炸裂array

   #select stuname,stuage,stugender,subject,score
   #from student_ext lateral view explode(stuscore) T as subject,score;   #炸裂map


   row posexplode(array<T>)       #炸裂array，看到数组的元素和其下标，pos 表示元素在数组中的下标
   #select stuname,stuage,stugender,pos,hobby
   #from student_ext lateral view posexplode(stuhobby) T as pos,hobby;

   #取stuhobby数组最后一个元素,(最后一个元素的下标和数组长度-1相等
   select stuname,stuage,stugender,hobby
from (
select stuname, stuage, stugender, pos, hobby, size(stuhobby) - 1 ix
from student_ext lateral view posexplode(stuhobby) T as pos, hobby
)T
where pos=ix;
   #----------------------------------------------------------------------------
   with T1 as (           #多个子查询可以with写，最内层的子查询最先写，后面可以调用前面的
select stuname, stuage, stugender, pos, hobby, size(stuhobby) - 1 ix
from student_ext lateral view posexplode(stuhobby) T as pos, hobby
   ),
       T2 as(
       ),
       T3 as(
       ),..
   select stuname,stuage,stugender,hobby
   from T1 where pos=ix;
   #------------------------------------------------------------------------------------

   row   stack(rowNum,v1,...,vm)# （多列转多行）m/n为整数，m列个数据，分成rowNum行
   size(array|map)相同   =》单列转多列
   #desc function extended stack;
   #stack(n, cols...) - turns k columns into n rows of size k/n each
   select stack(2,'A',10,date '2015-01-01','B',20,date '2016-01-01') as(a,b,c);

   tuple   json_tuple(string json,string...key)   (key,value),...           #一次性解析json格式
   #-----------------------------------------------------------------------------------------------------------------------
   select json_tuple('{"name":"henry","hobbies":["a","s","c"],"address":{"province":"js","city":"nj"}}','name','hobbies'
,'address');

   select json_tuple(content,'name','hobbies','address') from json_test;

   #---------------------------------------------------------------------------------------------------------------------
   tuple   parse_url_tuple(string url, string..parts)
   select name,hobby,province,city,protocl,host,path
   from json_test
lateral view json_tuple(json_content,'name','hobbies','address') T1 as name,hobbies,address
lateral view json_tuple(address,'province','city') T2 as province,city
lateral view explode(split(regexp_replace(hobbies,'\\[|]|"',''),',')) T3 as hobby
lateral view parse_url_tuple(url_content,'PROTOCOL','HOST','PATH') T4 as protocl,host,path;

   row       inline(array<struct<...>>)

#窗口函数：横向扩展表
#如需控制范围需要指定...over(...rows between ??? and ???)
first_value(col) : 分组内排序后截止到当前行的第一个值
last_value(col) : 分组内排序后截止到当前行的最后一个值
lag(col,n,DF) : 窗口内往前第n行col的值
lead(col,n,DF) : 窗口内往后第n行col的值
n 可选,默认为1
df 默认值,如果第n行col值为null,去DF值
   #控制粒度   如需控制范围，需指定
   #over从句
   func over(partition by field1,... order by fielda,... rows betwee ...and ...)
   func over(distribute by field1,... sort by fileda,... rows betwee ...and ...)
       over()全表
       over(partition by filed1,...) 所有的分区数据
       over(partition by field1,... order by fielda,...) 从当前分区首行到当前行；order by 全局排序,会把所有的reducer 拉到一个job中
   func over(distribute by field1,... sort by fileda,...)   sort by 当前分区排序当前reducer排序
   func over(distribute by field1,... sort by fileda,... rows|range between ... and ...)
   #rows是物理窗口，range是逻辑窗口
   #range between 按照列值限制窗口大小（在非条件表达式中等同于rows）
   #rows表示行，就是前n行，后n行
   #range表示的是具体的值，比这个值小n的行，比这个值大n的行
   unbounded preceding ... unbounded following       向前无线行，向后无限行（就是从第一行到最后一行）（有分区，以分区为边界，否则以表为边界）
   unbounded preceding ... current row （当前行）（从第一行到当前行）
   N preceding | N following    (从当前行的前N行到后N行)

   1.使用标椎的聚合函数count、sum、min、max、avg
   2、使用partition by 语句，使用一个或多个原始数据类型的列
   3、使用PARTITION BY与ORDER BY语句，使用一个或者多个数据类型的分区或者拍序列
   4、使用窗口规范，窗口规范支持一下格式：
   (ROW | RANGE) BETWEEN (UNBOUNDED | [num]) PRECEDING AND ([num] PRECEDING | CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
   (ROW | RANGE) BETWEEN CURRENT ROW AND (CURRENT ROW | (UNBOUNDED | [num]) FOLLOWING)
   (ROW | RANGE) BETWEEN [num] PRECEDING AND (UNBOUNDED | [num]) FOLLOWING
   5、当ORDER BY后面缺少窗口从句条件，窗口规范默认是RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
   6、当ORDER BY和窗口从句都缺失，窗口规范默认是：ROW BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING

   #分析函数
   row_number(); #从1开始的行序号
   rank();#从1开始的名次（并列出现空缺）1,2,2,4
   dense_rank();#从1开始的名次（并列不留空缺）1,2,2,3
   cume_dist();#小于等于当前值的行数/分组内总行数
               #比如，统计小于等于当前薪水的人数，所占总人数的比例
   percent_rank();#分组内当前行的rank值-1/分组内总行数-1
   ntile(n);#用于将分组数据按照顺序切分成n片，返回当前切片值，如果切片不均匀，默认增加第一个切片的分布
               #ntile不支持rows between
               比如， ntile(2) over(partition by cookieid order by createtime row between 3 preceding and current row)


window_func | aggr_func | analysis_func over()
窗口函数聚合函数分析函数

desc function sort_array;
select stuname,sort_array(map_values(stuscore)) from student_ext;
select stuname,sort_array(map_values(stuscore))[size(stuscore)-1] from student_ext;
select struct('java',22,'mysql',45);
select map('java',22,'mysql',45);

SXF2410

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
hive函数

hive函数#UDF(单进单出)#UDTF聚合函数（多进单出）#UDAF（多进多出）wc 命令让我们可以计算文件的 Byte 数、字数、或是列数wc [-clw][--help][--version][文件...]参数-c , --bytes , --chars 只显示Bytes数-l , --lines 只显示列数-w , --words 只显示字数--help 在线帮助--version 显示版本信息[root@localhost ~]# wc demo1.txt 7 .
复制链接

扫一扫