1.Hive functions show函数显示
[cevent@hadoop207 hive-1.2.1]$ bin/beeline
Beeline version 1.2.1 by Apache Hive
beeline> !connect jdbc:hive2://hadoop207.cevent.com:10000
Connecting to jdbc:hive2://hadoop207.cevent.com:10000
Enter username for jdbc:hive2://hadoop207.cevent.com:10000: cevent
Enter password for jdbc:hive2://hadoop207.cevent.com:10000:
******
Connected to: Apache Hive (version 1.2.1)
Driver: Hive JDBC (version 1.2.1)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://hadoop207.cevent.com:10000> show functions; 显示hive自带函数包
+-------------------------+--+
|
tab_name |
+-------------------------+--+
| !
|
| !=
|
| %
|
| & |
| *
|
| + |
| -
|
| /
|
| <
|
| <=
|
| <=> |
| <> |
| =
|
| ==
|
| >
|
| >=
|
| ^
|
| abs
|
| acos
|
| add_months
|
| and
|
| array
|
| array_contains
|
| ascii
|
| asin
|
| assert_true
|
| atan
|
| avg |
| base64
|
| between
|
| bin
|
| case
|
| cbrt
|
| ceil
|
| ceiling
|
| coalesce
|
| collect_list
|
| collect_set
|
| compute_stats
|
| concat
|
| concat_ws
|
| context_ngrams
|
| conv
|
| corr
|
| cos
|
| count |
| covar_pop
|
| covar_samp
|
| create_union
|
| cume_dist
|
| current_database
|
| current_date
|
| current_timestamp
|
| current_user
|
| date_add |
| date_format
|
| date_sub
|
| datediff
|
| day
|
| dayofmonth
|
| decode
|
| degrees
|
| dense_rank
|
| div
|
| e
|
| elt
|
| encode
|
| ewah_bitmap
|
| ewah_bitmap_and
|
| ewah_bitmap_empty
|
| ewah_bitmap_or
|
| exp
|
| explode
|
| factorial
|
| field
|
| find_in_set
|
| first_value
|
| floor
|
| format_number
|
| from_unixtime
|
| from_utc_timestamp
|
| get_json_object
|
| greatest
|
| hash
|
| hex
|
| histogram_numeric
|
| hour
|
| if |
| in
|
| in_file
|
| index
|
| initcap
|
| inline
|
| instr
|
| isnotnull
|
| isnull
|
| java_method
|
| json_tuple
|
| lag
|
| last_day
|
+-------------------------+--+
|
tab_name |
+-------------------------+--+
| last_value
|
| lcase
|
| lead
|
| least
|
| length
|
| levenshtein
|
| like |
| ln
|
| locate
|
| log
|
| log10
|
| log2
|
| lower
|
| lpad
|
| ltrim
|
| map |
| map_keys
|
| map_values
|
| matchpath
|
| max
|
| min |
| minute
|
| month
|
| months_between
|
| named_struct |
| negative
|
| next_day
|
| ngrams
|
| noop
|
| noopstreaming
|
| noopwithmap
|
| noopwithmapstreaming
|
| not
|
| ntile
|
| nvl
|
| or
|
| parse_url
|
| parse_url_tuple
|
| percent_rank
|
| percentile
|
| percentile_approx
|
| pi
|
| pmod
|
| posexplode
|
| positive
|
| pow
|
| power
|
| printf |
| radians
|
| rand
|
| rank
|
| reflect
|
| reflect2
|
| regexp
|
| regexp_extract
|
| regexp_replace
|
| repeat |
| reverse
|
| rlike
|
| round
|
| row_number
|
| rpad |
| rtrim
|
| second
|
| sentences
|
| shiftleft |
| shiftright
|
| shiftrightunsigned
|
| sign
|
| sin
|
| size |
| sort_array
|
| soundex
|
| space
|
| split |
| sqrt
|
| stack
|
| std
|
| stddev
|
| stddev_pop |
| stddev_samp
|
| str_to_map
|
| struct
|
| substr
|
| substring
|
| sum
|
| tan
|
| to_date
|
| to_unix_timestamp
|
| to_utc_timestamp
|
| translate
|
| trim
|
| trunc
|
| ucase
|
| unbase64
|
| unhex
|
| unix_timestamp
|
| upper
|
| var_pop
|
| var_samp
|
+-------------------------+--+
|
tab_name |
+-------------------------+--+
| variance
|
| weekofyear
|
| when
|
| windowingtablefunction
|
| xpath
|
| xpath_boolean
|
| xpath_double
|
| xpath_float
|
| xpath_int
|
| xpath_long
|
| xpath_number
|
| xpath_short
|
| xpath_string
|
| year
|
| |
|
| ~
|
+-------------------------+--+
216 rows selected (0.538 seconds)
0: jdbc:hive2://hadoop207.cevent.com:10000> desc function stack; 栈函数解析
+-------------------------------------------------------------------+--+
| tab_name |
+-------------------------------------------------------------------+--+
| stack(n, cols...) - turns k columns into n rows of size
k/n each |
+-------------------------------------------------------------------+--+
1 row selected (0.028 seconds)
2.进入idea配置hadoop-hive
1
2.
3.
4.
3.导入POM依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.cevent</groupId>
<artifactId>hadoop_hive_userdefinedfunc_plugin</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- 导入POM依赖:https://mvnrepository.com/artifact/org.apache.hive/hive-exec
-->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.1</version>
</dependency>
</dependencies>
</project>
点击import change
maven仓库链接:
-
https://mvnrepository.com/
-
https://mvnrepository.com/?cf_chl_jschl_tk=c8ab2f2496743ee5687eafee5f4367d2a60c7beb-1591278001-0-AcF1mcHeS_z0Xk3Crd6GBxBqwvZNt6sK7SVRVJTrNCWPIwz7DXjiNNFqjomEWdymGibN3nh664hbLuF4IF3OrP1X4p_8bZVN0AUv2OaYJAiU9EQKgm52gMHSEODroeXYsgu89VKLwututAO-oimGgVLLJ3xN9z0KGVJlOpYhM8AwOSyepw7Lqi127oLZdLOXFwekmdvUNvdDQ10YkXmOASAJqttkTpSSybR38Uojx6kOWjMJKfRdEb2jmT-_DIC40tEh2vsnQcl6c7PVrfVIrnDKKBLZJDRiKRWrXfrl6L7QZhXu6jjZy6KF8OcSu9iMXDSbCkj9-yB40BE3Ze7bneE
搜索hadoop的aggragator聚合器
修改idea光标:file→settings
搜索editor,选择appearance,取消勾选块脱字符use block caret
设置创建类的自动注释
必须点击enable live templates
配置如下
/**
* Created by Cevent on ${DATE} ${TIME}.
*/
添加文件内的注释模板
#if (${PACKAGE_NAME}
&& ${PACKAGE_NAME} !=
"")package ${PACKAGE_NAME};#end
#parse("File Header.java")
/**
* @author cevent
* @description ${DESCRIPTION}
* @date ${DATE} ${TIME}
*/
public class ${NAME} {
}
呼出maven project
打包jar文件lifecycle-package
出现target目录
创建类
package com.cevent.hadoop.hive;
/**
* Created by Cevent on 2020/6/4.
*/
import org.apache.hadoop.hive.ql.exec.UDF;
/**
* @author cevent
* @description 用户自定义函数
* hive的evaluate评估函数支持重载
*/
public class CeventUserDefinedFunction extends UDF{
//返回字符串/数字的长度,字符串类型:函数必须启用evaluate名,否则hive调不出
public int evaluate(String lines){
if(lines==null){
return 0;
}else return lines.length();
}
//数字类型:
public int evaluate(Number out){
if(out==null){
return 0;
}else return out.toString().length();
}
//布尔类型:
public int evaluate(Boolean flag){
if(flag==null){
return 0;
}else return flag.toString().length();
}
}
4.准备存储文件路径
5.配置YUM
遇到bug(Cannot retrieve repository metadata repomd.xml),需要修改yum.repos.d中CevntOS-Base.repo配置
原版YUM
[cevent@hadoop207 yum]$ cd /etc/yum.repos.d/
[cevent@hadoop207 yum.repos.d]$ ll
总用量 28
-rw-r--r--. 1 root root 1881 3月 14 10:02 CentOS-Base.repo
-rw-r--r--. 1 root root 1991 5月 19 2016 CentOS-Base.repo.bak
-rw-r--r--. 1 root root 647 5月 19 2016
CentOS-Debuginfo.repo
-rw-r--r--. 1 root root 289 5月 19 2016
CentOS-fasttrack.repo
-rw-r--r--. 1 root root 630 5月 19 2016 CentOS-Media.repo
-rw-r--r--. 1 root root 6259 5月 19 2016 CentOS-Vault.repo
[cevent@hadoop207 yum.repos.d]$ vim CentOS-Base.repo
# CentOS-Base.repo
#
# The mirror system uses the connecting
IP address of the client and the
# update status of each mirror to pick
mirrors that are updated to and
# geographically close to the
client. You should use this for CentOS
updates
# unless you are manually picking other
mirrors.
#
# If the mirrorlist= does not work for
you, as a fall back you can try the
# remarked out baseurl= line instead.
#
#
[base]
name=CentOS-local
baseurl=file:///mnt/cdrom
#mirrorlist=file:///mnt/cdrom
gpgcheck=1
enable=1
gpgkey=http:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-6
#released updates
[updates]
name=CentOS-$releasever - Updates
baseurl=http://mirrors.163.com/centos/$releasever/updates/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=updates
gpgcheck=1
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
#additional packages that may be useful
[extras]
name=CentOS-$releasever - Extras -
163.com
baseurl=http://mirrors.163.com/centos/$releasever/extras/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=extras
gpgcheck=1
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
#additional packages that extend functionality
of existing packages
[centosplus]
name=CentOS-$releasever - Plus - 163.com
baseurl=http://mirrors.163.com/centos/$releasever/centosplus/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=centosplus
gpgcheck=1
enabled=0
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
#contrib - packages by Centos Users
[contrib]
name=CentOS-$releasever - Contrib -
163.com
baseurl=http://mirrors.163.com/centos/$releasever/contrib/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=contrib
gpgcheck=1
enabled=0
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
~
修改版YUM
# CentOS-Base.repo
#
# The mirror system uses the connecting
IP address of the client and the
# update status of each mirror to pick
mirrors that are updated to and
# geographically close to the
client. You should use this for CentOS
updates
# unless you are manually picking other
mirrors.
#
# If the mirrorlist= does not work for
you, as a fall back you can try the
# remarked out baseurl= line instead.
#
#
[base]
name=CentOS-$releasever - Base - 163.com
baseurl=http://mirrors.163.com/centos/$releasever/os/$basearch/
#mirrorlist=file:///mnt/cdrom
gpgcheck=1
enable=1
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
#released updates
[updates]
name=CentOS-$releasever
- Updates - 163.com
baseurl=http://mirrors.163.com/centos/$releasever/updates/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=updates
gpgcheck=1
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
#additional packages that may be useful
[extras]
name=CentOS-$releasever - Extras -
163.com
baseurl=http://mirrors.163.com/centos/$releasever/extras/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=extras
gpgcheck=1
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
#additional packages that extend
functionality of existing packages
[centosplus]
name=CentOS-$releasever - Plus - 163.com
baseurl=http://mirrors.163.com/centos/$releasever/centosplus/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=centosplus
gpgcheck=1
enabled=0
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
#contrib - packages by Centos Users
[contrib]
name=CentOS-$releasever - Contrib -
163.com
baseurl=http://mirrors.163.com/centos/$releasever/contrib/$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=$releasever&arch=$basearch&repo=contrib
gpgcheck=1
enabled=0
gpgkey=http://mirror.centos.org/centos/RPM-GPG-KEY-CentOS-6
~
6.清洗yum,建立yum缓存
[cevent@hadoop207 yum.repos.d]$ yum clean all
已加载插件:fastestmirror, refresh-packagekit, security
Cleaning repos: base extras updates
清理一切
Cleaning up list of fastest mirrors
[cevent@hadoop207 yum.repos.d]$ yum makecache
已加载插件:fastestmirror, refresh-packagekit, security
Determining fastest mirrors
base
| 3.7 kB 00:00
base/group_gz | 242 kB
00:00
base/filelists_db | 6.4 MB 00:01
base/primary_db
| 4.7 MB 00:01
base/other_db |
2.8 MB 00:00
extras
| 3.4 kB 00:00
extras/prestodelta
| 2.2 kB 00:00
extras/other_db | 14 kB
00:00
updates
| 3.4 kB 00:00
updates/filelists_db |
6.9 MB 00:05
updates/prestodelta
| 318 kB 00:00
updates/primary_db
| 9.9 MB 00:02
updates/other_db | 415
kB 00:00
元数据缓存已建立
7.安装yum install -y lrzsz
[cevent@hadoop207 yum.repos.d]$ cd /opt/module/datas/
[cevent@hadoop207 datas]$ ll
总用量 18624
-rw-rw-r--. 1 cevent cevent 147 5月 10 13:46 510test.txt
-rw-rw-r--. 1 cevent cevent 266 5月 17 13:52 business.txt
-rw-rw-r--. 1 cevent cevent 129 5月 17 13:52 constellation.txt
-rw-rw-r--. 1 cevent cevent 71 5月 17 13:52 dept.txt
-rw-rw-r--. 1 cevent cevent 78 5月 17 13:52 emp_sex.txt
-rw-rw-r--. 1 cevent cevent 656 5月 17 13:52 emp.txt
drwxrwxr-x. 4 cevent cevent 4096 5月 22 13:32 export
-rw-rw-r--. 1 cevent cevent 2794 6月 4 22:32
hadoop_hive_userdefinedfunc_plugin-1.0-SNAPSHOT.jar
-rw-rw-r--. 1 cevent cevent 37 5月 17 13:52 location.txt
-rw-rw-r--. 1 cevent cevent 19014993 5月 17 13:52 log.data
-rw-rw-r--. 1 cevent cevent 136 5月 17 13:52 movie.txt
-rw-rw-r--. 1 cevent cevent 213 5月 17 13:52 score.txt
-rw-rw-r--. 1 cevent cevent 165 5月 17 13:52 student.txt
-rw-rw-r--. 1 cevent cevent 301 5月 17 13:52 数据说明.txt
[cevent@hadoop207 datas]$ sudo yum install -y lrzsz
安装处理jar包工具
已加载插件:fastestmirror, refresh-packagekit, security
设置安装进程
Loading mirror speeds from cached
hostfile
base
| 3.7 kB 00:00
base/primary_db | 4.7 MB 00:01
extras
|
3.4 kB 00:00
updates
| 3.4 kB 00:00
updates/primary_db | 9.9 MB 00:02
解决依赖关系
--> 执行事务检查
---> Package lrzsz.x86_64
0:0.12.20-27.1.el6 will be 安装
--> 完成依赖关系计算
依赖关系解决
======================================================================================================
软件包 架构 版本 仓库 大小
======================================================================================================
正在安装:
lrzsz
x86_64 0.12.20-27.1.el6 base 71 k
事务概要
======================================================================================================
Install 1 Package(s)
总下载量:71 k
Installed size: 159 k
下载软件包:
lrzsz-0.12.20-27.1.el6.x86_64.rpm | 71 kB
00:00
运行 rpm_check_debug
执行事务测试
事务测试成功
执行事务
Warning: RPMDB altered outside of yum.
** Found 3 pre-existing rpmdb problem(s),
'yum check' output follows:
2:postfix-2.6.6-6.el6_7.1.x86_64 has
missing requires of libmysqlclient.so.16()(64bit)
2:postfix-2.6.6-6.el6_7.1.x86_64 has
missing requires of libmysqlclient.so.16(libmysqlclient_16)(64bit)
2:postfix-2.6.6-6.el6_7.1.x86_64 has
missing requires of mysql-libs
正在安装 : lrzsz-0.12.20-27.1.el6.x86_64 1/1
Verifying : lrzsz-0.12.20-27.1.el6.x86_64
1/1
已安装:
lrzsz.x86_64 0:0.12.20-27.1.el6
完毕!
[cevent@hadoop207 datas]$ ll
总用量 18624
-rw-rw-r--. 1 cevent cevent 147 5月 10 13:46 510test.txt
-rw-rw-r--. 1 cevent cevent 266 5月 17 13:52 business.txt
-rw-rw-r--. 1 cevent cevent 129 5月 17 13:52 constellation.txt
-rw-rw-r--. 1 cevent cevent 71 5月 17 13:52 dept.txt
-rw-rw-r--. 1 cevent cevent 78 5月 17 13:52 emp_sex.txt
-rw-rw-r--. 1 cevent cevent 656 5月 17 13:52 emp.txt
drwxrwxr-x. 4 cevent cevent 4096 5月 22 13:32 export
-rw-rw-r--. 1 cevent cevent
2794 6月 4 22:32
hadoop_hive_userdefinedfunc_plugin-1.0-SNAPSHOT.jar
-rw-rw-r--. 1 cevent cevent 37 5月 17 13:52 location.txt
-rw-rw-r--. 1 cevent cevent 19014993 5月 17 13:52 log.data
-rw-rw-r--. 1 cevent cevent 136 5月 17 13:52 movie.txt
-rw-rw-r--. 1 cevent cevent 213 5月 17 13:52 score.txt
-rw-rw-r--. 1 cevent cevent 165 5月 17 13:52 student.txt
-rw-rw-r--. 1 cevent cevent 301 5月 17 13:52 数据说明.txt
8.将jar包添加到hive的classpath(此方法为临时添加,除非放在hive的library)
0: jdbc:hive2://hadoop207.cevent.com:10000>
将jar包add入当前beeline进程
add jar
/opt/module/datas/hadoop_hive_userdefinedfunc_plugin-1.0-SNAPSHOT.jar;
INFO
: Added
[/opt/module/datas/hadoop_hive_userdefinedfunc_plugin-1.0-SNAPSHOT.jar] to
class path
INFO
: Added resources: [/opt/module/datas/hadoop_hive_userdefinedfunc_plugin-1.0-SNAPSHOT.jar]
No rows affected (0.076 seconds)
0:
jdbc:hive2://hadoop207.cevent.com:10000> use
cevent01;
No rows affected (0.046 seconds)
0:
jdbc:hive2://hadoop207.cevent.com:10000> 创建自定义函数UDF,as后必须是jar包中class全名
create function
ceventLength as "com.cevent.hadoop.hive.CeventUserDefinedFunction";
No rows affected (0.058 seconds)
0:
jdbc:hive2://hadoop207.cevent.com:10000> select
*,ceventLength(comment) from c_emp;
根据UDF查询comment的字符位数
+--------------+--------------+-------------+----------------+-----------------+---------------+----------------+--------------+------+--+
| c_emp.empno |
c_emp.ename | c_emp.jobs | c_emp.manager | c_emp.hiredata | c_emp.salary | c_emp.comment | c_emp.depno | _c1
|
+--------------+--------------+-------------+----------------+-----------------+---------------+----------------+--------------+------+--+
| NULL | cevent | NULL | NULL | NULL | NULL | NULL | 619 | 0
|
| 7369 | SMITH | CLERK | 7902 | 1980-12-17 | 800.0 | NULL | 20 | 0 |
| 7499 | ALLEN | SALESMAN | 7698 | 1981-2-20 | 1600.0 | 300.0
| 30 | 5 |
| 7521 | WARD | SALESMAN | 7698 | 1981-2-22 | 1250.0 | 500.0 | 30 | 5 |
| 7566 | JONES | MANAGER | 7839 | 1981-4-2 | 2975.0 | NULL | 20 | 0 |
| 7654 | MARTIN | SALESMAN | 7698 | 1981-9-28 | 1250.0 | 1400.0 | 30 | 6 |
| 7698 | BLAKE | MANAGER | 7839 | 1981-5-1 | 2850.0 | NULL | 30 | 0 |
| 7782 | CLARK | MANAGER | 7839 | 1981-6-9 | 2450.0 | NULL | 10 | 0
|
| 7788 | SCOTT | ANALYST | 7566 | 1987-4-19 | 3000.0 | NULL | 20 | 0 |
| 7839 | KING | PRESIDENT | NULL | 1981-11-17 | 5000.0 | NULL | 10 | 0 |
| 7844 | TURNER | SALESMAN |
7698 | 1981-9-8 | 1500.0 | 0.0 | 30 | 3 |
| 7876 | ADAMS | CLERK | 7788 | 1987-5-23 | 1100.0 | NULL | 20 | 0 |
| 7900 | JAMES | CLERK | 7698 | 1981-12-3 | 950.0 | NULL | 30 | 0 |
| 7902 | FORD | ANALYST | 7566 | 1981-12-3 | 3000.0 | NULL | 20 | 0 |
| 7934 | MILLER | CLERK | 7782 | 1982-1-23 | 1300.0 | NULL | 10 | 0 |
+--------------+--------------+-------------+----------------+-----------------+---------------+----------------+--------------+------+--+
15 rows selected (0.142 seconds)
9.创建UDF解析
//调用idea编译的jar包,functuon 自定义类名 as 'jar包类的全名'
create temporary function
ceventLength as 'com.cevent.hadoop.hive.CeventUserDefinedFunction';
//加入temporary,表示重启就消失,这里取消temporary临时函数
create function ceventLength as
'com.cevent.hadoop.hive.CeventUserDefinedFunction';