背景:对敏感数据的脱敏,这里采用的是 MySQL、Hive 内置的 MD5 函数(严格来说 MD5 是单向哈希,并非可逆加密,因此密文无法还原为明文)。现在需要提供一个 UDF 函数给应用方查询:用明文计算出对应的 MD5 密文,再用该密文去匹配已脱敏表中的记录,从而查出同一行里其他字段的数据。
一、编写UDF函数
1、构建Java Maven项目
IDEA步骤: File -》 New -》Project... -》Next -》填写项目名称
2、pom.xml 文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>hive_udf</artifactId>
  <version>1.0</version>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hive.version>2.1.1</hive.version>
    <java.version>1.8</java.version>
    <hadoop.version>2.7.3</hadoop.version>
    <maven.compiler.source>${java.version}</maven.compiler.source>
    <maven.compiler.target>${java.version}</maven.compiler.target>
    <log4j.version>2.12.1</log4j.version>
  </properties>

  <repositories>
    <repository>
      <id>apache.snapshots</id>
      <name>Apache Development Snapshot Repository</name>
      <url>https://repository.apache.org/content/repositories/snapshots/</url>
      <releases>
        <enabled>false</enabled>
      </releases>
      <snapshots>
        <enabled>true</enabled>
      </snapshots>
    </repository>
  </repositories>

  <dependencies>
    <!-- Hadoop client libraries. Marked "provided": the Hive/Hadoop cluster
         supplies these at runtime, so they must not be shaded into the UDF jar
         (bundling them bloats the jar and risks classpath conflicts). -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <!-- Hive execution engine, needed only to compile against the UDF base
         class; provided by the Hive runtime. -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>${hive.version}</version>
      <scope>provided</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Java Compiler -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.1</version>
        <configuration>
          <source>${java.version}</source>
          <target>${java.version}</target>
        </configuration>
      </plugin>
      <!-- maven-shade-plugin builds the deployable jar on `mvn package`.
           With hadoop-client/hive-exec as "provided", the result contains
           only this project's classes, which is what `ADD JAR` expects. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.1.1</version>
        <executions>
          <!-- Run shade goal on package phase -->
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <artifactSet>
                <excludes>
                  <exclude>com.google.code.findbugs:jsr305</exclude>
                  <exclude>org.slf4j:*</exclude>
                  <exclude>log4j:*</exclude>
                </excludes>
              </artifactSet>
              <filters>
                <filter>
                  <!-- Do not copy the signatures in the META-INF folder.
                       Otherwise, this might cause SecurityExceptions when using the JAR. -->
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <!-- Main-Class is optional for a Hive UDF jar; point it at the
                     UDF class so `java -jar` runs its main() self-test.
                     (The previous value, com.xiaoe.WarehouseStreaming, was a
                     leftover from an unrelated Flink project.) -->
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>com.example.deciphering.decipheringUDF</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
    <pluginManagement>
      <plugins>
        <!-- This improves the out-of-the-box experience in Eclipse by resolving some warnings. -->
        <plugin>
          <groupId>org.eclipse.m2e</groupId>
          <artifactId>lifecycle-mapping</artifactId>
          <version>1.0.0</version>
          <configuration>
            <lifecycleMappingMetadata>
              <pluginExecutions>
                <pluginExecution>
                  <pluginExecutionFilter>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-shade-plugin</artifactId>
                    <versionRange>[3.1.1,)</versionRange>
                    <goals>
                      <goal>shade</goal>
                    </goals>
                  </pluginExecutionFilter>
                  <action>
                    <ignore/>
                  </action>
                </pluginExecution>
                <pluginExecution>
                  <pluginExecutionFilter>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <versionRange>[3.1,)</versionRange>
                    <goals>
                      <goal>testCompile</goal>
                      <goal>compile</goal>
                    </goals>
                  </pluginExecutionFilter>
                  <action>
                    <ignore/>
                  </action>
                </pluginExecution>
              </pluginExecutions>
            </lifecycleMappingMetadata>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
3、UDF函数代码如下:
package com.example.deciphering;

import org.apache.hadoop.hive.ql.exec.UDF;

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

/**
 * Hive UDF that maps a plaintext value to its salted MD5 digest.
 *
 * The masking job hashed sensitive columns as MD5(md5_prefix + value); this
 * UDF reproduces that transformation so callers can look up masked rows by
 * plaintext, e.g. {@code WHERE id = tt('123456')}.
 */
public class decipheringUDF extends UDF {

    /**
     * Fixed salt prepended before hashing. Must be byte-identical to the
     * prefix used when the data was originally masked, or lookups will
     * never match.
     */
    public static String md5_prefix = "com.example_";

    /**
     * Hive entry point.
     *
     * @param input plaintext value to hash; may be NULL
     * @return 32-char lowercase hex MD5 of (md5_prefix + input), or NULL
     *         when input is NULL
     */
    public String evaluate(String input) {
        // NULL in -> NULL out: a NULL column was stored as NULL, not as the
        // hash of the literal string "null".
        if (input == null) {
            return null;
        }
        return toMd5(input);
    }

    /**
     * Computes the salted MD5 digest as a 32-character lowercase hex string.
     *
     * @param info plaintext (the salt is prepended internally)
     * @return zero-padded 32-char hex digest
     * @throws RuntimeException if the JVM has no MD5 provider (never in
     *         practice; MD5 is mandated by the Java security spec)
     */
    public static String toMd5(String info) {
        info = md5_prefix + info;
        byte[] secretByte;
        try {
            // Explicit charset: the platform default may differ between the
            // masking job and this UDF's JVM, which would silently produce
            // mismatched digests for non-ASCII input.
            secretByte = MessageDigest.getInstance("MD5")
                    .digest(info.getBytes(StandardCharsets.UTF_8));
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException("找不到md5算法", e);
        }
        // BigInteger.toString(16) drops leading zero nibbles; left-pad back
        // to 32 hex chars. A while-loop is used deliberately: the original
        // for-loop re-evaluated length() after each insert and could stop
        // early, leaving the result short when the digest began with two or
        // more zero bytes.
        StringBuilder md5Code = new StringBuilder(new BigInteger(1, secretByte).toString(16));
        while (md5Code.length() < 32) {
            md5Code.insert(0, "0");
        }
        return md5Code.toString();
    }

    /** Quick manual smoke test: prints the digest of "test". */
    public static void main(String[] args) {
        System.out.println(toMd5("test"));
    }
}
4、打成jar包,将jar包上传到服务器
二、在Hive中创建函数
1、add jar
(注:如果是已经把jar包放到$HIVE_HOME/lib 下,则可以跳过该步骤)
add jar /home/hadoop/testjar/hive_udf-1.0.jar;
2、创建临时函数,只对当前窗口有效,退出后无效
CREATE TEMPORARY FUNCTION tt AS 'com.example.deciphering.decipheringUDF';
select tt('test'); -- 测试(注意:Hive 的行注释是 --,不是 //)
#删除临时函数
DROP TEMPORARY FUNCTION IF EXISTS tt;
3、创建永久函数,对所有窗口有效
3.1 将包上传到hdfs 上
hdfs dfs -put /home/hadoop/testjar/hive_udf-1.0.jar /usr/hive/UDF/
3.2 创建永久函数
CREATE FUNCTION tt AS 'com.example.deciphering.decipheringUDF' USING JAR 'hdfs:///usr/hive/UDF/hive_udf-1.0.jar';
-- 注意:永久函数必须用 USING JAR 指向 3.1 中上传到 HDFS 的 jar 路径,否则其他会话找不到类
3.3 链接mysql ,在metadata里面查找是否已经存在(注意:函数跟DB库绑定)
select * from FUNCS;
3.4 切换不同的 Hive CLI 会话测试,均可使用;但在 beeline 和 Hue 中可能暂时不可见(HiveServer2 缓存的函数元数据尚未刷新)
###尝试Reload FUNCTION
RELOAD FUNCTIONS;
3.5 以上不可以的话可以试试重启Hive,
三、UDF函数使用教程
1、单个值匹配
db_test.test 为已经加密的表
select * from db_test.test where statdate=20201001 and id = tt('123456'); -- 函数名以实际 CREATE FUNCTION 时注册的名称为准(本文示例为 tt)
2、多个值匹配
db_test.test 为已经加密的表
SELECT * FROM db_test.test
WHERE statdate = '20201001'
  AND id IN (
    SELECT tt(t.plain) FROM (
      SELECT '123456' AS plain UNION
      SELECT '456789' AS plain) t);
-- 说明:UDF 只接收单个字符串参数,不能写成 tt(t.*);对 UNION 出来的每个明文分别求哈希后再 IN 匹配
四、代码项目包:
链接:https://pan.baidu.com/s/1TY2m8LpXhyDwlrwnlnBN-A
提取码:1qro